From cddc18c927035d058e75c19113e019ae03f5bdac Mon Sep 17 00:00:00 2001 From: laskoviymishka Date: Thu, 25 Jun 2026 16:59:57 +0200 Subject: [PATCH 1/2] feat: prototype collation support for strings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A working reference implementation of collation support, to drive the Iceberg collation spec discussion (Snowflake/Löser proposal, aligned with Delta's approach to collation statistics). - collation/: parse the collation specifier grammar (locale, ci/cs, ai/as, trim, casing, "utf8" pseudo-locale) and compare/sort via golang.org/x/text/collate (CLDR/UCA, the Go ICU-equivalent). - StringType carries an optional collation; it round-trips as a collation_spec sibling on NestedField, keeping the on-disk type name "string" so collation-unaware readers still read the column as a plain string. - Delta-aligned collation bounds: store the original min/max VALUES (not ICU sort keys, which aren't stable across versions) tagged with a collation version, and prune only on an exact collation+version match. - Persist them in a prototype data_file.collation_bounds Avro field (v3, experimental field IDs 9000-9006 pending an official reservation), with a full WriteManifest/ReadManifest round-trip. - Version-gated, collator-based data-file pruning in the inclusive metrics evaluator; the strict evaluator and collated columns without valid bounds are conservatively kept (byte-order bounds must not prune a collated column). Prototype scope and deferred items are documented in the collation package doc. --- collation/collation.go | 385 ++++++++++++++++++++++++++++++++ collation/collation_test.go | 172 ++++++++++++++ collation_manifest_test.go | 125 +++++++++++ collation_metrics.go | 108 +++++++++ collation_schema_test.go | 179 +++++++++++++++ go.mod | 2 +- internal/avro_schemas.go | 31 +++ literals.go | 32 ++- manifest.go | 87 +++++--- table/collation_pruning_test.go | 278 +++++++++++++++++++++++ table/evaluators.go | 156 +++++++++++++ types.go | 104 ++++++++- 12 files changed, 1609 insertions(+), 50 deletions(-) create mode 100644 collation/collation.go create mode 100644 collation/collation_test.go create mode 100644 collation_manifest_test.go create mode 100644 collation_metrics.go create mode 100644 collation_schema_test.go create mode 100644 table/collation_pruning_test.go diff --git a/collation/collation.go b/collation/collation.go new file mode 100644 index 000000000..a7c4da451 --- /dev/null +++ b/collation/collation.go @@ -0,0 +1,385 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Package collation is a prototype implementation of collation support for +// iceberg-go. A collation is an annotation on a string field that changes how +// values are compared and ordered (for example case-insensitive, accent- +// insensitive, or locale-aware ordering) without changing how the data is +// stored: 'appLE' is still read back as 'appLE'. +// +// The specifier grammar follows the Apache Iceberg collation proposal +// (https://lists.apache.org/thread/zdvc0gmqjgws8whwxv0ztn7h1jqclzyk): +// +// specifier := locale ("-" modifier)* +// locale := "utf8" | +// modifier := "ci" | "cs" // case (in)sensitive +// | "ai" | "as" // accent (in)sensitive +// | "rtrim" | "ltrim" | "trim" +// | "upper" | "lower" | "casefold" +// +// "utf8" is a pseudo-locale meaning the default byte-order (binary) comparison. +// A nil *Spec is also treated as binary. +// +// Collation comparison is backed by golang.org/x/text/collate, which implements +// the Unicode Collation Algorithm over CLDR locale data — the same family of +// rules ICU provides, and the natural ICU-equivalent available to Go without a +// new dependency. +// +// Prototype scope and known limitations (this is a demo to drive an Iceberg +// spec discussion, not production-complete): +// +// - Collation-aware bounds ("collation metrics") live in the parent iceberg +// package (CollationBoundEntry, ComputeCollatedBounds) and the manifest's +// prototype data_file.collation_bounds field; this package only does the +// specifier parsing and comparison. +// - The collation_bounds Avro field uses provisional, experimental field IDs +// (9000-9006) until an official range is reserved in the spec. +// - The collation_spec schema-JSON shape (name + icu_collator_version) tracks +// the current proposal and is provisional; a Java reference implementation +// may settle on different keys. +// - Collation annotations round-trip only at the NestedField level; a collated +// string inside a list element or map key/value is not yet preserved. +// - Changing a column's collation via UpdateColumn is not supported (PromoteType +// rejects it); adding/altering collation needs a dedicated evolution path. +// - Pruning is collation-aware only at the data-file level and only in the +// inclusive evaluator; Parquet row-group pruning and the strict evaluator +// conservatively keep everything for collated columns. +// - The writer does not yet auto-compute collation bounds; ComputeCollatedBounds +// is the building block a writer would call. +package collation + +import ( + "errors" + "fmt" + "strings" + "sync" + + "golang.org/x/text/cases" + "golang.org/x/text/collate" + "golang.org/x/text/language" +) + +// ErrInvalidCollation is the sentinel wrapped by every Parse failure, so callers +// can match collation-parse errors with errors.Is (mirroring iceberg-go's +// ErrInvalidSchema / ErrBadCast convention). +var ErrInvalidCollation = errors.New("invalid collation") + +// trimMode controls optional whitespace trimming before comparison. +type trimMode uint8 + +const ( + trimNone trimMode = iota + trimRight + trimLeft + trimBoth +) + +// caseMode controls optional case folding before comparison. Casing modifiers +// are only valid on the "utf8" (binary) base — a real locale must use the "ci" +// modifier instead. +type caseMode uint8 + +const ( + caseNone caseMode = iota + caseUpper + caseLower + caseFold +) + +// Spec is a parsed, immutable collation specifier. The zero value is not +// meaningful; build one with Parse. A nil *Spec means binary (UTF-8 byte order). +type Spec struct { + // locale is the raw locale token as written ("en_US", "sv", ...). Empty for + // the binary/"utf8" pseudo-locale. + locale string + binary bool + + caseInsensitive bool + accentInsensitive bool + trim trimMode + casing caseMode + + // version optionally pins the collation implementation version (the + // proposal's icu_collator_version). It is metadata only: it does not change + // comparison here, but two specs with different versions are not equal, + // matching the "moving to a newer version is a new collation" rule. + version string + + // tag is the BCP-47 tag derived from locale (only set when !binary). + tag language.Tag + + // pool hands out reusable collators for locale comparison (collate.Collator + // is not safe for concurrent use). Built once in Parse for non-binary specs + // and shared across copies made by WithVersion (version doesn't affect the + // collator). nil for binary specs. It is a pointer, so copying a Spec by + // value just shares the pool. + pool *sync.Pool +} + +// Parse parses a collation specifier such as "en_US-ci" or "utf8". +func Parse(spec string) (*Spec, error) { + if spec == "" { + return nil, fmt.Errorf("%w: empty specifier", ErrInvalidCollation) + } + + parts := strings.Split(spec, "-") + locale := parts[0] + s := &Spec{} + + if strings.EqualFold(locale, "utf8") { + s.binary = true + } else { + s.locale = locale + tag, err := language.Parse(strings.ReplaceAll(locale, "_", "-")) + if err != nil { + return nil, fmt.Errorf("%w: invalid locale %q: %w", ErrInvalidCollation, locale, err) + } + s.tag = tag + } + + var sawCase, sawAccent, sawCasing bool + for _, m := range parts[1:] { + switch strings.ToLower(m) { + case "cs", "ci": + if sawCase { + return nil, fmt.Errorf("%w: conflicting case modifiers in %q", ErrInvalidCollation, spec) + } + sawCase = true + s.caseInsensitive = strings.EqualFold(m, "ci") + case "as", "ai": + if sawAccent { + return nil, fmt.Errorf("%w: conflicting accent modifiers in %q", ErrInvalidCollation, spec) + } + sawAccent = true + s.accentInsensitive = strings.EqualFold(m, "ai") + case "rtrim": + s.trim = trimRight + case "ltrim": + s.trim = trimLeft + case "trim": + s.trim = trimBoth + case "upper", "lower", "casefold": + if sawCasing { + return nil, fmt.Errorf("%w: conflicting casing modifiers in %q", ErrInvalidCollation, spec) + } + sawCasing = true + switch strings.ToLower(m) { + case "upper": + s.casing = caseUpper + case "lower": + s.casing = caseLower + case "casefold": + s.casing = caseFold + } + default: + return nil, fmt.Errorf("%w: unknown modifier %q in %q", ErrInvalidCollation, m, spec) + } + } + + // Validation rules from the proposal. + if s.binary { + // Case/accent insensitivity is language-specific and needs a real locale. + if s.caseInsensitive { + return nil, fmt.Errorf("%w: 'ci' requires a locale, not utf8 (%q)", ErrInvalidCollation, spec) + } + if s.accentInsensitive { + return nil, fmt.Errorf("%w: 'ai' requires a locale, not utf8 (%q)", ErrInvalidCollation, spec) + } + + return s, nil + } + + if s.casing != caseNone { + // A real locale cannot be combined with a casing modifier; use 'ci'. + return nil, fmt.Errorf("%w: casing modifiers cannot be combined with locale %q; use 'ci' instead", ErrInvalidCollation, locale) + } + + // Build the pooled collator once per spec; locale comparison reuses pooled + // instances across calls instead of allocating a pool (and collator) per + // Comparator() call. + tag, opts := s.tag, s.collatorOptions() + s.pool = &sync.Pool{New: func() any { return collate.New(tag, opts...) }} + + return s, nil +} + +// MustParse is Parse but panics on error. Intended for tests and constants. +func MustParse(spec string) *Spec { + s, err := Parse(spec) + if err != nil { + panic(err) + } + + return s +} + +// WithVersion returns a copy of the spec with its implementation version pinned +// (the proposal's icu_collator_version). +func (s *Spec) WithVersion(version string) *Spec { + if s == nil { + return nil + } + cp := *s + cp.version = version + + return &cp +} + +// IsBinary reports whether comparison is exactly UTF-8 byte order, with no +// transform. This is the condition under which the original column's UTF-8 +// lower/upper bounds remain valid for pruning. A nil *Spec is binary. Note that +// "utf8-lower" or "utf8-rtrim" are NOT binary: their transform changes equality +// relative to the stored (untransformed) byte-order bounds. +func (s *Spec) IsBinary() bool { + return s == nil || (s.binary && s.casing == caseNone && s.trim == trimNone) +} + +// Version returns the pinned implementation version, or "" if unset. +func (s *Spec) Version() string { + if s == nil { + return "" + } + + return s.version +} + +// String returns the canonical specifier. Default modifiers (cs/as) are +// omitted, so "en_US-cs" and "en_US" both render as "en_US". +func (s *Spec) String() string { + if s == nil { + return "utf8" + } + + var b strings.Builder + if s.binary { + b.WriteString("utf8") + } else { + b.WriteString(s.locale) + if s.caseInsensitive { + b.WriteString("-ci") + } + if s.accentInsensitive { + b.WriteString("-ai") + } + } + switch s.trim { + case trimRight: + b.WriteString("-rtrim") + case trimLeft: + b.WriteString("-ltrim") + case trimBoth: + b.WriteString("-trim") + } + switch s.casing { + case caseUpper: + b.WriteString("-upper") + case caseLower: + b.WriteString("-lower") + case caseFold: + b.WriteString("-casefold") + } + + return b.String() +} + +// Equal reports whether two specs compare and order strings identically. Both +// nil/binary specs are equal to each other; pinned versions must match. +func Equal(a, b *Spec) bool { + return a.String() == b.String() && a.Version() == b.Version() +} + +// normalize applies the optional trim and casing transforms that are not +// expressed through collator options. +func (s *Spec) normalize(str string) string { + if s == nil { + return str + } + switch s.trim { + case trimRight: + str = strings.TrimRight(str, " ") + case trimLeft: + str = strings.TrimLeft(str, " ") + case trimBoth: + str = strings.Trim(str, " ") + } + switch s.casing { + case caseUpper: + str = strings.ToUpper(str) + case caseLower: + str = strings.ToLower(str) + case caseFold: + str = cases.Fold().String(str) + } + + return str +} + +func (s *Spec) collatorOptions() []collate.Option { + var opts []collate.Option + if s.caseInsensitive { + opts = append(opts, collate.IgnoreCase) + } + if s.accentInsensitive { + opts = append(opts, collate.IgnoreDiacritics) + } + + return opts +} + +// Comparator returns a string comparator honoring this collation. The returned +// function is safe for concurrent use. For a binary spec it is byte-order +// comparison (after any trim/casing transform). +func (s *Spec) Comparator() func(a, b string) int { + if s == nil { + return strings.Compare + } + if s.binary { + if s.casing == caseNone && s.trim == trimNone { + return strings.Compare + } + // Binary base with a trim/casing transform applied first. + return func(a, b string) int { + return strings.Compare(s.normalize(a), s.normalize(b)) + } + } + + // Locale-aware comparison. collate.Collator is not safe for concurrent use, + // so hand each goroutine its own instance from the spec's shared pool. + return func(a, b string) int { + c := s.pool.Get().(*collate.Collator) + defer s.pool.Put(c) + + return c.CompareString(s.normalize(a), s.normalize(b)) + } +} + +// Key returns the collation sort key for str: a binary-comparable encoding such +// that two strings equal under this collation share a key. This is the building +// block for collation-aware min/max bounds; it is provided for completeness and +// is not yet consumed by pruning in this prototype. +func (s *Spec) Key(str string) []byte { + if s == nil { + return []byte(str) + } + if s.binary { + return []byte(s.normalize(str)) + } + c := collate.New(s.tag, s.collatorOptions()...) + var buf collate.Buffer + + return append([]byte(nil), c.KeyFromString(&buf, s.normalize(str))...) +} diff --git a/collation/collation_test.go b/collation/collation_test.go new file mode 100644 index 000000000..087954d5c --- /dev/null +++ b/collation/collation_test.go @@ -0,0 +1,172 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package collation_test + +import ( + "testing" + + "github.com/apache/iceberg-go/collation" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func sign(n int) int { + switch { + case n < 0: + return -1 + case n > 0: + return 1 + default: + return 0 + } +} + +func TestParseAndCanonical(t *testing.T) { + tests := []struct { + spec string + canonical string + binary bool + }{ + {"utf8", "utf8", true}, + {"UTF8", "utf8", true}, + {"en_US", "en_US", false}, + {"en_US-cs", "en_US", false}, // cs is the default, dropped + {"en_US-cs-as", "en_US", false}, // both defaults, dropped + {"en_US-ci", "en_US-ci", false}, + {"en_US-ci-ai", "en_US-ci-ai", false}, + {"de_DE-ci", "de_DE-ci", false}, + {"sv", "sv", false}, + {"utf8-lower", "utf8-lower", false}, // transform => not byte-order-safe + {"utf8-rtrim", "utf8-rtrim", false}, + } + for _, tt := range tests { + t.Run(tt.spec, func(t *testing.T) { + s, err := collation.Parse(tt.spec) + require.NoError(t, err) + assert.Equal(t, tt.canonical, s.String()) + assert.Equal(t, tt.binary, s.IsBinary()) + }) + } +} + +func TestParseErrors(t *testing.T) { + bad := []string{ + "", // empty + "en_US-ci-cs", // conflicting case + "en_US-ai-as", // conflicting accent + "utf8-upper-lower", + "utf8-ci", // ci needs a real locale + "utf8-ai", // ai needs a real locale + "en_US-lower", // casing cannot combine with a locale + "en_US-bogus", // unknown modifier + "not a locale", + } + for _, spec := range bad { + t.Run(spec, func(t *testing.T) { + _, err := collation.Parse(spec) + require.Error(t, err) + require.ErrorIs(t, err, collation.ErrInvalidCollation) + }) + } +} + +func TestCaseInsensitiveEquality(t *testing.T) { + cmp := collation.MustParse("en_US-ci").Comparator() + assert.Equal(t, 0, cmp("APPLE", "apple"), "ci should treat APPLE == apple") + assert.NotEqual(t, 0, cmp("apple", "banana")) + + // Binary comparison must distinguish case. + bin := collation.MustParse("utf8").Comparator() + assert.NotEqual(t, 0, bin("APPLE", "apple")) +} + +func TestAccentInsensitiveEquality(t *testing.T) { + cmp := collation.MustParse("en_US-ci-ai").Comparator() + assert.Equal(t, 0, cmp("résumé", "resume"), "ai should treat résumé == resume") +} + +func TestLocaleOrdering(t *testing.T) { + // In Swedish, 'ö' sorts after 'z'. + sv := collation.MustParse("sv").Comparator() + assert.Equal(t, 1, sign(sv("ö", "z")), "sv: ö sorts after z") + + // In German (phonebook-style root behavior here), 'ö' sorts near 'o', + // i.e. before 'z'. This is the crux of why UTF-8 byte order is wrong for + // collated columns. + de := collation.MustParse("de_DE").Comparator() + assert.Equal(t, -1, sign(de("ö", "z")), "de: ö sorts before z") +} + +func TestBinaryVsCollationOrderDiffer(t *testing.T) { + // 'a' (0x61) > 'B' (0x42) in UTF-8 byte order, but 'a' < 'B' under a + // case-insensitive collation. This divergence is exactly why pruning with + // UTF-8 bounds is unsafe for collated columns. + bin := collation.MustParse("utf8").Comparator() + assert.Equal(t, 1, sign(bin("a", "B")), "byte order: a > B") + + ci := collation.MustParse("en_US-ci").Comparator() + assert.Equal(t, -1, sign(ci("a", "B")), "case-insensitive: a < B") +} + +func TestTrimming(t *testing.T) { + rtrim := collation.MustParse("utf8-rtrim").Comparator() + assert.Equal(t, 0, rtrim("abc ", "abc")) + + notrim := collation.MustParse("utf8").Comparator() + assert.NotEqual(t, 0, notrim("abc ", "abc")) +} + +func TestCasing(t *testing.T) { + lower := collation.MustParse("utf8-lower").Comparator() + assert.Equal(t, 0, lower("HELLO", "hello")) +} + +func TestKeyEqualityUnderCollation(t *testing.T) { + s := collation.MustParse("en_US-ci") + // Strings equal under the collation produce identical sort keys. + assert.Equal(t, s.Key("APPLE"), s.Key("apple")) + assert.NotEqual(t, s.Key("apple"), s.Key("banana")) +} + +func TestEqualAndVersion(t *testing.T) { + assert.True(t, collation.Equal(nil, collation.MustParse("utf8"))) + assert.True(t, collation.Equal(collation.MustParse("en_US"), collation.MustParse("en_US-cs"))) + assert.False(t, collation.Equal(collation.MustParse("en_US"), collation.MustParse("en_US-ci"))) + + v1 := collation.MustParse("en_US-ci").WithVersion("153.88") + v2 := collation.MustParse("en_US-ci").WithVersion("154.0") + assert.False(t, collation.Equal(v1, v2), "different pinned versions are not equal") + assert.Equal(t, "153.88", v1.Version()) +} + +func TestComparatorConcurrentSafe(t *testing.T) { + cmp := collation.MustParse("en_US-ci").Comparator() + done := make(chan int, 8) + for range 8 { + go func() { + r := 0 + for range 1000 { + r += cmp("Apple", "apple") + } + done <- r + }() + } + for range 8 { + assert.Equal(t, 0, <-done) + } +} diff --git a/collation_manifest_test.go b/collation_manifest_test.go new file mode 100644 index 000000000..cc29f1e9c --- /dev/null +++ b/collation_manifest_test.go @@ -0,0 +1,125 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package iceberg + +import ( + "bytes" + "testing" + + "github.com/apache/iceberg-go/collation" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +// TestCollationBoundsAvroRoundTrip writes a v3 manifest whose data file carries +// collation-aware bounds, reads it back through the real Avro path, and verifies +// the bounds survive — i.e. the data_file.collation_bounds field is a working +// reference for the proposed manifest/spec change. +func TestCollationBoundsAvroRoundTrip(t *testing.T) { + spec := collation.MustParse("en_US-ci").WithVersion("v1") + schema := NewSchema( + 0, + NestedField{ID: 1, Name: "id", Type: PrimitiveTypes.Int64, Required: true}, + NestedField{ID: 2, Name: "name", Type: StringTypeWithCollation(spec), Required: true}, + ) + pspec := NewPartitionSpecID(0) // unpartitioned + + entry, ok := ComputeCollatedBounds(spec, []string{"Banana", "apple", "Cherry"}) + require.True(t, ok) + + snapshotID := int64(123) + entries := []ManifestEntry{ + &manifestEntry{ + EntryStatus: EntryStatusADDED, + Snapshot: &snapshotID, + Data: &dataFile{ + Content: EntryContentData, + Path: "/data/file.parquet", + Format: ParquetFile, + PartitionData: map[string]any{}, + RecordCount: 3, + FileSize: 1000, + BlockSizeInBytes: 64 * 1024, + CollationBoundsData: mapToAvroColMap(map[int]CollationBoundEntry{2: entry}), + }, + }, + } + + var buf bytes.Buffer + _, err := WriteManifest("/manifest.avro", &buf, 3, pspec, schema, snapshotID, entries) + require.NoError(t, err) + + mf := &manifestFile{version: 3, Path: "/manifest.avro", Content: ManifestContentData} + got, err := ReadManifest(mf, bytes.NewReader(buf.Bytes()), false) + require.NoError(t, err) + require.Len(t, got, 1) + + provider, ok := got[0].DataFile().(CollationBoundsProvider) + require.True(t, ok, "decoded data file should expose collation bounds") + + bounds := provider.CollationBounds() + require.Contains(t, bounds, 2) + assert.Equal(t, "en_US-ci", bounds[2].Collation) + assert.Equal(t, "v1", bounds[2].Version) + assert.Equal(t, []byte("apple"), bounds[2].Lower, "lower bound is the original value, not a sort key") + assert.Equal(t, []byte("Cherry"), bounds[2].Upper) + + // And the round-tripped bound is usable for pruning under a matching version. + assert.True(t, bounds[2].ValidFor(spec)) +} + +// TestManifestWithoutCollationBoundsRoundTrip confirms the new optional field is +// backward compatible: a data file with no collation bounds round-trips with an +// empty collation-bounds map and no error. +func TestManifestWithoutCollationBoundsRoundTrip(t *testing.T) { + schema := NewSchema( + 0, + NestedField{ID: 1, Name: "id", Type: PrimitiveTypes.Int64, Required: true}, + ) + pspec := NewPartitionSpecID(0) + + snapshotID := int64(7) + entries := []ManifestEntry{ + &manifestEntry{ + EntryStatus: EntryStatusADDED, + Snapshot: &snapshotID, + Data: &dataFile{ + Content: EntryContentData, + Path: "/data/file.parquet", + Format: ParquetFile, + PartitionData: map[string]any{}, + RecordCount: 1, + FileSize: 10, + BlockSizeInBytes: 64 * 1024, + }, + }, + } + + var buf bytes.Buffer + _, err := WriteManifest("/manifest.avro", &buf, 3, pspec, schema, snapshotID, entries) + require.NoError(t, err) + + mf := &manifestFile{version: 3, Path: "/manifest.avro", Content: ManifestContentData} + got, err := ReadManifest(mf, bytes.NewReader(buf.Bytes()), false) + require.NoError(t, err) + require.Len(t, got, 1) + + provider, ok := got[0].DataFile().(CollationBoundsProvider) + require.True(t, ok) + assert.Empty(t, provider.CollationBounds()) +} diff --git a/collation_metrics.go b/collation_metrics.go new file mode 100644 index 000000000..2143014e0 --- /dev/null +++ b/collation_metrics.go @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package iceberg + +import "github.com/apache/iceberg-go/collation" + +// CollationBoundEntry is the collation-aware lower/upper bound for one column in +// one data file, following the Delta approach to collation statistics. It is +// both the on-disk Avro record (the data_file.collation_bounds map value) and +// the in-memory value consumed by the metrics evaluator. +// +// - Lower and Upper hold the ORIGINAL string values (their normal binary +// encoding), not ICU sort keys. Sort keys are not stable across ICU/CLDR +// versions, so storing them would couple every reader to one exact +// implementation; original values stay readable and let each engine compare +// with its own collator. +// - Collation and Version identify the collation and the implementation version +// under which the min/max were selected. A reader must only trust the bound +// when both match its own collation (see ValidFor), because a different +// version may order values differently and thus pick a different min/max. +// +// Unlike the original UTF-8 lower/upper bounds (the byte-order min/max), these +// are the min/max under the column's COLLATION order, which can select entirely +// different values (e.g. for {"Banana","apple"} the byte-min is "Banana" but the +// case-insensitive min is "apple"). +// +// Prototype note: the spec should generalize the data_file.collation_bounds map +// value to a LIST of these entries (Delta's validForCollations), letting one file +// carry bounds for several collation versions at once for smooth engine upgrades. +// This prototype stores a single entry per column. +type CollationBoundEntry struct { + Collation string `avro:"collation"` + Version string `avro:"version"` + Lower []byte `avro:"lower_bound"` + Upper []byte `avro:"upper_bound"` +} + +// ValidFor reports whether this bound may be used for collation-aware pruning +// under spec. It requires the collation identity and a non-empty version to +// match exactly: an unversioned or version-mismatched bound cannot be trusted to +// be the true min/max under the reader's collation. This is the mechanism that +// keeps pruning correct across ICU/CLDR version changes. +func (e CollationBoundEntry) ValidFor(spec *collation.Spec) bool { + return e.Version != "" && spec.Version() == e.Version && e.Collation == spec.String() +} + +// CollationBoundsProvider is an optional interface a DataFile may implement to +// expose collation-aware bounds (the Delta statsWithCollation analogue) keyed by +// field ID. The metrics evaluator consults it for collated columns; data files +// that don't implement it (or carry no collation bounds) are conservatively kept. +type CollationBoundsProvider interface { + CollationBounds() map[int]CollationBoundEntry +} + +// ComputeCollatedBounds selects the collation-order minimum and maximum of +// values and returns them as an original-value bound entry tagged with spec's +// collation and version. It returns ok=false for a binary spec or empty input. +// This is the write-side counterpart consumed by collation-aware pruning. +func ComputeCollatedBounds(spec *collation.Spec, values []string) (CollationBoundEntry, bool) { + if spec.IsBinary() || len(values) == 0 { + return CollationBoundEntry{}, false + } + + cmp := spec.Comparator() + lo, hi := values[0], values[0] + for _, v := range values[1:] { + if cmp(v, lo) < 0 { + lo = v + } + if cmp(v, hi) > 0 { + hi = v + } + } + + encLo, err := StringLiteral(lo).MarshalBinary() + if err != nil { + return CollationBoundEntry{}, false + } + encHi, err := StringLiteral(hi).MarshalBinary() + if err != nil { + return CollationBoundEntry{}, false + } + + // StringLiteral.MarshalBinary returns bytes that alias the input string's + // backing memory; copy so the stored bounds can't be corrupted by later + // reuse/mutation of the source buffer. + return CollationBoundEntry{ + Collation: spec.String(), + Version: spec.Version(), + Lower: append([]byte(nil), encLo...), + Upper: append([]byte(nil), encHi...), + }, true +} diff --git a/collation_schema_test.go b/collation_schema_test.go new file mode 100644 index 000000000..ed87caf73 --- /dev/null +++ b/collation_schema_test.go @@ -0,0 +1,179 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package iceberg_test + +import ( + "encoding/json" + "testing" + + "github.com/apache/iceberg-go" + "github.com/apache/iceberg-go/collation" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCollatedStringTypeEquals(t *testing.T) { + plain := iceberg.StringType{} + ciA := iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci")) + ciB := iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci")) + ai := iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci-ai")) + binary := iceberg.StringTypeWithCollation(collation.MustParse("utf8")) + + assert.True(t, ciA.Equals(ciB)) + assert.False(t, plain.Equals(ciA)) + assert.False(t, ciA.Equals(ai)) + // An explicit utf8 collation is equivalent to the default string type. + assert.True(t, plain.Equals(binary)) + assert.True(t, binary.Equals(plain)) +} + +func TestCollatedStringTypeString(t *testing.T) { + assert.Equal(t, "string", iceberg.StringType{}.String()) + assert.Equal(t, "string collate en_US-ci", + iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci")).String()) +} + +func TestCollatedStringTypeComparator(t *testing.T) { + cmp := iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci")).Comparator() + assert.Equal(t, 0, cmp("APPLE", "apple")) + + plain := iceberg.StringType{}.Comparator() + assert.NotEqual(t, 0, plain("APPLE", "apple")) +} + +func TestCollationSpecJSONRoundTrip(t *testing.T) { + field := iceberg.NestedField{ + ID: 3, + Name: "product_name", + Required: true, + Type: iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci")), + } + + data, err := json.Marshal(field) + require.NoError(t, err) + // type stays "string"; collation rides alongside as collation_spec. + assert.JSONEq(t, `{ + "id": 3, + "name": "product_name", + "required": true, + "type": "string", + "collation_spec": {"name": "en_US-ci"} + }`, string(data)) + + var got iceberg.NestedField + require.NoError(t, json.Unmarshal(data, &got)) + require.True(t, got.Type.Equals(field.Type)) + + st, ok := got.Type.(iceberg.StringType) + require.True(t, ok) + require.NotNil(t, st.Collation()) + assert.Equal(t, "en_US-ci", st.Collation().String()) +} + +func TestCollationSpecJSONWithVersion(t *testing.T) { + field := iceberg.NestedField{ + ID: 1, + Name: "c", + Type: iceberg.StringTypeWithCollation( + collation.MustParse("de_DE-ci").WithVersion("153.88.33.0"), + ), + } + + data, err := json.Marshal(field) + require.NoError(t, err) + assert.Contains(t, string(data), `"icu_collator_version":"153.88.33.0"`) + + var got iceberg.NestedField + require.NoError(t, json.Unmarshal(data, &got)) + st := got.Type.(iceberg.StringType) + assert.Equal(t, "153.88.33.0", st.Collation().Version()) +} + +func TestPlainStringHasNoCollationSpec(t *testing.T) { + field := iceberg.NestedField{ID: 1, Name: "c", Type: iceberg.PrimitiveTypes.String} + data, err := json.Marshal(field) + require.NoError(t, err) + assert.NotContains(t, string(data), "collation_spec") +} + +func TestCollationSpecOnNonStringErrors(t *testing.T) { + const j = `{"id":1,"name":"c","required":false,"type":"long","collation_spec":{"name":"en_US-ci"}}` + var got iceberg.NestedField + err := json.Unmarshal([]byte(j), &got) + assert.Error(t, err) +} + +func TestComputeCollatedBounds(t *testing.T) { + spec := collation.MustParse("en_US-ci").WithVersion("v1") + + // Byte-order min/max of these is "Banana"/"apple"; collation-order (ci) + // min/max is "apple"/"Cherry". + entry, ok := iceberg.ComputeCollatedBounds(spec, []string{"Banana", "apple", "Cherry"}) + require.True(t, ok) + assert.Equal(t, "v1", entry.Version) + assert.Equal(t, "en_US-ci", entry.Collation) + + lower, err := iceberg.LiteralFromBytes(iceberg.PrimitiveTypes.String, entry.Lower) + require.NoError(t, err) + upper, err := iceberg.LiteralFromBytes(iceberg.PrimitiveTypes.String, entry.Upper) + require.NoError(t, err) + assert.Equal(t, iceberg.StringLiteral("apple"), lower) + assert.Equal(t, iceberg.StringLiteral("Cherry"), upper) + + // Bounds carry the original values, not ICU sort keys. + assert.Equal(t, []byte("apple"), entry.Lower) + + // A binary spec has no collation bounds. + _, ok = iceberg.ComputeCollatedBounds(collation.MustParse("utf8"), []string{"a", "b"}) + assert.False(t, ok) +} + +func TestCollationBoundEntryValidFor(t *testing.T) { + v1 := iceberg.CollationBoundEntry{Collation: "en_US-ci", Version: "v1", Lower: []byte("a"), Upper: []byte("z")} + assert.True(t, v1.ValidFor(collation.MustParse("en_US-ci").WithVersion("v1"))) + assert.False(t, v1.ValidFor(collation.MustParse("en_US-ci").WithVersion("v2")), "version mismatch") + assert.False(t, v1.ValidFor(collation.MustParse("en_US-ci")), "unversioned reader cannot use bound") + assert.False(t, v1.ValidFor(collation.MustParse("de_DE-ci").WithVersion("v1")), "collation mismatch") + + unversioned := iceberg.CollationBoundEntry{Collation: "en_US-ci", Lower: []byte("a"), Upper: []byte("z")} + assert.False(t, unversioned.ValidFor(collation.MustParse("en_US-ci").WithVersion("v1"))) +} + +func TestCollatedStringInSchemaRoundTrip(t *testing.T) { + sc := iceberg.NewSchema( + 0, + iceberg.NestedField{ID: 1, Name: "id", Type: iceberg.PrimitiveTypes.Int64, Required: true}, + iceberg.NestedField{ + ID: 2, Name: "name", Required: true, + Type: iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci")), + }, + ) + + data, err := json.Marshal(sc) + require.NoError(t, err) + + var got iceberg.Schema + require.NoError(t, json.Unmarshal(data, &got)) + + f, ok := got.FindFieldByName("name") + require.True(t, ok) + st, ok := f.Type.(iceberg.StringType) + require.True(t, ok) + require.NotNil(t, st.Collation()) + assert.Equal(t, "en_US-ci", st.Collation().String()) +} diff --git a/go.mod b/go.mod index d94f4404d..0ffef14ca 100644 --- a/go.mod +++ b/go.mod @@ -61,6 +61,7 @@ require ( golang.org/x/oauth2 v0.36.0 golang.org/x/sync v0.21.0 golang.org/x/term v0.44.0 + golang.org/x/text v0.38.0 google.golang.org/api v0.284.0 gopkg.in/yaml.v3 v3.0.1 ) @@ -266,7 +267,6 @@ require ( golang.org/x/mod v0.36.0 // indirect golang.org/x/net v0.55.0 // indirect golang.org/x/sys v0.46.0 // indirect - golang.org/x/text v0.38.0 // indirect golang.org/x/time v0.15.0 // indirect golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7 // indirect diff --git a/internal/avro_schemas.go b/internal/avro_schemas.go index 720f4be41..63d275425 100644 --- a/internal/avro_schemas.go +++ b/internal/avro_schemas.go @@ -138,6 +138,27 @@ func fieldNode(name string, typ avro.SchemaNode, fieldID int, opts ...func(*avro return f } +// collationBoundNode is the Avro record stored as the value of the prototype +// data_file.collation_bounds map: collation-aware original-value bounds plus the +// collation/version they were computed under. +// +// PROTOTYPE field IDs: 9000-9006 are a deliberately high, experimental range so +// this demo does not collide with spec-reserved data_file field IDs. In +// particular 146 (and the IDs immediately above it) is reserved for the V4 +// content_stats struct, and the spec reserves field IDs globally across +// versions, so the prototype must stay out of that range until an official range +// is reserved for collation bounds. +var collationBoundNode = avro.SchemaNode{ + Type: "record", + Name: "collation_bound_entry", + Fields: []avro.SchemaField{ + fieldNode("collation", StringNode, 9003), + fieldNode("version", StringNode, 9004), + fieldNode("lower_bound", BytesNode, 9005), + fieldNode("upper_bound", BytesNode, 9006), + }, +} + func withDoc(doc string) func(*avro.SchemaField) { return func(f *avro.SchemaField) { f.Doc = doc @@ -400,6 +421,16 @@ func init() { withDoc("The offset in the file where the content starts")), fieldNode("content_size_in_bytes", NullableNode(LongNode), 145, withDoc("The length of the referenced content stored in the file")), + // PROTOTYPE: collation-aware bounds (Delta-style original values + + // version) for collated string columns. Field IDs 9000-9006 are a + // high experimental range chosen to avoid the spec's globally-reserved + // data_file IDs (e.g. 146 = V4 content_stats); they are provisional and + // must be replaced with an officially reserved range before this is a + // real spec change. + fieldNode("collation_bounds", + NullableNode(newMapNode("k9001_v9002", IntNode, collationBoundNode, 9001, 9002)), + 9000, withDefault(nil), + withDoc("map of column id to collation-aware original-value bounds (prototype)")), }, }) diff --git a/literals.go b/literals.go index 7a4989630..985fcd3a3 100644 --- a/literals.go +++ b/literals.go @@ -34,6 +34,7 @@ import ( "github.com/apache/arrow-go/v18/arrow" "github.com/apache/arrow-go/v18/arrow/decimal128" "github.com/apache/arrow-go/v18/parquet/variant" + "github.com/apache/iceberg-go/collation" "github.com/google/uuid" ) @@ -603,7 +604,8 @@ func (Float32Literal) Comparator() Comparator[float32] { return cmp.Compare[floa func (f Float32Literal) Type() Type { return PrimitiveTypes.Float32 } func (f Float32Literal) Value() float32 { return float32(f) } func (f Float32Literal) Any() any { return f.Value() } -func (f Float32Literal) String() string { return strconv.FormatFloat(float64(f), 'g', -1, 32) } +func (f Float32Literal) String() string { return strconv.FormatFloat(float64(f), 'g', -1, 32) } + func (f Float32Literal) To(t Type) (Literal, error) { switch t := t.(type) { case Float32Type: @@ -651,7 +653,8 @@ func (Float64Literal) Comparator() Comparator[float64] { return cmp.Compare[floa func (f Float64Literal) Type() Type { return PrimitiveTypes.Float64 } func (f Float64Literal) Value() float64 { return float64(f) } func (f Float64Literal) Any() any { return f.Value() } -func (f Float64Literal) String() string { return strconv.FormatFloat(float64(f), 'g', -1, 64) } +func (f Float64Literal) String() string { return strconv.FormatFloat(float64(f), 'g', -1, 64) } + func (f Float64Literal) To(t Type) (Literal, error) { switch t := t.(type) { case Float32Type: @@ -848,9 +851,10 @@ func (t *TimestampLiteral) UnmarshalBinary(data []byte) error { type TimestampNsLiteral TimestampNano func (TimestampNsLiteral) Comparator() Comparator[TimestampNano] { return cmp.Compare[TimestampNano] } -func (t TimestampNsLiteral) Type() Type { return PrimitiveTypes.TimestampNs } -func (t TimestampNsLiteral) Value() TimestampNano { return TimestampNano(t) } -func (t TimestampNsLiteral) Any() any { return t.Value() } + +func (t TimestampNsLiteral) Type() Type { return PrimitiveTypes.TimestampNs } +func (t TimestampNsLiteral) Value() TimestampNano { return TimestampNano(t) } +func (t TimestampNsLiteral) Any() any { return t.Value() } func (t TimestampNsLiteral) String() string { tm := TimestampNano(t).ToTime() @@ -902,6 +906,24 @@ func (t *TimestampNsLiteral) UnmarshalBinary(data []byte) error { type StringLiteral string +// CollatedStringComparator returns a string Comparator honoring spec. For a nil +// or binary spec it is the default byte-order comparator, identical to +// StringLiteral.Comparator(). +// +// Note that a collation is a property of a column/field, not of an individual +// literal value, so the collation-aware comparator is sourced from the field's +// StringType (see StringType.Comparator) rather than from a StringLiteral. A +// bare StringLiteral has no collation context and therefore always compares +// byte-wise; this mirrors Delta Kernel, where predicates (not literals) carry +// the collation and default to binary. +func CollatedStringComparator(spec *collation.Spec) Comparator[string] { + if spec.IsBinary() { + return cmp.Compare[string] + } + + return spec.Comparator() +} + func (StringLiteral) Comparator() Comparator[string] { return cmp.Compare[string] } func (s StringLiteral) Type() Type { return PrimitiveTypes.String } func (s StringLiteral) Value() string { return string(s) } diff --git a/manifest.go b/manifest.go index 5be249b84..5322491bf 100644 --- a/manifest.go +++ b/manifest.go @@ -1541,7 +1541,7 @@ func (m *ManifestListWriter) AddManifests(files []ManifestFile) error { ErrInvalidArgument, m.version, file.Version()) } - wrapped := *(file.(*manifestFile)) + wrapped := *file.(*manifestFile) if m.version == 3 { // Ref: https://github.com/apache/iceberg/blob/ea2071568dc66148b483a82eefedcd2992b435f7/core/src/main/java/org/apache/iceberg/ManifestListWriter.java#L157-L168 if wrapped.Content == ManifestContentData && wrapped.FirstRowIDValue == nil { @@ -1821,36 +1821,38 @@ func padOrTruncateBytes(bytes []byte, size int) []byte { } type dataFile struct { - Content ManifestEntryContent `avro:"content"` - Path string `avro:"file_path"` - Format FileFormat `avro:"file_format"` - PartitionData map[string]any `avro:"partition"` - RecordCount int64 `avro:"record_count"` - FileSize int64 `avro:"file_size_in_bytes"` - BlockSizeInBytes int64 `avro:"block_size_in_bytes"` - ColSizes *[]colMap[int, int64] `avro:"column_sizes"` - ValCounts *[]colMap[int, int64] `avro:"value_counts"` - NullCounts *[]colMap[int, int64] `avro:"null_value_counts"` - NaNCounts *[]colMap[int, int64] `avro:"nan_value_counts"` - DistinctCounts *[]colMap[int, int64] `avro:"distinct_counts"` - LowerBounds *[]colMap[int, []byte] `avro:"lower_bounds"` - UpperBounds *[]colMap[int, []byte] `avro:"upper_bounds"` - Key *[]byte `avro:"key_metadata"` - Splits *[]int64 `avro:"split_offsets"` - EqualityIDs *[]int `avro:"equality_ids"` - SortOrder *int `avro:"sort_order_id"` - FirstRowIDField *int64 `avro:"first_row_id"` - ReferencedDataFileField *string `avro:"referenced_data_file"` - ContentOffsetField *int64 `avro:"content_offset"` - ContentSizeInBytesField *int64 `avro:"content_size_in_bytes"` - - colSizeMap map[int]int64 - valCntMap map[int]int64 - nullCntMap map[int]int64 - nanCntMap map[int]int64 - distinctCntMap map[int]int64 - lowerBoundMap map[int][]byte - upperBoundMap map[int][]byte + Content ManifestEntryContent `avro:"content"` + Path string `avro:"file_path"` + Format FileFormat `avro:"file_format"` + PartitionData map[string]any `avro:"partition"` + RecordCount int64 `avro:"record_count"` + FileSize int64 `avro:"file_size_in_bytes"` + BlockSizeInBytes int64 `avro:"block_size_in_bytes"` + ColSizes *[]colMap[int, int64] `avro:"column_sizes"` + ValCounts *[]colMap[int, int64] `avro:"value_counts"` + NullCounts *[]colMap[int, int64] `avro:"null_value_counts"` + NaNCounts *[]colMap[int, int64] `avro:"nan_value_counts"` + DistinctCounts *[]colMap[int, int64] `avro:"distinct_counts"` + LowerBounds *[]colMap[int, []byte] `avro:"lower_bounds"` + UpperBounds *[]colMap[int, []byte] `avro:"upper_bounds"` + CollationBoundsData *[]colMap[int, CollationBoundEntry] `avro:"collation_bounds"` + Key *[]byte `avro:"key_metadata"` + Splits *[]int64 `avro:"split_offsets"` + EqualityIDs *[]int `avro:"equality_ids"` + SortOrder *int `avro:"sort_order_id"` + FirstRowIDField *int64 `avro:"first_row_id"` + ReferencedDataFileField *string `avro:"referenced_data_file"` + ContentOffsetField *int64 `avro:"content_offset"` + ContentSizeInBytesField *int64 `avro:"content_size_in_bytes"` + + colSizeMap map[int]int64 + valCntMap map[int]int64 + nullCntMap map[int]int64 + nanCntMap map[int]int64 + distinctCntMap map[int]int64 + lowerBoundMap map[int][]byte + upperBoundMap map[int][]byte + collationBoundMap map[int]CollationBoundEntry // used for partition retrieval fieldNameToID map[string]int @@ -1886,6 +1888,7 @@ func (d *dataFile) initColumnStatsData() { d.distinctCntMap = avroColMapToMap(d.DistinctCounts) d.lowerBoundMap = avroColMapToMap(d.LowerBounds) d.upperBoundMap = avroColMapToMap(d.UpperBounds) + d.collationBoundMap = avroColMapToMap(d.CollationBoundsData) }) } @@ -2034,6 +2037,14 @@ func (d *dataFile) UpperBoundValues() map[int][]byte { return d.upperBoundMap } +// CollationBounds returns the collation-aware bounds (prototype) keyed by field +// ID. Implements CollationBoundsProvider. +func (d *dataFile) CollationBounds() map[int]CollationBoundEntry { + d.initColumnStatsData() + + return d.collationBoundMap +} + func (d *dataFile) KeyMetadata() []byte { if d.Key == nil { return nil @@ -2322,6 +2333,17 @@ func (b *DataFileBuilder) UpperBoundValues(bounds map[int][]byte) *DataFileBuild return b } +// WithCollationBounds sets the collation-aware bounds for the data file +// (prototype). Named With… to distinguish the setter from the CollationBounds() +// accessor. Note: collation_bounds is only present in the v3 data_file Avro +// schema, so bounds set here are silently dropped when written to a v1/v2 +// manifest. +func (b *DataFileBuilder) WithCollationBounds(bounds map[int]CollationBoundEntry) *DataFileBuilder { + b.d.CollationBoundsData = mapToAvroColMap(bounds) + + return b +} + // KeyMetadata sets the key metadata for the data file. func (b *DataFileBuilder) KeyMetadata(key []byte) *DataFileBuilder { b.d.Key = &key @@ -2475,7 +2497,8 @@ type ManifestEntry interface { wrap(status ManifestEntryStatus, snapshotID, seqNum, fileSeqNum *int64, datafile DataFile) ManifestEntry } -var PositionalDeleteSchema = NewSchema(0, +var PositionalDeleteSchema = NewSchema( + 0, NestedField{ID: 2147483546, Type: PrimitiveTypes.String, Name: "file_path", Required: true}, NestedField{ID: 2147483545, Type: PrimitiveTypes.Int64, Name: "pos", Required: true}, ) diff --git a/table/collation_pruning_test.go b/table/collation_pruning_test.go new file mode 100644 index 000000000..e6387856c --- /dev/null +++ b/table/collation_pruning_test.go @@ -0,0 +1,278 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package table + +import ( + "testing" + + "github.com/apache/iceberg-go" + "github.com/apache/iceberg-go/collation" + "github.com/stretchr/testify/require" +) + +// stringBound returns the binary encoding of a string min/max bound, as stored +// in a data file's lower/upper bound maps. +func stringBound(t *testing.T, s string) []byte { + t.Helper() + b, err := iceberg.StringLiteral(s).MarshalBinary() + require.NoError(t, err) + + return b +} + +// schemaWithName builds a single-string-column schema, optionally collated. +func schemaWithName(spec *collation.Spec) *iceberg.Schema { + typ := iceberg.Type(iceberg.StringType{}) + if spec != nil { + typ = iceberg.StringTypeWithCollation(spec) + } + + return iceberg.NewSchema( + 0, + iceberg.NestedField{ID: 1, Name: "name", Type: typ, Required: true}, + ) +} + +// TestCollationPruningSafety shows that data-file pruning must not use the +// stored UTF-8 byte-order bounds for a collated column, because collation order +// disagrees with byte order. For each case, a plain (binary) string column +// prunes the file — which would silently drop matching rows if the column were +// actually collated — while the collated column correctly keeps the file. +func TestCollationPruningSafety(t *testing.T) { + ci := collation.MustParse("en_US-ci") + + cases := []struct { + name string + // file holds a single value (lower == upper) encoded as UTF-8 bounds. + fileValue string + expr iceberg.BooleanExpression + }{ + { + // File contains only "apple". Under en_US-ci, name = 'APPLE' matches + // it, but in byte order 'APPLE' < 'apple' so a binary evaluator drops + // the file. + name: "equality APPLE vs apple", + fileValue: "apple", + expr: iceberg.EqualTo(iceberg.Reference("name"), "APPLE"), + }, + { + // File contains only "a". Under en_US-ci, 'a' < 'B' so name < 'B' + // matches, but in byte order 'a' (0x61) > 'B' (0x42) so a binary + // evaluator drops the file. + name: "ordering a < B", + fileValue: "a", + expr: iceberg.LessThan(iceberg.Reference("name"), "B"), + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + bound := stringBound(t, tc.fileValue) + file := &mockDataFile{ + path: "file.parquet", + format: iceberg.ParquetFile, + count: 1, + valueCounts: map[int]int64{1: 1}, + nullCounts: map[int]int64{1: 0}, + lowerBounds: map[int][]byte{1: bound}, + upperBounds: map[int][]byte{1: bound}, + } + + // Binary column: byte-order bounds are authoritative and the file is + // pruned. (Correct for a genuinely binary column; catastrophic if the + // column were collated and an engine ignored that.) + binEval, err := newInclusiveMetricsEvaluator(schemaWithName(nil), tc.expr, true, true) + require.NoError(t, err) + binMatch, err := binEval(file) + require.NoError(t, err) + require.False(t, binMatch, "binary column should prune the file via byte-order bounds") + + // Collated column: byte-order bounds are not safe, so the file must be + // kept (it actually contains a matching value under the collation). + ciEval, err := newInclusiveMetricsEvaluator(schemaWithName(ci), tc.expr, true, true) + require.NoError(t, err) + ciMatch, err := ciEval(file) + require.NoError(t, err) + require.True(t, ciMatch, "collated column must keep the file; UTF-8 bounds cannot prune it") + }) + } +} + +// TestExplicitUtf8CollationStillPrunes confirms the guard is precise: an +// explicit utf8 collation is byte-order-equivalent, so pruning still applies. +func TestExplicitUtf8CollationStillPrunes(t *testing.T) { + bound := stringBound(t, "apple") + file := &mockDataFile{ + path: "file.parquet", + format: iceberg.ParquetFile, + count: 1, + valueCounts: map[int]int64{1: 1}, + nullCounts: map[int]int64{1: 0}, + lowerBounds: map[int][]byte{1: bound}, + upperBounds: map[int][]byte{1: bound}, + } + + // "zzz" > "apple" in byte order, so an equality predicate prunes. + expr := iceberg.EqualTo(iceberg.Reference("name"), "zzz") + eval, err := newInclusiveMetricsEvaluator(schemaWithName(collation.MustParse("utf8")), expr, true, true) + require.NoError(t, err) + match, err := eval(file) + require.NoError(t, err) + require.False(t, match, "utf8 collation is byte-order-safe and should still prune") +} + +// collatedMockDataFile augments mockDataFile with Delta-style collation bounds. +type collatedMockDataFile struct { + *mockDataFile + collBounds map[int]iceberg.CollationBoundEntry +} + +func (c *collatedMockDataFile) CollationBounds() map[int]iceberg.CollationBoundEntry { + return c.collBounds +} + +// fileWithCollatedValues builds a data file whose single column holds the given +// values, with collation-order min/max bounds tagged with the spec's version +// (the Delta write path) plus the real byte-order min/max bounds a normal writer +// would emit. The two disagree for collated data — which is the whole point — +// and the byte-order bounds are deliberately left intact to prove the evaluator +// ignores them for collated columns. +func fileWithCollatedValues(t *testing.T, spec *collation.Spec, values ...string) *collatedMockDataFile { + t.Helper() + entry, ok := iceberg.ComputeCollatedBounds(spec, values) + require.True(t, ok) + + byteMin, byteMax := values[0], values[0] + for _, v := range values[1:] { + if v < byteMin { + byteMin = v + } + if v > byteMax { + byteMax = v + } + } + + return &collatedMockDataFile{ + mockDataFile: &mockDataFile{ + path: "file.parquet", + format: iceberg.ParquetFile, + count: int64(len(values)), + valueCounts: map[int]int64{1: int64(len(values))}, + nullCounts: map[int]int64{1: 0}, + lowerBounds: map[int][]byte{1: stringBound(t, byteMin)}, + upperBounds: map[int][]byte{1: stringBound(t, byteMax)}, + }, + collBounds: map[int]iceberg.CollationBoundEntry{1: entry}, + } +} + +// TestDeltaAlignedCollationPruning shows collation-aware pruning using original- +// value bounds selected under the collation order and gated by a matching +// collation version. For {"Banana","apple"} under en_US-ci the collation-order +// min/max are "apple"/"Banana" (not the byte-order "Banana"/"apple"). +func TestDeltaAlignedCollationPruning(t *testing.T) { + spec := collation.MustParse("en_US-ci").WithVersion("v1") + sc := schemaWithName(spec) + + cases := []struct { + name string + expr iceberg.BooleanExpression + mightMatch bool + }{ + {"eq above range pruned", iceberg.EqualTo(iceberg.Reference("name"), "CHERRY"), false}, + {"eq in range kept (ci match)", iceberg.EqualTo(iceberg.Reference("name"), "BANANA"), true}, + {"eq min kept", iceberg.EqualTo(iceberg.Reference("name"), "apple"), true}, + {"less than min pruned", iceberg.LessThan(iceberg.Reference("name"), "A"), false}, + {"greater than max pruned", iceberg.GreaterThan(iceberg.Reference("name"), "cherry"), false}, + {"in with one match kept", iceberg.IsIn(iceberg.Reference("name"), "CHERRY", "banana"), true}, + {"in with no match pruned", iceberg.IsIn(iceberg.Reference("name"), "cherry", "date"), false}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + file := fileWithCollatedValues(t, spec, "Banana", "apple") + eval, err := newInclusiveMetricsEvaluator(sc, tc.expr, true, true) + require.NoError(t, err) + match, err := eval(file) + require.NoError(t, err) + require.Equal(t, tc.mightMatch, match) + }) + } +} + +// TestCollationVersionMismatchKeepsFile shows the version gate: bounds computed +// under one collation version must not be trusted by a reader pinned to a +// different version, since ordering (and thus the selected min/max) may differ. +func TestCollationVersionMismatchKeepsFile(t *testing.T) { + writeSpec := collation.MustParse("en_US-ci").WithVersion("v1") + file := fileWithCollatedValues(t, writeSpec, "Banana", "apple") + + // Reader pins v2; bounds are tagged v1, so they are ignored and the file is + // conservatively kept even though "CHERRY" would otherwise be pruned. + readSpec := collation.MustParse("en_US-ci").WithVersion("v2") + expr := iceberg.EqualTo(iceberg.Reference("name"), "CHERRY") + eval, err := newInclusiveMetricsEvaluator(schemaWithName(readSpec), expr, true, true) + require.NoError(t, err) + match, err := eval(file) + require.NoError(t, err) + require.True(t, match, "version-mismatched collation bounds must not prune") +} + +// TestUnversionedCollationKeepsFile shows that without a pinned version, bounds +// cannot be trusted (no stability guarantee), so the file is kept. +func TestUnversionedCollationKeepsFile(t *testing.T) { + spec := collation.MustParse("en_US-ci") // no version + file := fileWithCollatedValues(t, spec, "Banana", "apple") + expr := iceberg.EqualTo(iceberg.Reference("name"), "CHERRY") + eval, err := newInclusiveMetricsEvaluator(schemaWithName(spec), expr, true, true) + require.NoError(t, err) + match, err := eval(file) + require.NoError(t, err) + require.True(t, match, "unversioned collation bounds must not prune") +} + +// TestVersionedBoundsUnversionedReaderKeepsFile covers the other direction of +// the gate: the file carries v1 bounds but the reader's schema pins no version, +// so the bounds are not trusted and the file is kept. +func TestVersionedBoundsUnversionedReaderKeepsFile(t *testing.T) { + file := fileWithCollatedValues(t, collation.MustParse("en_US-ci").WithVersion("v1"), "Banana", "apple") + readSpec := collation.MustParse("en_US-ci") // unversioned reader + expr := iceberg.EqualTo(iceberg.Reference("name"), "CHERRY") + eval, err := newInclusiveMetricsEvaluator(schemaWithName(readSpec), expr, true, true) + require.NoError(t, err) + match, err := eval(file) + require.NoError(t, err) + require.True(t, match, "versioned bounds must not prune for an unversioned reader") +} + +// TestStrictEvaluatorConservativeForCollatedColumn shows the strict evaluator +// (used for delete/scan planning, where it must prove ALL rows match) never +// claims a match for an ordering predicate on a collated column, since byte-order +// bounds can't prove collation-order containment. It returns rowsMightNotMatch +// even where byte-order bounds would (wrongly) prove all rows match. +func TestStrictEvaluatorConservativeForCollatedColumn(t *testing.T) { + spec := collation.MustParse("en_US-ci").WithVersion("v1") + file := fileWithCollatedValues(t, spec, "apple", "apple") + // Every value is "apple"; byte-order bounds would prove name <= "apple". + expr := iceberg.LessThanEqual(iceberg.Reference("name"), "apple") + eval, err := newStrictMetricsEvaluator(schemaWithName(spec), expr, true, true) + require.NoError(t, err) + allMatch, err := eval(file) + require.NoError(t, err) + require.False(t, allMatch, "strict eval must not claim all-match for a collated column") +} diff --git a/table/evaluators.go b/table/evaluators.go index 2cff27fe5..23f7a7ce5 100644 --- a/table/evaluators.go +++ b/table/evaluators.go @@ -25,6 +25,7 @@ import ( "github.com/apache/arrow-go/v18/parquet/metadata" "github.com/apache/iceberg-go" + "github.com/apache/iceberg-go/collation" "github.com/apache/iceberg-go/table/internal" "github.com/google/uuid" ) @@ -737,6 +738,11 @@ type inclusiveMetricsEval struct { st iceberg.StructType expr iceberg.BooleanExpression includeEmptyFiles bool + + // collation-aware bounds (Delta-style: original values + version), keyed by + // field ID. Populated from a DataFile that implements + // iceberg.CollationBoundsProvider; nil for files without collation stats. + collationBounds map[int]iceberg.CollationBoundEntry } func (m *inclusiveMetricsEval) TestRowGroup(rgmeta *metadata.RowGroupMetaData, colIndices []int) (bool, error) { @@ -749,6 +755,11 @@ func (m *inclusiveMetricsEval) TestRowGroup(rgmeta *metadata.RowGroupMetaData, c m.nanCounts = nil m.lowerBounds = make(map[int][]byte) m.upperBounds = make(map[int][]byte) + // Parquet row groups carry no collation-aware bounds (collations are not a + // Parquet concept), so clear any left over from a prior file-level Eval on a + // reused evaluator. Collated columns therefore conservatively keep every row + // group; row-group-level collation pruning is out of scope for this prototype. + m.collationBounds = nil for _, c := range colIndices { colMeta, err := rgmeta.ColumnChunk(c) @@ -798,6 +809,9 @@ func (m *inclusiveMetricsEval) Eval(file iceberg.DataFile) (bool, error) { ev.valueCounts, ev.nullCounts = file.ValueCounts(), file.NullValueCounts() ev.nanCounts = file.NaNValueCounts() ev.lowerBounds, ev.upperBounds = file.LowerBoundValues(), file.UpperBoundValues() + if p, ok := file.(iceberg.CollationBoundsProvider); ok { + ev.collationBounds = p.CollationBounds() + } return iceberg.VisitExpr(m.expr, &ev) } @@ -816,10 +830,145 @@ func (m *inclusiveMetricsEval) VisitUnbound(iceberg.UnboundPredicate) bool { panic("need bound predicate") } +// collationOrderSensitive reports whether an operation compares string values +// (ordering, equality, prefix, or set membership) and therefore must not be +// pruned using a collated column's UTF-8 byte-order bounds. Every such op is +// intercepted for collated columns: those collatedMightMatch understands +// (LT/LTEQ/GT/GTEQ/EQ/IN) are pruned with collation-aware bounds, and the rest +// (NEQ/NotIn/StartsWith/NotStartsWith) are conservatively kept — byte-order +// prefix/inequality pruning is unsafe once comparison is collation-defined. +// Spelled out rather than a range check so it can't silently drift if the +// Operation enum is reordered or extended. +func collationOrderSensitive(op iceberg.Operation) bool { + switch op { + case iceberg.OpLT, iceberg.OpLTEQ, iceberg.OpGT, iceberg.OpGTEQ, + iceberg.OpEQ, iceberg.OpNEQ, iceberg.OpStartsWith, iceberg.OpNotStartsWith, + iceberg.OpIn, iceberg.OpNotIn: + return true + default: + return false + } +} + +// fieldHasNonBinaryCollation reports whether the field is a string column with a +// collation whose order differs from UTF-8 byte order. +func fieldHasNonBinaryCollation(f iceberg.NestedField) bool { + st, ok := f.Type.(iceberg.StringType) + + return ok && !st.Collation().IsBinary() +} + func (m *inclusiveMetricsEval) VisitBound(pred iceberg.BoundPredicate) bool { + // Collation handling for ordering/equality predicates. The stored lower/upper + // bounds are UTF-8 byte-order bounds, but a collated column orders by its + // collation, which can disagree with byte order ('a' > 'B' in UTF-8 but + // 'a' < 'B' case-insensitively), so byte-order bounds must not be used to + // prune. Instead we use the column's collation-aware bounds when the data + // file carries them for a matching collation version; otherwise we keep the + // file. Null/NaN predicates are collation-independent and evaluate normally. + if collationOrderSensitive(pred.Op()) { + field := pred.Ref().Field() + if st, ok := field.Type.(iceberg.StringType); ok && !st.Collation().IsBinary() { + if result, decided := m.evalCollated(pred, field.ID, st.Collation()); decided { + return result + } + + return rowsMightMatch + } + } + return iceberg.VisitBoundPredicate(pred, m) } +// evalCollated attempts collation-aware pruning for a predicate on a collated +// string column. It returns (result, true) when it could make a decision using +// version-matching collation bounds, or (_, false) when no usable bounds exist +// (the caller then conservatively keeps the file). +func (m *inclusiveMetricsEval) evalCollated(pred iceberg.BoundPredicate, fieldID int, spec *collation.Spec) (bool, bool) { + entry, ok := m.collationBounds[fieldID] + if !ok || !entry.ValidFor(spec) { + return false, false + } + + if m.containsNullsOnly(fieldID) { + return rowsCannotMatch, true + } + + lower, err := iceberg.LiteralFromBytes(iceberg.PrimitiveTypes.String, entry.Lower) + if err != nil { + return false, false + } + upper, err := iceberg.LiteralFromBytes(iceberg.PrimitiveTypes.String, entry.Upper) + if err != nil { + return false, false + } + + lowerVal, ok1 := stringLiteralValue(lower) + upperVal, ok2 := stringLiteralValue(upper) + if !ok1 || !ok2 { + return false, false + } + + cmp := spec.Comparator() + + return collatedMightMatch(pred, cmp, lowerVal, upperVal) +} + +func stringLiteralValue(lit iceberg.Literal) (string, bool) { + sl, ok := lit.(iceberg.StringLiteral) + + return string(sl), ok +} + +// collatedMightMatch applies the inclusive-metrics logic for ordering/equality +// predicates using the collation comparator cmp against original-value bounds +// [lower, upper]. It returns (result, true) for handled ops; for ops that this +// prototype does not prune (NEQ, NotIn, StartsWith, NotStartsWith) it returns +// (rowsMightMatch, true). +func collatedMightMatch(pred iceberg.BoundPredicate, cmp func(a, b string) int, lower, upper string) (bool, bool) { + switch pred.Op() { + case iceberg.OpLT, iceberg.OpLTEQ, iceberg.OpGT, iceberg.OpGTEQ, iceberg.OpEQ: + lp, ok := pred.(iceberg.BoundLiteralPredicate) + if !ok { + return rowsMightMatch, true + } + val, ok := stringLiteralValue(lp.Literal()) + if !ok { + return rowsMightMatch, true + } + switch pred.Op() { + case iceberg.OpLT: + return cmp(lower, val) < 0, true + case iceberg.OpLTEQ: + return cmp(lower, val) <= 0, true + case iceberg.OpGT: + return cmp(upper, val) > 0, true + case iceberg.OpGTEQ: + return cmp(upper, val) >= 0, true + case iceberg.OpEQ: + return cmp(lower, val) <= 0 && cmp(upper, val) >= 0, true + default: + return rowsMightMatch, true + } + case iceberg.OpIn: + sp, ok := pred.(iceberg.BoundSetPredicate) + if !ok { + return rowsMightMatch, true + } + for _, lit := range sp.Literals().Members() { + val, ok := stringLiteralValue(lit) + if ok && cmp(lower, val) <= 0 && cmp(upper, val) >= 0 { + return rowsMightMatch, true + } + } + + return rowsCannotMatch, true + } + + // NEQ, NotIn, StartsWith, NotStartsWith: not pruned in this prototype. + return rowsMightMatch, true +} + func (m *inclusiveMetricsEval) VisitIsNull(t iceberg.BoundTerm) bool { fieldID := t.Ref().Field().ID if cnt, exists := m.nullCounts[fieldID]; exists && cnt == 0 { @@ -1282,6 +1431,13 @@ func (m *strictMetricsEval) VisitUnbound(iceberg.UnboundPredicate) bool { } func (m *strictMetricsEval) VisitBound(pred iceberg.BoundPredicate) bool { + // A collated column orders by its collation, not byte order, so the stored + // byte-order bounds cannot prove that all rows satisfy an ordering/equality + // predicate. Conservatively report that some rows might not match. + if collationOrderSensitive(pred.Op()) && fieldHasNonBinaryCollation(pred.Ref().Field()) { + return rowsMightNotMatch + } + return iceberg.VisitBoundPredicate(pred, m) } diff --git a/types.go b/types.go index 3be85017f..14a2c1523 100644 --- a/types.go +++ b/types.go @@ -27,6 +27,7 @@ import ( "time" "github.com/apache/arrow-go/v18/arrow/decimal" + "github.com/apache/iceberg-go/collation" "github.com/apache/iceberg-go/internal" "github.com/geoarrow/geoarrow-go" ) @@ -288,20 +289,50 @@ func (n *NestedField) Equals(other NestedField) bool { n.Type.Equals(other.Type) } +// collationSpecJSON is the on-disk form of a collation annotation, following +// the Iceberg collation proposal's collation_spec object. The proposal's +// collation_metrics_id (a pseudo-field holding collation-key bounds) is omitted: +// this prototype uses Delta-style original-value bounds (see CollationBoundEntry) +// rather than the sort-key pseudo-field. +// +// PROTOTYPE: this JSON shape (name + icu_collator_version) is provisional and +// tracks the current proposal; a Java reference implementation may settle on +// different key names, in which case stored schemas would need migration. The +// annotation is only (de)serialized at the NestedField level — a collated string +// nested inside a list/map element is not yet preserved on round-trip. +type collationSpecJSON struct { + Name string `json:"name"` + ICUCollatorVersion string `json:"icu_collator_version,omitempty"` +} + func (n NestedField) MarshalJSON() ([]byte, error) { type Alias NestedField - return json.Marshal(struct { - Type *typeIFace `json:"type"` + aux := struct { + Type *typeIFace `json:"type"` + Collation *collationSpecJSON `json:"collation_spec,omitempty"` *Alias - }{Type: &typeIFace{n.Type}, Alias: (*Alias)(&n)}) + }{Type: &typeIFace{n.Type}, Alias: (*Alias)(&n)} + + // A collated string keeps its Iceberg type name "string" and carries the + // collation as a sibling collation_spec field, so engines that don't + // understand collations still read it as a plain string. + if st, ok := n.Type.(StringType); ok && !st.Collation().IsBinary() { + aux.Collation = &collationSpecJSON{ + Name: st.Collation().String(), + ICUCollatorVersion: st.Collation().Version(), + } + } + + return json.Marshal(aux) } func (n *NestedField) UnmarshalJSON(b []byte) error { type Alias NestedField aux := struct { - ID *int `json:"id"` - Type typeIFace `json:"type"` + ID *int `json:"id"` + Type typeIFace `json:"type"` + Collation *collationSpecJSON `json:"collation_spec,omitempty"` *Alias }{ Alias: (*Alias)(n), @@ -326,6 +357,20 @@ func (n *NestedField) UnmarshalJSON(b []byte) error { return fmt.Errorf("%w: field %q is missing required 'type' key in JSON", ErrInvalidSchema, n.Name) } + if aux.Collation != nil { + if _, ok := n.Type.(StringType); !ok { + return fmt.Errorf("%w: field %q has collation_spec but is not a string type", ErrInvalidSchema, n.Name) + } + spec, err := collation.Parse(aux.Collation.Name) + if err != nil { + return fmt.Errorf("%w: field %q: %w", ErrInvalidSchema, n.Name, err) + } + if v := aux.Collation.ICUCollatorVersion; v != "" { + spec = spec.WithVersion(v) + } + n.Type = StringTypeWithCollation(spec) + } + return nil } @@ -749,17 +794,52 @@ func (TimestampTzType) primitive() {} func (TimestampTzType) Type() string { return "timestamptz" } func (TimestampTzType) String() string { return "timestamptz" } -type StringType struct{} +// StringType is the Iceberg string type. It optionally carries a collation +// annotation that changes how values are compared and ordered without changing +// how they are stored. The zero value (StringType{}) is the default UTF-8 +// byte-order string. +// +// Note: StringType now holds a pointer field, so two collated string types must +// be compared with Equals, not ==. (== compares the spec pointer, not the +// collation it denotes, and is unsafe as a map key for collated strings.) +type StringType struct { + collation *collation.Spec +} -func (StringType) Equals(other Type) bool { - _, ok := other.(StringType) +// StringTypeWithCollation returns a string type annotated with the given +// collation. A nil spec yields the default (binary) string type. +func StringTypeWithCollation(spec *collation.Spec) StringType { + return StringType{collation: spec} +} - return ok +// Collation returns the collation annotation, or nil for the default UTF-8 +// byte-order comparison. +func (s StringType) Collation() *collation.Spec { return s.collation } + +// Comparator returns the string comparator implied by this type's collation: +// byte order for a plain string, or the collation-aware comparator otherwise. +func (s StringType) Comparator() Comparator[string] { + return CollatedStringComparator(s.collation) +} + +func (s StringType) Equals(other Type) bool { + o, ok := other.(StringType) + if !ok { + return false + } + + return collation.Equal(s.collation, o.collation) } -func (StringType) primitive() {} -func (StringType) Type() string { return "string" } -func (StringType) String() string { return "string" } +func (StringType) primitive() {} +func (StringType) Type() string { return "string" } +func (s StringType) String() string { + if !s.collation.IsBinary() { + return "string collate " + s.collation.String() + } + + return "string" +} type UUIDType struct{} From ed255bd775f61146b431c0ab102dbdaf5e1992fc Mon Sep 17 00:00:00 2001 From: laskoviymishka Date: Fri, 26 Jun 2026 10:44:37 +0200 Subject: [PATCH 2/2] style: fix gofmt alignment in literals.go The gofmt formatter (run by golangci-lint in CI) requires the single-line String() methods on Float32Literal/Float64Literal to be column-aligned with their sibling methods. Align them. --- literals.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/literals.go b/literals.go index 985fcd3a3..c7205a7ac 100644 --- a/literals.go +++ b/literals.go @@ -604,7 +604,7 @@ func (Float32Literal) Comparator() Comparator[float32] { return cmp.Compare[floa func (f Float32Literal) Type() Type { return PrimitiveTypes.Float32 } func (f Float32Literal) Value() float32 { return float32(f) } func (f Float32Literal) Any() any { return f.Value() } -func (f Float32Literal) String() string { return strconv.FormatFloat(float64(f), 'g', -1, 32) } +func (f Float32Literal) String() string { return strconv.FormatFloat(float64(f), 'g', -1, 32) } func (f Float32Literal) To(t Type) (Literal, error) { switch t := t.(type) { @@ -653,7 +653,7 @@ func (Float64Literal) Comparator() Comparator[float64] { return cmp.Compare[floa func (f Float64Literal) Type() Type { return PrimitiveTypes.Float64 } func (f Float64Literal) Value() float64 { return float64(f) } func (f Float64Literal) Any() any { return f.Value() } -func (f Float64Literal) String() string { return strconv.FormatFloat(float64(f), 'g', -1, 64) } +func (f Float64Literal) String() string { return strconv.FormatFloat(float64(f), 'g', -1, 64) } func (f Float64Literal) To(t Type) (Literal, error) { switch t := t.(type) {