From cddc18c927035d058e75c19113e019ae03f5bdac Mon Sep 17 00:00:00 2001
From: laskoviymishka <laskoviymishka@gmail.com>
Date: Thu, 25 Jun 2026 16:59:57 +0200
Subject: [PATCH 1/2] feat: prototype collation support for strings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A working reference implementation of collation support, to drive the Iceberg
collation spec discussion (Snowflake/Löser proposal, aligned with Delta's
approach to collation statistics).

- collation/: parse the collation specifier grammar (locale, ci/cs, ai/as,
  trim, casing, "utf8" pseudo-locale) and compare/sort via
  golang.org/x/text/collate (CLDR/UCA, the Go ICU-equivalent).
- StringType carries an optional collation; it round-trips as a collation_spec
  sibling on NestedField, keeping the on-disk type name "string" so
  collation-unaware readers still read the column as a plain string.
- Delta-aligned collation bounds: store the original min/max VALUES (not ICU
  sort keys, which aren't stable across versions) tagged with a collation
  version, and prune only on an exact collation+version match.
- Persist them in a prototype data_file.collation_bounds Avro field (v3,
  experimental field IDs 9000-9006 pending an official reservation), with a
  full WriteManifest/ReadManifest round-trip.
- Version-gated, collator-based data-file pruning in the inclusive metrics
  evaluator; the strict evaluator and collated columns without valid bounds are
  conservatively kept (byte-order bounds must not prune a collated column).

Prototype scope and deferred items are documented in the collation package doc.
---
 collation/collation.go          | 385 ++++++++++++++++++++++++++++++++
 collation/collation_test.go     | 172 ++++++++++++++
 collation_manifest_test.go      | 125 +++++++++++
 collation_metrics.go            | 108 +++++++++
 collation_schema_test.go        | 179 +++++++++++++++
 go.mod                          |   2 +-
 internal/avro_schemas.go        |  31 +++
 literals.go                     |  32 ++-
 manifest.go                     |  87 +++++---
 table/collation_pruning_test.go | 278 +++++++++++++++++++++++
 table/evaluators.go             | 156 +++++++++++++
 types.go                        | 104 ++++++++-
 12 files changed, 1609 insertions(+), 50 deletions(-)
 create mode 100644 collation/collation.go
 create mode 100644 collation/collation_test.go
 create mode 100644 collation_manifest_test.go
 create mode 100644 collation_metrics.go
 create mode 100644 collation_schema_test.go
 create mode 100644 table/collation_pruning_test.go

diff --git a/collation/collation.go b/collation/collation.go
new file mode 100644
index 000000000..a7c4da451
--- /dev/null
+++ b/collation/collation.go
@@ -0,0 +1,385 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Package collation is a prototype implementation of collation support for
+// iceberg-go. A collation is an annotation on a string field that changes how
+// values are compared and ordered (for example case-insensitive, accent-
+// insensitive, or locale-aware ordering) without changing how the data is
+// stored: 'appLE' is still read back as 'appLE'.
+//
+// The specifier grammar follows the Apache Iceberg collation proposal
+// (https://lists.apache.org/thread/zdvc0gmqjgws8whwxv0ztn7h1jqclzyk):
+//
+//	specifier := locale ("-" modifier)*
+//	locale    := "utf8" | <CLDR locale, e.g. "en", "en_US", "de_DE", "sv">
+//	modifier  := "ci" | "cs"        // case (in)sensitive
+//	           | "ai" | "as"        // accent (in)sensitive
+//	           | "rtrim" | "ltrim" | "trim"
+//	           | "upper" | "lower" | "casefold"
+//
+// "utf8" is a pseudo-locale meaning the default byte-order (binary) comparison.
+// A nil *Spec is also treated as binary.
+//
+// Collation comparison is backed by golang.org/x/text/collate, which implements
+// the Unicode Collation Algorithm over CLDR locale data — the same family of
+// rules ICU provides, and the natural ICU-equivalent available to Go without a
+// new dependency.
+//
+// Prototype scope and known limitations (this is a demo to drive an Iceberg
+// spec discussion, not production-complete):
+//
+//   - Collation-aware bounds ("collation metrics") live in the parent iceberg
+//     package (CollationBoundEntry, ComputeCollatedBounds) and the manifest's
+//     prototype data_file.collation_bounds field; this package only does the
+//     specifier parsing and comparison.
+//   - The collation_bounds Avro field uses provisional, experimental field IDs
+//     (9000-9006) until an official range is reserved in the spec.
+//   - The collation_spec schema-JSON shape (name + icu_collator_version) tracks
+//     the current proposal and is provisional; a Java reference implementation
+//     may settle on different keys.
+//   - Collation annotations round-trip only at the NestedField level; a collated
+//     string inside a list element or map key/value is not yet preserved.
+//   - Changing a column's collation via UpdateColumn is not supported (PromoteType
+//     rejects it); adding/altering collation needs a dedicated evolution path.
+//   - Pruning is collation-aware only at the data-file level and only in the
+//     inclusive evaluator; Parquet row-group pruning and the strict evaluator
+//     conservatively keep everything for collated columns.
+//   - The writer does not yet auto-compute collation bounds; ComputeCollatedBounds
+//     is the building block a writer would call.
+package collation
+
+import (
+	"errors"
+	"fmt"
+	"strings"
+	"sync"
+
+	"golang.org/x/text/cases"
+	"golang.org/x/text/collate"
+	"golang.org/x/text/language"
+)
+
+// ErrInvalidCollation is the sentinel wrapped by every Parse failure, so callers
+// can match collation-parse errors with errors.Is (mirroring iceberg-go's
+// ErrInvalidSchema / ErrBadCast convention).
+var ErrInvalidCollation = errors.New("invalid collation")
+
+// trimMode controls optional whitespace trimming before comparison.
+type trimMode uint8
+
+const (
+	trimNone trimMode = iota
+	trimRight
+	trimLeft
+	trimBoth
+)
+
+// caseMode controls optional case folding before comparison. Casing modifiers
+// are only valid on the "utf8" (binary) base — a real locale must use the "ci"
+// modifier instead.
+type caseMode uint8
+
+const (
+	caseNone caseMode = iota
+	caseUpper
+	caseLower
+	caseFold
+)
+
+// Spec is a parsed, immutable collation specifier. The zero value is not
+// meaningful; build one with Parse. A nil *Spec means binary (UTF-8 byte order).
+type Spec struct {
+	// locale is the raw locale token as written ("en_US", "sv", ...). Empty for
+	// the binary/"utf8" pseudo-locale.
+	locale string
+	binary bool
+
+	caseInsensitive   bool
+	accentInsensitive bool
+	trim              trimMode
+	casing            caseMode
+
+	// version optionally pins the collation implementation version (the
+	// proposal's icu_collator_version). It is metadata only: it does not change
+	// comparison here, but two specs with different versions are not equal,
+	// matching the "moving to a newer version is a new collation" rule.
+	version string
+
+	// tag is the BCP-47 tag derived from locale (only set when !binary).
+	tag language.Tag
+
+	// pool hands out reusable collators for locale comparison (collate.Collator
+	// is not safe for concurrent use). Built once in Parse for non-binary specs
+	// and shared across copies made by WithVersion (version doesn't affect the
+	// collator). nil for binary specs. It is a pointer, so copying a Spec by
+	// value just shares the pool.
+	pool *sync.Pool
+}
+
+// Parse parses a collation specifier such as "en_US-ci" or "utf8".
+func Parse(spec string) (*Spec, error) {
+	if spec == "" {
+		return nil, fmt.Errorf("%w: empty specifier", ErrInvalidCollation)
+	}
+
+	parts := strings.Split(spec, "-")
+	locale := parts[0]
+	s := &Spec{}
+
+	if strings.EqualFold(locale, "utf8") {
+		s.binary = true
+	} else {
+		s.locale = locale
+		tag, err := language.Parse(strings.ReplaceAll(locale, "_", "-"))
+		if err != nil {
+			return nil, fmt.Errorf("%w: invalid locale %q: %w", ErrInvalidCollation, locale, err)
+		}
+		s.tag = tag
+	}
+
+	var sawCase, sawAccent, sawCasing bool
+	for _, m := range parts[1:] {
+		switch strings.ToLower(m) {
+		case "cs", "ci":
+			if sawCase {
+				return nil, fmt.Errorf("%w: conflicting case modifiers in %q", ErrInvalidCollation, spec)
+			}
+			sawCase = true
+			s.caseInsensitive = strings.EqualFold(m, "ci")
+		case "as", "ai":
+			if sawAccent {
+				return nil, fmt.Errorf("%w: conflicting accent modifiers in %q", ErrInvalidCollation, spec)
+			}
+			sawAccent = true
+			s.accentInsensitive = strings.EqualFold(m, "ai")
+		case "rtrim":
+			s.trim = trimRight
+		case "ltrim":
+			s.trim = trimLeft
+		case "trim":
+			s.trim = trimBoth
+		case "upper", "lower", "casefold":
+			if sawCasing {
+				return nil, fmt.Errorf("%w: conflicting casing modifiers in %q", ErrInvalidCollation, spec)
+			}
+			sawCasing = true
+			switch strings.ToLower(m) {
+			case "upper":
+				s.casing = caseUpper
+			case "lower":
+				s.casing = caseLower
+			case "casefold":
+				s.casing = caseFold
+			}
+		default:
+			return nil, fmt.Errorf("%w: unknown modifier %q in %q", ErrInvalidCollation, m, spec)
+		}
+	}
+
+	// Validation rules from the proposal.
+	if s.binary {
+		// Case/accent insensitivity is language-specific and needs a real locale.
+		if s.caseInsensitive {
+			return nil, fmt.Errorf("%w: 'ci' requires a locale, not utf8 (%q)", ErrInvalidCollation, spec)
+		}
+		if s.accentInsensitive {
+			return nil, fmt.Errorf("%w: 'ai' requires a locale, not utf8 (%q)", ErrInvalidCollation, spec)
+		}
+
+		return s, nil
+	}
+
+	if s.casing != caseNone {
+		// A real locale cannot be combined with a casing modifier; use 'ci'.
+		return nil, fmt.Errorf("%w: casing modifiers cannot be combined with locale %q; use 'ci' instead", ErrInvalidCollation, locale)
+	}
+
+	// Build the pooled collator once per spec; locale comparison reuses pooled
+	// instances across calls instead of allocating a pool (and collator) per
+	// Comparator() call.
+	tag, opts := s.tag, s.collatorOptions()
+	s.pool = &sync.Pool{New: func() any { return collate.New(tag, opts...) }}
+
+	return s, nil
+}
+
+// MustParse is Parse but panics on error. Intended for tests and constants.
+func MustParse(spec string) *Spec {
+	s, err := Parse(spec)
+	if err != nil {
+		panic(err)
+	}
+
+	return s
+}
+
+// WithVersion returns a copy of the spec with its implementation version pinned
+// (the proposal's icu_collator_version).
+func (s *Spec) WithVersion(version string) *Spec {
+	if s == nil {
+		return nil
+	}
+	cp := *s
+	cp.version = version
+
+	return &cp
+}
+
+// IsBinary reports whether comparison is exactly UTF-8 byte order, with no
+// transform. This is the condition under which the original column's UTF-8
+// lower/upper bounds remain valid for pruning. A nil *Spec is binary. Note that
+// "utf8-lower" or "utf8-rtrim" are NOT binary: their transform changes equality
+// relative to the stored (untransformed) byte-order bounds.
+func (s *Spec) IsBinary() bool {
+	return s == nil || (s.binary && s.casing == caseNone && s.trim == trimNone)
+}
+
+// Version returns the pinned implementation version, or "" if unset.
+func (s *Spec) Version() string {
+	if s == nil {
+		return ""
+	}
+
+	return s.version
+}
+
+// String returns the canonical specifier. Default modifiers (cs/as) are
+// omitted, so "en_US-cs" and "en_US" both render as "en_US".
+func (s *Spec) String() string {
+	if s == nil {
+		return "utf8"
+	}
+
+	var b strings.Builder
+	if s.binary {
+		b.WriteString("utf8")
+	} else {
+		b.WriteString(s.locale)
+		if s.caseInsensitive {
+			b.WriteString("-ci")
+		}
+		if s.accentInsensitive {
+			b.WriteString("-ai")
+		}
+	}
+	switch s.trim {
+	case trimRight:
+		b.WriteString("-rtrim")
+	case trimLeft:
+		b.WriteString("-ltrim")
+	case trimBoth:
+		b.WriteString("-trim")
+	}
+	switch s.casing {
+	case caseUpper:
+		b.WriteString("-upper")
+	case caseLower:
+		b.WriteString("-lower")
+	case caseFold:
+		b.WriteString("-casefold")
+	}
+
+	return b.String()
+}
+
+// Equal reports whether two specs compare and order strings identically. Both
+// nil/binary specs are equal to each other; pinned versions must match.
+func Equal(a, b *Spec) bool {
+	return a.String() == b.String() && a.Version() == b.Version()
+}
+
+// normalize applies the optional trim and casing transforms that are not
+// expressed through collator options.
+func (s *Spec) normalize(str string) string {
+	if s == nil {
+		return str
+	}
+	switch s.trim {
+	case trimRight:
+		str = strings.TrimRight(str, " ")
+	case trimLeft:
+		str = strings.TrimLeft(str, " ")
+	case trimBoth:
+		str = strings.Trim(str, " ")
+	}
+	switch s.casing {
+	case caseUpper:
+		str = strings.ToUpper(str)
+	case caseLower:
+		str = strings.ToLower(str)
+	case caseFold:
+		str = cases.Fold().String(str)
+	}
+
+	return str
+}
+
+func (s *Spec) collatorOptions() []collate.Option {
+	var opts []collate.Option
+	if s.caseInsensitive {
+		opts = append(opts, collate.IgnoreCase)
+	}
+	if s.accentInsensitive {
+		opts = append(opts, collate.IgnoreDiacritics)
+	}
+
+	return opts
+}
+
+// Comparator returns a string comparator honoring this collation. The returned
+// function is safe for concurrent use. For a binary spec it is byte-order
+// comparison (after any trim/casing transform).
+func (s *Spec) Comparator() func(a, b string) int {
+	if s == nil {
+		return strings.Compare
+	}
+	if s.binary {
+		if s.casing == caseNone && s.trim == trimNone {
+			return strings.Compare
+		}
+		// Binary base with a trim/casing transform applied first.
+		return func(a, b string) int {
+			return strings.Compare(s.normalize(a), s.normalize(b))
+		}
+	}
+
+	// Locale-aware comparison. collate.Collator is not safe for concurrent use,
+	// so hand each goroutine its own instance from the spec's shared pool.
+	return func(a, b string) int {
+		c := s.pool.Get().(*collate.Collator)
+		defer s.pool.Put(c)
+
+		return c.CompareString(s.normalize(a), s.normalize(b))
+	}
+}
+
+// Key returns the collation sort key for str: a binary-comparable encoding such
+// that two strings equal under this collation share a key. This is the building
+// block for collation-aware min/max bounds; it is provided for completeness and
+// is not yet consumed by pruning in this prototype.
+func (s *Spec) Key(str string) []byte {
+	if s == nil {
+		return []byte(str)
+	}
+	if s.binary {
+		return []byte(s.normalize(str))
+	}
+	c := collate.New(s.tag, s.collatorOptions()...)
+	var buf collate.Buffer
+
+	return append([]byte(nil), c.KeyFromString(&buf, s.normalize(str))...)
+}
diff --git a/collation/collation_test.go b/collation/collation_test.go
new file mode 100644
index 000000000..087954d5c
--- /dev/null
+++ b/collation/collation_test.go
@@ -0,0 +1,172 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package collation_test
+
+import (
+	"testing"
+
+	"github.com/apache/iceberg-go/collation"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func sign(n int) int {
+	switch {
+	case n < 0:
+		return -1
+	case n > 0:
+		return 1
+	default:
+		return 0
+	}
+}
+
+func TestParseAndCanonical(t *testing.T) {
+	tests := []struct {
+		spec      string
+		canonical string
+		binary    bool
+	}{
+		{"utf8", "utf8", true},
+		{"UTF8", "utf8", true},
+		{"en_US", "en_US", false},
+		{"en_US-cs", "en_US", false},    // cs is the default, dropped
+		{"en_US-cs-as", "en_US", false}, // both defaults, dropped
+		{"en_US-ci", "en_US-ci", false},
+		{"en_US-ci-ai", "en_US-ci-ai", false},
+		{"de_DE-ci", "de_DE-ci", false},
+		{"sv", "sv", false},
+		{"utf8-lower", "utf8-lower", false}, // transform => not byte-order-safe
+		{"utf8-rtrim", "utf8-rtrim", false},
+	}
+	for _, tt := range tests {
+		t.Run(tt.spec, func(t *testing.T) {
+			s, err := collation.Parse(tt.spec)
+			require.NoError(t, err)
+			assert.Equal(t, tt.canonical, s.String())
+			assert.Equal(t, tt.binary, s.IsBinary())
+		})
+	}
+}
+
+func TestParseErrors(t *testing.T) {
+	bad := []string{
+		"",            // empty
+		"en_US-ci-cs", // conflicting case
+		"en_US-ai-as", // conflicting accent
+		"utf8-upper-lower",
+		"utf8-ci",     // ci needs a real locale
+		"utf8-ai",     // ai needs a real locale
+		"en_US-lower", // casing cannot combine with a locale
+		"en_US-bogus", // unknown modifier
+		"not a locale",
+	}
+	for _, spec := range bad {
+		t.Run(spec, func(t *testing.T) {
+			_, err := collation.Parse(spec)
+			require.Error(t, err)
+			require.ErrorIs(t, err, collation.ErrInvalidCollation)
+		})
+	}
+}
+
+func TestCaseInsensitiveEquality(t *testing.T) {
+	cmp := collation.MustParse("en_US-ci").Comparator()
+	assert.Equal(t, 0, cmp("APPLE", "apple"), "ci should treat APPLE == apple")
+	assert.NotEqual(t, 0, cmp("apple", "banana"))
+
+	// Binary comparison must distinguish case.
+	bin := collation.MustParse("utf8").Comparator()
+	assert.NotEqual(t, 0, bin("APPLE", "apple"))
+}
+
+func TestAccentInsensitiveEquality(t *testing.T) {
+	cmp := collation.MustParse("en_US-ci-ai").Comparator()
+	assert.Equal(t, 0, cmp("résumé", "resume"), "ai should treat résumé == resume")
+}
+
+func TestLocaleOrdering(t *testing.T) {
+	// In Swedish, 'ö' sorts after 'z'.
+	sv := collation.MustParse("sv").Comparator()
+	assert.Equal(t, 1, sign(sv("ö", "z")), "sv: ö sorts after z")
+
+	// In German (phonebook-style root behavior here), 'ö' sorts near 'o',
+	// i.e. before 'z'. This is the crux of why UTF-8 byte order is wrong for
+	// collated columns.
+	de := collation.MustParse("de_DE").Comparator()
+	assert.Equal(t, -1, sign(de("ö", "z")), "de: ö sorts before z")
+}
+
+func TestBinaryVsCollationOrderDiffer(t *testing.T) {
+	// 'a' (0x61) > 'B' (0x42) in UTF-8 byte order, but 'a' < 'B' under a
+	// case-insensitive collation. This divergence is exactly why pruning with
+	// UTF-8 bounds is unsafe for collated columns.
+	bin := collation.MustParse("utf8").Comparator()
+	assert.Equal(t, 1, sign(bin("a", "B")), "byte order: a > B")
+
+	ci := collation.MustParse("en_US-ci").Comparator()
+	assert.Equal(t, -1, sign(ci("a", "B")), "case-insensitive: a < B")
+}
+
+func TestTrimming(t *testing.T) {
+	rtrim := collation.MustParse("utf8-rtrim").Comparator()
+	assert.Equal(t, 0, rtrim("abc   ", "abc"))
+
+	notrim := collation.MustParse("utf8").Comparator()
+	assert.NotEqual(t, 0, notrim("abc   ", "abc"))
+}
+
+func TestCasing(t *testing.T) {
+	lower := collation.MustParse("utf8-lower").Comparator()
+	assert.Equal(t, 0, lower("HELLO", "hello"))
+}
+
+func TestKeyEqualityUnderCollation(t *testing.T) {
+	s := collation.MustParse("en_US-ci")
+	// Strings equal under the collation produce identical sort keys.
+	assert.Equal(t, s.Key("APPLE"), s.Key("apple"))
+	assert.NotEqual(t, s.Key("apple"), s.Key("banana"))
+}
+
+func TestEqualAndVersion(t *testing.T) {
+	assert.True(t, collation.Equal(nil, collation.MustParse("utf8")))
+	assert.True(t, collation.Equal(collation.MustParse("en_US"), collation.MustParse("en_US-cs")))
+	assert.False(t, collation.Equal(collation.MustParse("en_US"), collation.MustParse("en_US-ci")))
+
+	v1 := collation.MustParse("en_US-ci").WithVersion("153.88")
+	v2 := collation.MustParse("en_US-ci").WithVersion("154.0")
+	assert.False(t, collation.Equal(v1, v2), "different pinned versions are not equal")
+	assert.Equal(t, "153.88", v1.Version())
+}
+
+func TestComparatorConcurrentSafe(t *testing.T) {
+	cmp := collation.MustParse("en_US-ci").Comparator()
+	done := make(chan int, 8)
+	for range 8 {
+		go func() {
+			r := 0
+			for range 1000 {
+				r += cmp("Apple", "apple")
+			}
+			done <- r
+		}()
+	}
+	for range 8 {
+		assert.Equal(t, 0, <-done)
+	}
+}
diff --git a/collation_manifest_test.go b/collation_manifest_test.go
new file mode 100644
index 000000000..cc29f1e9c
--- /dev/null
+++ b/collation_manifest_test.go
@@ -0,0 +1,125 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg
+
+import (
+	"bytes"
+	"testing"
+
+	"github.com/apache/iceberg-go/collation"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+// TestCollationBoundsAvroRoundTrip writes a v3 manifest whose data file carries
+// collation-aware bounds, reads it back through the real Avro path, and verifies
+// the bounds survive — i.e. the data_file.collation_bounds field is a working
+// reference for the proposed manifest/spec change.
+func TestCollationBoundsAvroRoundTrip(t *testing.T) {
+	spec := collation.MustParse("en_US-ci").WithVersion("v1")
+	schema := NewSchema(
+		0,
+		NestedField{ID: 1, Name: "id", Type: PrimitiveTypes.Int64, Required: true},
+		NestedField{ID: 2, Name: "name", Type: StringTypeWithCollation(spec), Required: true},
+	)
+	pspec := NewPartitionSpecID(0) // unpartitioned
+
+	entry, ok := ComputeCollatedBounds(spec, []string{"Banana", "apple", "Cherry"})
+	require.True(t, ok)
+
+	snapshotID := int64(123)
+	entries := []ManifestEntry{
+		&manifestEntry{
+			EntryStatus: EntryStatusADDED,
+			Snapshot:    &snapshotID,
+			Data: &dataFile{
+				Content:             EntryContentData,
+				Path:                "/data/file.parquet",
+				Format:              ParquetFile,
+				PartitionData:       map[string]any{},
+				RecordCount:         3,
+				FileSize:            1000,
+				BlockSizeInBytes:    64 * 1024,
+				CollationBoundsData: mapToAvroColMap(map[int]CollationBoundEntry{2: entry}),
+			},
+		},
+	}
+
+	var buf bytes.Buffer
+	_, err := WriteManifest("/manifest.avro", &buf, 3, pspec, schema, snapshotID, entries)
+	require.NoError(t, err)
+
+	mf := &manifestFile{version: 3, Path: "/manifest.avro", Content: ManifestContentData}
+	got, err := ReadManifest(mf, bytes.NewReader(buf.Bytes()), false)
+	require.NoError(t, err)
+	require.Len(t, got, 1)
+
+	provider, ok := got[0].DataFile().(CollationBoundsProvider)
+	require.True(t, ok, "decoded data file should expose collation bounds")
+
+	bounds := provider.CollationBounds()
+	require.Contains(t, bounds, 2)
+	assert.Equal(t, "en_US-ci", bounds[2].Collation)
+	assert.Equal(t, "v1", bounds[2].Version)
+	assert.Equal(t, []byte("apple"), bounds[2].Lower, "lower bound is the original value, not a sort key")
+	assert.Equal(t, []byte("Cherry"), bounds[2].Upper)
+
+	// And the round-tripped bound is usable for pruning under a matching version.
+	assert.True(t, bounds[2].ValidFor(spec))
+}
+
+// TestManifestWithoutCollationBoundsRoundTrip confirms the new optional field is
+// backward compatible: a data file with no collation bounds round-trips with an
+// empty collation-bounds map and no error.
+func TestManifestWithoutCollationBoundsRoundTrip(t *testing.T) {
+	schema := NewSchema(
+		0,
+		NestedField{ID: 1, Name: "id", Type: PrimitiveTypes.Int64, Required: true},
+	)
+	pspec := NewPartitionSpecID(0)
+
+	snapshotID := int64(7)
+	entries := []ManifestEntry{
+		&manifestEntry{
+			EntryStatus: EntryStatusADDED,
+			Snapshot:    &snapshotID,
+			Data: &dataFile{
+				Content:          EntryContentData,
+				Path:             "/data/file.parquet",
+				Format:           ParquetFile,
+				PartitionData:    map[string]any{},
+				RecordCount:      1,
+				FileSize:         10,
+				BlockSizeInBytes: 64 * 1024,
+			},
+		},
+	}
+
+	var buf bytes.Buffer
+	_, err := WriteManifest("/manifest.avro", &buf, 3, pspec, schema, snapshotID, entries)
+	require.NoError(t, err)
+
+	mf := &manifestFile{version: 3, Path: "/manifest.avro", Content: ManifestContentData}
+	got, err := ReadManifest(mf, bytes.NewReader(buf.Bytes()), false)
+	require.NoError(t, err)
+	require.Len(t, got, 1)
+
+	provider, ok := got[0].DataFile().(CollationBoundsProvider)
+	require.True(t, ok)
+	assert.Empty(t, provider.CollationBounds())
+}
diff --git a/collation_metrics.go b/collation_metrics.go
new file mode 100644
index 000000000..2143014e0
--- /dev/null
+++ b/collation_metrics.go
@@ -0,0 +1,108 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg
+
+import "github.com/apache/iceberg-go/collation"
+
+// CollationBoundEntry is the collation-aware lower/upper bound for one column in
+// one data file, following the Delta approach to collation statistics. It is
+// both the on-disk Avro record (the data_file.collation_bounds map value) and
+// the in-memory value consumed by the metrics evaluator.
+//
+//   - Lower and Upper hold the ORIGINAL string values (their normal binary
+//     encoding), not ICU sort keys. Sort keys are not stable across ICU/CLDR
+//     versions, so storing them would couple every reader to one exact
+//     implementation; original values stay readable and let each engine compare
+//     with its own collator.
+//   - Collation and Version identify the collation and the implementation version
+//     under which the min/max were selected. A reader must only trust the bound
+//     when both match its own collation (see ValidFor), because a different
+//     version may order values differently and thus pick a different min/max.
+//
+// Unlike the original UTF-8 lower/upper bounds (the byte-order min/max), these
+// are the min/max under the column's COLLATION order, which can select entirely
+// different values (e.g. for {"Banana","apple"} the byte-min is "Banana" but the
+// case-insensitive min is "apple").
+//
+// Prototype note: the spec should generalize the data_file.collation_bounds map
+// value to a LIST of these entries (Delta's validForCollations), letting one file
+// carry bounds for several collation versions at once for smooth engine upgrades.
+// This prototype stores a single entry per column.
+type CollationBoundEntry struct {
+	Collation string `avro:"collation"`
+	Version   string `avro:"version"`
+	Lower     []byte `avro:"lower_bound"`
+	Upper     []byte `avro:"upper_bound"`
+}
+
+// ValidFor reports whether this bound may be used for collation-aware pruning
+// under spec. It requires the collation identity and a non-empty version to
+// match exactly: an unversioned or version-mismatched bound cannot be trusted to
+// be the true min/max under the reader's collation. This is the mechanism that
+// keeps pruning correct across ICU/CLDR version changes.
+func (e CollationBoundEntry) ValidFor(spec *collation.Spec) bool {
+	return e.Version != "" && spec.Version() == e.Version && e.Collation == spec.String()
+}
+
+// CollationBoundsProvider is an optional interface a DataFile may implement to
+// expose collation-aware bounds (the Delta statsWithCollation analogue) keyed by
+// field ID. The metrics evaluator consults it for collated columns; data files
+// that don't implement it (or carry no collation bounds) are conservatively kept.
+type CollationBoundsProvider interface {
+	CollationBounds() map[int]CollationBoundEntry
+}
+
+// ComputeCollatedBounds selects the collation-order minimum and maximum of
+// values and returns them as an original-value bound entry tagged with spec's
+// collation and version. It returns ok=false for a binary spec or empty input.
+// This is the write-side counterpart consumed by collation-aware pruning.
+func ComputeCollatedBounds(spec *collation.Spec, values []string) (CollationBoundEntry, bool) {
+	if spec.IsBinary() || len(values) == 0 {
+		return CollationBoundEntry{}, false
+	}
+
+	cmp := spec.Comparator()
+	lo, hi := values[0], values[0]
+	for _, v := range values[1:] {
+		if cmp(v, lo) < 0 {
+			lo = v
+		}
+		if cmp(v, hi) > 0 {
+			hi = v
+		}
+	}
+
+	encLo, err := StringLiteral(lo).MarshalBinary()
+	if err != nil {
+		return CollationBoundEntry{}, false
+	}
+	encHi, err := StringLiteral(hi).MarshalBinary()
+	if err != nil {
+		return CollationBoundEntry{}, false
+	}
+
+	// StringLiteral.MarshalBinary returns bytes that alias the input string's
+	// backing memory; copy so the stored bounds can't be corrupted by later
+	// reuse/mutation of the source buffer.
+	return CollationBoundEntry{
+		Collation: spec.String(),
+		Version:   spec.Version(),
+		Lower:     append([]byte(nil), encLo...),
+		Upper:     append([]byte(nil), encHi...),
+	}, true
+}
diff --git a/collation_schema_test.go b/collation_schema_test.go
new file mode 100644
index 000000000..ed87caf73
--- /dev/null
+++ b/collation_schema_test.go
@@ -0,0 +1,179 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg_test
+
+import (
+	"encoding/json"
+	"testing"
+
+	"github.com/apache/iceberg-go"
+	"github.com/apache/iceberg-go/collation"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+)
+
+func TestCollatedStringTypeEquals(t *testing.T) {
+	plain := iceberg.StringType{}
+	ciA := iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci"))
+	ciB := iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci"))
+	ai := iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci-ai"))
+	binary := iceberg.StringTypeWithCollation(collation.MustParse("utf8"))
+
+	assert.True(t, ciA.Equals(ciB))
+	assert.False(t, plain.Equals(ciA))
+	assert.False(t, ciA.Equals(ai))
+	// An explicit utf8 collation is equivalent to the default string type.
+	assert.True(t, plain.Equals(binary))
+	assert.True(t, binary.Equals(plain))
+}
+
+func TestCollatedStringTypeString(t *testing.T) {
+	assert.Equal(t, "string", iceberg.StringType{}.String())
+	assert.Equal(t, "string collate en_US-ci",
+		iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci")).String())
+}
+
+func TestCollatedStringTypeComparator(t *testing.T) {
+	cmp := iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci")).Comparator()
+	assert.Equal(t, 0, cmp("APPLE", "apple"))
+
+	plain := iceberg.StringType{}.Comparator()
+	assert.NotEqual(t, 0, plain("APPLE", "apple"))
+}
+
+func TestCollationSpecJSONRoundTrip(t *testing.T) {
+	field := iceberg.NestedField{
+		ID:       3,
+		Name:     "product_name",
+		Required: true,
+		Type:     iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci")),
+	}
+
+	data, err := json.Marshal(field)
+	require.NoError(t, err)
+	// type stays "string"; collation rides alongside as collation_spec.
+	assert.JSONEq(t, `{
+		"id": 3,
+		"name": "product_name",
+		"required": true,
+		"type": "string",
+		"collation_spec": {"name": "en_US-ci"}
+	}`, string(data))
+
+	var got iceberg.NestedField
+	require.NoError(t, json.Unmarshal(data, &got))
+	require.True(t, got.Type.Equals(field.Type))
+
+	st, ok := got.Type.(iceberg.StringType)
+	require.True(t, ok)
+	require.NotNil(t, st.Collation())
+	assert.Equal(t, "en_US-ci", st.Collation().String())
+}
+
+func TestCollationSpecJSONWithVersion(t *testing.T) {
+	field := iceberg.NestedField{
+		ID:   1,
+		Name: "c",
+		Type: iceberg.StringTypeWithCollation(
+			collation.MustParse("de_DE-ci").WithVersion("153.88.33.0"),
+		),
+	}
+
+	data, err := json.Marshal(field)
+	require.NoError(t, err)
+	assert.Contains(t, string(data), `"icu_collator_version":"153.88.33.0"`)
+
+	var got iceberg.NestedField
+	require.NoError(t, json.Unmarshal(data, &got))
+	st := got.Type.(iceberg.StringType)
+	assert.Equal(t, "153.88.33.0", st.Collation().Version())
+}
+
+func TestPlainStringHasNoCollationSpec(t *testing.T) {
+	field := iceberg.NestedField{ID: 1, Name: "c", Type: iceberg.PrimitiveTypes.String}
+	data, err := json.Marshal(field)
+	require.NoError(t, err)
+	assert.NotContains(t, string(data), "collation_spec")
+}
+
+func TestCollationSpecOnNonStringErrors(t *testing.T) {
+	const j = `{"id":1,"name":"c","required":false,"type":"long","collation_spec":{"name":"en_US-ci"}}`
+	var got iceberg.NestedField
+	err := json.Unmarshal([]byte(j), &got)
+	assert.Error(t, err)
+}
+
+func TestComputeCollatedBounds(t *testing.T) {
+	spec := collation.MustParse("en_US-ci").WithVersion("v1")
+
+	// Byte-order min/max of these is "Banana"/"apple"; collation-order (ci)
+	// min/max is "apple"/"Cherry".
+	entry, ok := iceberg.ComputeCollatedBounds(spec, []string{"Banana", "apple", "Cherry"})
+	require.True(t, ok)
+	assert.Equal(t, "v1", entry.Version)
+	assert.Equal(t, "en_US-ci", entry.Collation)
+
+	lower, err := iceberg.LiteralFromBytes(iceberg.PrimitiveTypes.String, entry.Lower)
+	require.NoError(t, err)
+	upper, err := iceberg.LiteralFromBytes(iceberg.PrimitiveTypes.String, entry.Upper)
+	require.NoError(t, err)
+	assert.Equal(t, iceberg.StringLiteral("apple"), lower)
+	assert.Equal(t, iceberg.StringLiteral("Cherry"), upper)
+
+	// Bounds carry the original values, not ICU sort keys.
+	assert.Equal(t, []byte("apple"), entry.Lower)
+
+	// A binary spec has no collation bounds.
+	_, ok = iceberg.ComputeCollatedBounds(collation.MustParse("utf8"), []string{"a", "b"})
+	assert.False(t, ok)
+}
+
+func TestCollationBoundEntryValidFor(t *testing.T) {
+	v1 := iceberg.CollationBoundEntry{Collation: "en_US-ci", Version: "v1", Lower: []byte("a"), Upper: []byte("z")}
+	assert.True(t, v1.ValidFor(collation.MustParse("en_US-ci").WithVersion("v1")))
+	assert.False(t, v1.ValidFor(collation.MustParse("en_US-ci").WithVersion("v2")), "version mismatch")
+	assert.False(t, v1.ValidFor(collation.MustParse("en_US-ci")), "unversioned reader cannot use bound")
+	assert.False(t, v1.ValidFor(collation.MustParse("de_DE-ci").WithVersion("v1")), "collation mismatch")
+
+	unversioned := iceberg.CollationBoundEntry{Collation: "en_US-ci", Lower: []byte("a"), Upper: []byte("z")}
+	assert.False(t, unversioned.ValidFor(collation.MustParse("en_US-ci").WithVersion("v1")))
+}
+
+func TestCollatedStringInSchemaRoundTrip(t *testing.T) {
+	sc := iceberg.NewSchema(
+		0,
+		iceberg.NestedField{ID: 1, Name: "id", Type: iceberg.PrimitiveTypes.Int64, Required: true},
+		iceberg.NestedField{
+			ID: 2, Name: "name", Required: true,
+			Type: iceberg.StringTypeWithCollation(collation.MustParse("en_US-ci")),
+		},
+	)
+
+	data, err := json.Marshal(sc)
+	require.NoError(t, err)
+
+	var got iceberg.Schema
+	require.NoError(t, json.Unmarshal(data, &got))
+
+	f, ok := got.FindFieldByName("name")
+	require.True(t, ok)
+	st, ok := f.Type.(iceberg.StringType)
+	require.True(t, ok)
+	require.NotNil(t, st.Collation())
+	assert.Equal(t, "en_US-ci", st.Collation().String())
+}
diff --git a/go.mod b/go.mod
index d94f4404d..0ffef14ca 100644
--- a/go.mod
+++ b/go.mod
@@ -61,6 +61,7 @@ require (
 	golang.org/x/oauth2 v0.36.0
 	golang.org/x/sync v0.21.0
 	golang.org/x/term v0.44.0
+	golang.org/x/text v0.38.0
 	google.golang.org/api v0.284.0
 	gopkg.in/yaml.v3 v3.0.1
 )
@@ -266,7 +267,6 @@ require (
 	golang.org/x/mod v0.36.0 // indirect
 	golang.org/x/net v0.55.0 // indirect
 	golang.org/x/sys v0.46.0 // indirect
-	golang.org/x/text v0.38.0 // indirect
 	golang.org/x/time v0.15.0 // indirect
 	golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da // indirect
 	google.golang.org/genproto v0.0.0-20260319201613-d00831a3d3e7 // indirect
diff --git a/internal/avro_schemas.go b/internal/avro_schemas.go
index 720f4be41..63d275425 100644
--- a/internal/avro_schemas.go
+++ b/internal/avro_schemas.go
@@ -138,6 +138,27 @@ func fieldNode(name string, typ avro.SchemaNode, fieldID int, opts ...func(*avro
 	return f
 }
 
+// collationBoundNode is the Avro record stored as the value of the prototype
+// data_file.collation_bounds map: collation-aware original-value bounds plus the
+// collation/version they were computed under.
+//
+// PROTOTYPE field IDs: 9000-9006 are a deliberately high, experimental range so
+// this demo does not collide with spec-reserved data_file field IDs. In
+// particular 146 (and the IDs immediately above it) is reserved for the V4
+// content_stats struct, and the spec reserves field IDs globally across
+// versions, so the prototype must stay out of that range until an official range
+// is reserved for collation bounds.
+var collationBoundNode = avro.SchemaNode{
+	Type: "record",
+	Name: "collation_bound_entry",
+	Fields: []avro.SchemaField{
+		fieldNode("collation", StringNode, 9003),
+		fieldNode("version", StringNode, 9004),
+		fieldNode("lower_bound", BytesNode, 9005),
+		fieldNode("upper_bound", BytesNode, 9006),
+	},
+}
+
 func withDoc(doc string) func(*avro.SchemaField) {
 	return func(f *avro.SchemaField) {
 		f.Doc = doc
@@ -400,6 +421,16 @@ func init() {
 				withDoc("The offset in the file where the content starts")),
 			fieldNode("content_size_in_bytes", NullableNode(LongNode), 145,
 				withDoc("The length of the referenced content stored in the file")),
+			// PROTOTYPE: collation-aware bounds (Delta-style original values +
+			// version) for collated string columns. Field IDs 9000-9006 are a
+			// high experimental range chosen to avoid the spec's globally-reserved
+			// data_file IDs (e.g. 146 = V4 content_stats); they are provisional and
+			// must be replaced with an officially reserved range before this is a
+			// real spec change.
+			fieldNode("collation_bounds",
+				NullableNode(newMapNode("k9001_v9002", IntNode, collationBoundNode, 9001, 9002)),
+				9000, withDefault(nil),
+				withDoc("map of column id to collation-aware original-value bounds (prototype)")),
 		},
 	})
 
diff --git a/literals.go b/literals.go
index 7a4989630..985fcd3a3 100644
--- a/literals.go
+++ b/literals.go
@@ -34,6 +34,7 @@ import (
 	"github.com/apache/arrow-go/v18/arrow"
 	"github.com/apache/arrow-go/v18/arrow/decimal128"
 	"github.com/apache/arrow-go/v18/parquet/variant"
+	"github.com/apache/iceberg-go/collation"
 	"github.com/google/uuid"
 )
 
@@ -603,7 +604,8 @@ func (Float32Literal) Comparator() Comparator[float32] { return cmp.Compare[floa
 func (f Float32Literal) Type() Type                    { return PrimitiveTypes.Float32 }
 func (f Float32Literal) Value() float32                { return float32(f) }
 func (f Float32Literal) Any() any                      { return f.Value() }
-func (f Float32Literal) String() string                { return strconv.FormatFloat(float64(f), 'g', -1, 32) }
+func (f Float32Literal) String() string { return strconv.FormatFloat(float64(f), 'g', -1, 32) }
+
 func (f Float32Literal) To(t Type) (Literal, error) {
 	switch t := t.(type) {
 	case Float32Type:
@@ -651,7 +653,8 @@ func (Float64Literal) Comparator() Comparator[float64] { return cmp.Compare[floa
 func (f Float64Literal) Type() Type                    { return PrimitiveTypes.Float64 }
 func (f Float64Literal) Value() float64                { return float64(f) }
 func (f Float64Literal) Any() any                      { return f.Value() }
-func (f Float64Literal) String() string                { return strconv.FormatFloat(float64(f), 'g', -1, 64) }
+func (f Float64Literal) String() string { return strconv.FormatFloat(float64(f), 'g', -1, 64) }
+
 func (f Float64Literal) To(t Type) (Literal, error) {
 	switch t := t.(type) {
 	case Float32Type:
@@ -848,9 +851,10 @@ func (t *TimestampLiteral) UnmarshalBinary(data []byte) error {
 type TimestampNsLiteral TimestampNano
 
 func (TimestampNsLiteral) Comparator() Comparator[TimestampNano] { return cmp.Compare[TimestampNano] }
-func (t TimestampNsLiteral) Type() Type                          { return PrimitiveTypes.TimestampNs }
-func (t TimestampNsLiteral) Value() TimestampNano                { return TimestampNano(t) }
-func (t TimestampNsLiteral) Any() any                            { return t.Value() }
+
+func (t TimestampNsLiteral) Type() Type           { return PrimitiveTypes.TimestampNs }
+func (t TimestampNsLiteral) Value() TimestampNano { return TimestampNano(t) }
+func (t TimestampNsLiteral) Any() any             { return t.Value() }
 func (t TimestampNsLiteral) String() string {
 	tm := TimestampNano(t).ToTime()
 
@@ -902,6 +906,24 @@ func (t *TimestampNsLiteral) UnmarshalBinary(data []byte) error {
 
 type StringLiteral string
 
+// CollatedStringComparator returns a string Comparator honoring spec. For a nil
+// or binary spec it is the default byte-order comparator, identical to
+// StringLiteral.Comparator().
+//
+// Note that a collation is a property of a column/field, not of an individual
+// literal value, so the collation-aware comparator is sourced from the field's
+// StringType (see StringType.Comparator) rather than from a StringLiteral. A
+// bare StringLiteral has no collation context and therefore always compares
+// byte-wise; this mirrors Delta Kernel, where predicates (not literals) carry
+// the collation and default to binary.
+func CollatedStringComparator(spec *collation.Spec) Comparator[string] {
+	if spec.IsBinary() {
+		return cmp.Compare[string]
+	}
+
+	return spec.Comparator()
+}
+
 func (StringLiteral) Comparator() Comparator[string] { return cmp.Compare[string] }
 func (s StringLiteral) Type() Type                   { return PrimitiveTypes.String }
 func (s StringLiteral) Value() string                { return string(s) }
diff --git a/manifest.go b/manifest.go
index 5be249b84..5322491bf 100644
--- a/manifest.go
+++ b/manifest.go
@@ -1541,7 +1541,7 @@ func (m *ManifestListWriter) AddManifests(files []ManifestFile) error {
 					ErrInvalidArgument, m.version, file.Version())
 			}
 
-			wrapped := *(file.(*manifestFile))
+			wrapped := *file.(*manifestFile)
 			if m.version == 3 {
 				// Ref: https://github.com/apache/iceberg/blob/ea2071568dc66148b483a82eefedcd2992b435f7/core/src/main/java/org/apache/iceberg/ManifestListWriter.java#L157-L168
 				if wrapped.Content == ManifestContentData && wrapped.FirstRowIDValue == nil {
@@ -1821,36 +1821,38 @@ func padOrTruncateBytes(bytes []byte, size int) []byte {
 }
 
 type dataFile struct {
-	Content                 ManifestEntryContent   `avro:"content"`
-	Path                    string                 `avro:"file_path"`
-	Format                  FileFormat             `avro:"file_format"`
-	PartitionData           map[string]any         `avro:"partition"`
-	RecordCount             int64                  `avro:"record_count"`
-	FileSize                int64                  `avro:"file_size_in_bytes"`
-	BlockSizeInBytes        int64                  `avro:"block_size_in_bytes"`
-	ColSizes                *[]colMap[int, int64]  `avro:"column_sizes"`
-	ValCounts               *[]colMap[int, int64]  `avro:"value_counts"`
-	NullCounts              *[]colMap[int, int64]  `avro:"null_value_counts"`
-	NaNCounts               *[]colMap[int, int64]  `avro:"nan_value_counts"`
-	DistinctCounts          *[]colMap[int, int64]  `avro:"distinct_counts"`
-	LowerBounds             *[]colMap[int, []byte] `avro:"lower_bounds"`
-	UpperBounds             *[]colMap[int, []byte] `avro:"upper_bounds"`
-	Key                     *[]byte                `avro:"key_metadata"`
-	Splits                  *[]int64               `avro:"split_offsets"`
-	EqualityIDs             *[]int                 `avro:"equality_ids"`
-	SortOrder               *int                   `avro:"sort_order_id"`
-	FirstRowIDField         *int64                 `avro:"first_row_id"`
-	ReferencedDataFileField *string                `avro:"referenced_data_file"`
-	ContentOffsetField      *int64                 `avro:"content_offset"`
-	ContentSizeInBytesField *int64                 `avro:"content_size_in_bytes"`
-
-	colSizeMap     map[int]int64
-	valCntMap      map[int]int64
-	nullCntMap     map[int]int64
-	nanCntMap      map[int]int64
-	distinctCntMap map[int]int64
-	lowerBoundMap  map[int][]byte
-	upperBoundMap  map[int][]byte
+	Content                 ManifestEntryContent                `avro:"content"`
+	Path                    string                              `avro:"file_path"`
+	Format                  FileFormat                          `avro:"file_format"`
+	PartitionData           map[string]any                      `avro:"partition"`
+	RecordCount             int64                               `avro:"record_count"`
+	FileSize                int64                               `avro:"file_size_in_bytes"`
+	BlockSizeInBytes        int64                               `avro:"block_size_in_bytes"`
+	ColSizes                *[]colMap[int, int64]               `avro:"column_sizes"`
+	ValCounts               *[]colMap[int, int64]               `avro:"value_counts"`
+	NullCounts              *[]colMap[int, int64]               `avro:"null_value_counts"`
+	NaNCounts               *[]colMap[int, int64]               `avro:"nan_value_counts"`
+	DistinctCounts          *[]colMap[int, int64]               `avro:"distinct_counts"`
+	LowerBounds             *[]colMap[int, []byte]              `avro:"lower_bounds"`
+	UpperBounds             *[]colMap[int, []byte]              `avro:"upper_bounds"`
+	CollationBoundsData     *[]colMap[int, CollationBoundEntry] `avro:"collation_bounds"`
+	Key                     *[]byte                             `avro:"key_metadata"`
+	Splits                  *[]int64                            `avro:"split_offsets"`
+	EqualityIDs             *[]int                              `avro:"equality_ids"`
+	SortOrder               *int                                `avro:"sort_order_id"`
+	FirstRowIDField         *int64                              `avro:"first_row_id"`
+	ReferencedDataFileField *string                             `avro:"referenced_data_file"`
+	ContentOffsetField      *int64                              `avro:"content_offset"`
+	ContentSizeInBytesField *int64                              `avro:"content_size_in_bytes"`
+
+	colSizeMap        map[int]int64
+	valCntMap         map[int]int64
+	nullCntMap        map[int]int64
+	nanCntMap         map[int]int64
+	distinctCntMap    map[int]int64
+	lowerBoundMap     map[int][]byte
+	upperBoundMap     map[int][]byte
+	collationBoundMap map[int]CollationBoundEntry
 
 	// used for partition retrieval
 	fieldNameToID          map[string]int
@@ -1886,6 +1888,7 @@ func (d *dataFile) initColumnStatsData() {
 		d.distinctCntMap = avroColMapToMap(d.DistinctCounts)
 		d.lowerBoundMap = avroColMapToMap(d.LowerBounds)
 		d.upperBoundMap = avroColMapToMap(d.UpperBounds)
+		d.collationBoundMap = avroColMapToMap(d.CollationBoundsData)
 	})
 }
 
@@ -2034,6 +2037,14 @@ func (d *dataFile) UpperBoundValues() map[int][]byte {
 	return d.upperBoundMap
 }
 
+// CollationBounds returns the collation-aware bounds (prototype) keyed by field
+// ID. Implements CollationBoundsProvider.
+func (d *dataFile) CollationBounds() map[int]CollationBoundEntry {
+	d.initColumnStatsData()
+
+	return d.collationBoundMap
+}
+
 func (d *dataFile) KeyMetadata() []byte {
 	if d.Key == nil {
 		return nil
@@ -2322,6 +2333,17 @@ func (b *DataFileBuilder) UpperBoundValues(bounds map[int][]byte) *DataFileBuild
 	return b
 }
 
+// WithCollationBounds sets the collation-aware bounds for the data file
+// (prototype). Named With… to distinguish the setter from the CollationBounds()
+// accessor. Note: collation_bounds is only present in the v3 data_file Avro
+// schema, so bounds set here are silently dropped when written to a v1/v2
+// manifest.
+func (b *DataFileBuilder) WithCollationBounds(bounds map[int]CollationBoundEntry) *DataFileBuilder {
+	b.d.CollationBoundsData = mapToAvroColMap(bounds)
+
+	return b
+}
+
 // KeyMetadata sets the key metadata for the data file.
 func (b *DataFileBuilder) KeyMetadata(key []byte) *DataFileBuilder {
 	b.d.Key = &key
@@ -2475,7 +2497,8 @@ type ManifestEntry interface {
 	wrap(status ManifestEntryStatus, snapshotID, seqNum, fileSeqNum *int64, datafile DataFile) ManifestEntry
 }
 
-var PositionalDeleteSchema = NewSchema(0,
+var PositionalDeleteSchema = NewSchema(
+	0,
 	NestedField{ID: 2147483546, Type: PrimitiveTypes.String, Name: "file_path", Required: true},
 	NestedField{ID: 2147483545, Type: PrimitiveTypes.Int64, Name: "pos", Required: true},
 )
diff --git a/table/collation_pruning_test.go b/table/collation_pruning_test.go
new file mode 100644
index 000000000..e6387856c
--- /dev/null
+++ b/table/collation_pruning_test.go
@@ -0,0 +1,278 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package table
+
+import (
+	"testing"
+
+	"github.com/apache/iceberg-go"
+	"github.com/apache/iceberg-go/collation"
+	"github.com/stretchr/testify/require"
+)
+
+// stringBound returns the binary encoding of a string min/max bound, as stored
+// in a data file's lower/upper bound maps.
+func stringBound(t *testing.T, s string) []byte {
+	t.Helper()
+	b, err := iceberg.StringLiteral(s).MarshalBinary()
+	require.NoError(t, err)
+
+	return b
+}
+
+// schemaWithName builds a single-string-column schema, optionally collated.
+func schemaWithName(spec *collation.Spec) *iceberg.Schema {
+	typ := iceberg.Type(iceberg.StringType{})
+	if spec != nil {
+		typ = iceberg.StringTypeWithCollation(spec)
+	}
+
+	return iceberg.NewSchema(
+		0,
+		iceberg.NestedField{ID: 1, Name: "name", Type: typ, Required: true},
+	)
+}
+
+// TestCollationPruningSafety shows that data-file pruning must not use the
+// stored UTF-8 byte-order bounds for a collated column, because collation order
+// disagrees with byte order. For each case, a plain (binary) string column
+// prunes the file — which would silently drop matching rows if the column were
+// actually collated — while the collated column correctly keeps the file.
+func TestCollationPruningSafety(t *testing.T) {
+	ci := collation.MustParse("en_US-ci")
+
+	cases := []struct {
+		name string
+		// file holds a single value (lower == upper) encoded as UTF-8 bounds.
+		fileValue string
+		expr      iceberg.BooleanExpression
+	}{
+		{
+			// File contains only "apple". Under en_US-ci, name = 'APPLE' matches
+			// it, but in byte order 'APPLE' < 'apple' so a binary evaluator drops
+			// the file.
+			name:      "equality APPLE vs apple",
+			fileValue: "apple",
+			expr:      iceberg.EqualTo(iceberg.Reference("name"), "APPLE"),
+		},
+		{
+			// File contains only "a". Under en_US-ci, 'a' < 'B' so name < 'B'
+			// matches, but in byte order 'a' (0x61) > 'B' (0x42) so a binary
+			// evaluator drops the file.
+			name:      "ordering a < B",
+			fileValue: "a",
+			expr:      iceberg.LessThan(iceberg.Reference("name"), "B"),
+		},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			bound := stringBound(t, tc.fileValue)
+			file := &mockDataFile{
+				path:        "file.parquet",
+				format:      iceberg.ParquetFile,
+				count:       1,
+				valueCounts: map[int]int64{1: 1},
+				nullCounts:  map[int]int64{1: 0},
+				lowerBounds: map[int][]byte{1: bound},
+				upperBounds: map[int][]byte{1: bound},
+			}
+
+			// Binary column: byte-order bounds are authoritative and the file is
+			// pruned. (Correct for a genuinely binary column; catastrophic if the
+			// column were collated and an engine ignored that.)
+			binEval, err := newInclusiveMetricsEvaluator(schemaWithName(nil), tc.expr, true, true)
+			require.NoError(t, err)
+			binMatch, err := binEval(file)
+			require.NoError(t, err)
+			require.False(t, binMatch, "binary column should prune the file via byte-order bounds")
+
+			// Collated column: byte-order bounds are not safe, so the file must be
+			// kept (it actually contains a matching value under the collation).
+			ciEval, err := newInclusiveMetricsEvaluator(schemaWithName(ci), tc.expr, true, true)
+			require.NoError(t, err)
+			ciMatch, err := ciEval(file)
+			require.NoError(t, err)
+			require.True(t, ciMatch, "collated column must keep the file; UTF-8 bounds cannot prune it")
+		})
+	}
+}
+
+// TestExplicitUtf8CollationStillPrunes confirms the guard is precise: an
+// explicit utf8 collation is byte-order-equivalent, so pruning still applies.
+func TestExplicitUtf8CollationStillPrunes(t *testing.T) {
+	bound := stringBound(t, "apple")
+	file := &mockDataFile{
+		path:        "file.parquet",
+		format:      iceberg.ParquetFile,
+		count:       1,
+		valueCounts: map[int]int64{1: 1},
+		nullCounts:  map[int]int64{1: 0},
+		lowerBounds: map[int][]byte{1: bound},
+		upperBounds: map[int][]byte{1: bound},
+	}
+
+	// "zzz" > "apple" in byte order, so an equality predicate prunes.
+	expr := iceberg.EqualTo(iceberg.Reference("name"), "zzz")
+	eval, err := newInclusiveMetricsEvaluator(schemaWithName(collation.MustParse("utf8")), expr, true, true)
+	require.NoError(t, err)
+	match, err := eval(file)
+	require.NoError(t, err)
+	require.False(t, match, "utf8 collation is byte-order-safe and should still prune")
+}
+
+// collatedMockDataFile augments mockDataFile with Delta-style collation bounds.
+type collatedMockDataFile struct {
+	*mockDataFile
+	collBounds map[int]iceberg.CollationBoundEntry
+}
+
+func (c *collatedMockDataFile) CollationBounds() map[int]iceberg.CollationBoundEntry {
+	return c.collBounds
+}
+
+// fileWithCollatedValues builds a data file whose single column holds the given
+// values, with collation-order min/max bounds tagged with the spec's version
+// (the Delta write path) plus the real byte-order min/max bounds a normal writer
+// would emit. The two disagree for collated data — which is the whole point —
+// and the byte-order bounds are deliberately left intact to prove the evaluator
+// ignores them for collated columns.
+func fileWithCollatedValues(t *testing.T, spec *collation.Spec, values ...string) *collatedMockDataFile {
+	t.Helper()
+	entry, ok := iceberg.ComputeCollatedBounds(spec, values)
+	require.True(t, ok)
+
+	byteMin, byteMax := values[0], values[0]
+	for _, v := range values[1:] {
+		if v < byteMin {
+			byteMin = v
+		}
+		if v > byteMax {
+			byteMax = v
+		}
+	}
+
+	return &collatedMockDataFile{
+		mockDataFile: &mockDataFile{
+			path:        "file.parquet",
+			format:      iceberg.ParquetFile,
+			count:       int64(len(values)),
+			valueCounts: map[int]int64{1: int64(len(values))},
+			nullCounts:  map[int]int64{1: 0},
+			lowerBounds: map[int][]byte{1: stringBound(t, byteMin)},
+			upperBounds: map[int][]byte{1: stringBound(t, byteMax)},
+		},
+		collBounds: map[int]iceberg.CollationBoundEntry{1: entry},
+	}
+}
+
+// TestDeltaAlignedCollationPruning shows collation-aware pruning using original-
+// value bounds selected under the collation order and gated by a matching
+// collation version. For {"Banana","apple"} under en_US-ci the collation-order
+// min/max are "apple"/"Banana" (not the byte-order "Banana"/"apple").
+func TestDeltaAlignedCollationPruning(t *testing.T) {
+	spec := collation.MustParse("en_US-ci").WithVersion("v1")
+	sc := schemaWithName(spec)
+
+	cases := []struct {
+		name       string
+		expr       iceberg.BooleanExpression
+		mightMatch bool
+	}{
+		{"eq above range pruned", iceberg.EqualTo(iceberg.Reference("name"), "CHERRY"), false},
+		{"eq in range kept (ci match)", iceberg.EqualTo(iceberg.Reference("name"), "BANANA"), true},
+		{"eq min kept", iceberg.EqualTo(iceberg.Reference("name"), "apple"), true},
+		{"less than min pruned", iceberg.LessThan(iceberg.Reference("name"), "A"), false},
+		{"greater than max pruned", iceberg.GreaterThan(iceberg.Reference("name"), "cherry"), false},
+		{"in with one match kept", iceberg.IsIn(iceberg.Reference("name"), "CHERRY", "banana"), true},
+		{"in with no match pruned", iceberg.IsIn(iceberg.Reference("name"), "cherry", "date"), false},
+	}
+
+	for _, tc := range cases {
+		t.Run(tc.name, func(t *testing.T) {
+			file := fileWithCollatedValues(t, spec, "Banana", "apple")
+			eval, err := newInclusiveMetricsEvaluator(sc, tc.expr, true, true)
+			require.NoError(t, err)
+			match, err := eval(file)
+			require.NoError(t, err)
+			require.Equal(t, tc.mightMatch, match)
+		})
+	}
+}
+
+// TestCollationVersionMismatchKeepsFile shows the version gate: bounds computed
+// under one collation version must not be trusted by a reader pinned to a
+// different version, since ordering (and thus the selected min/max) may differ.
+func TestCollationVersionMismatchKeepsFile(t *testing.T) {
+	writeSpec := collation.MustParse("en_US-ci").WithVersion("v1")
+	file := fileWithCollatedValues(t, writeSpec, "Banana", "apple")
+
+	// Reader pins v2; bounds are tagged v1, so they are ignored and the file is
+	// conservatively kept even though "CHERRY" would otherwise be pruned.
+	readSpec := collation.MustParse("en_US-ci").WithVersion("v2")
+	expr := iceberg.EqualTo(iceberg.Reference("name"), "CHERRY")
+	eval, err := newInclusiveMetricsEvaluator(schemaWithName(readSpec), expr, true, true)
+	require.NoError(t, err)
+	match, err := eval(file)
+	require.NoError(t, err)
+	require.True(t, match, "version-mismatched collation bounds must not prune")
+}
+
+// TestUnversionedCollationKeepsFile shows that without a pinned version, bounds
+// cannot be trusted (no stability guarantee), so the file is kept.
+func TestUnversionedCollationKeepsFile(t *testing.T) {
+	spec := collation.MustParse("en_US-ci") // no version
+	file := fileWithCollatedValues(t, spec, "Banana", "apple")
+	expr := iceberg.EqualTo(iceberg.Reference("name"), "CHERRY")
+	eval, err := newInclusiveMetricsEvaluator(schemaWithName(spec), expr, true, true)
+	require.NoError(t, err)
+	match, err := eval(file)
+	require.NoError(t, err)
+	require.True(t, match, "unversioned collation bounds must not prune")
+}
+
+// TestVersionedBoundsUnversionedReaderKeepsFile covers the other direction of
+// the gate: the file carries v1 bounds but the reader's schema pins no version,
+// so the bounds are not trusted and the file is kept.
+func TestVersionedBoundsUnversionedReaderKeepsFile(t *testing.T) {
+	file := fileWithCollatedValues(t, collation.MustParse("en_US-ci").WithVersion("v1"), "Banana", "apple")
+	readSpec := collation.MustParse("en_US-ci") // unversioned reader
+	expr := iceberg.EqualTo(iceberg.Reference("name"), "CHERRY")
+	eval, err := newInclusiveMetricsEvaluator(schemaWithName(readSpec), expr, true, true)
+	require.NoError(t, err)
+	match, err := eval(file)
+	require.NoError(t, err)
+	require.True(t, match, "versioned bounds must not prune for an unversioned reader")
+}
+
+// TestStrictEvaluatorConservativeForCollatedColumn shows the strict evaluator
+// (used for delete/scan planning, where it must prove ALL rows match) never
+// claims a match for an ordering predicate on a collated column, since byte-order
+// bounds can't prove collation-order containment. It returns rowsMightNotMatch
+// even where byte-order bounds would (wrongly) prove all rows match.
+func TestStrictEvaluatorConservativeForCollatedColumn(t *testing.T) {
+	spec := collation.MustParse("en_US-ci").WithVersion("v1")
+	file := fileWithCollatedValues(t, spec, "apple", "apple")
+	// Every value is "apple"; byte-order bounds would prove name <= "apple".
+	expr := iceberg.LessThanEqual(iceberg.Reference("name"), "apple")
+	eval, err := newStrictMetricsEvaluator(schemaWithName(spec), expr, true, true)
+	require.NoError(t, err)
+	allMatch, err := eval(file)
+	require.NoError(t, err)
+	require.False(t, allMatch, "strict eval must not claim all-match for a collated column")
+}
diff --git a/table/evaluators.go b/table/evaluators.go
index 2cff27fe5..23f7a7ce5 100644
--- a/table/evaluators.go
+++ b/table/evaluators.go
@@ -25,6 +25,7 @@ import (
 
 	"github.com/apache/arrow-go/v18/parquet/metadata"
 	"github.com/apache/iceberg-go"
+	"github.com/apache/iceberg-go/collation"
 	"github.com/apache/iceberg-go/table/internal"
 	"github.com/google/uuid"
 )
@@ -737,6 +738,11 @@ type inclusiveMetricsEval struct {
 	st                iceberg.StructType
 	expr              iceberg.BooleanExpression
 	includeEmptyFiles bool
+
+	// collation-aware bounds (Delta-style: original values + version), keyed by
+	// field ID. Populated from a DataFile that implements
+	// iceberg.CollationBoundsProvider; nil for files without collation stats.
+	collationBounds map[int]iceberg.CollationBoundEntry
 }
 
 func (m *inclusiveMetricsEval) TestRowGroup(rgmeta *metadata.RowGroupMetaData, colIndices []int) (bool, error) {
@@ -749,6 +755,11 @@ func (m *inclusiveMetricsEval) TestRowGroup(rgmeta *metadata.RowGroupMetaData, c
 	m.nanCounts = nil
 	m.lowerBounds = make(map[int][]byte)
 	m.upperBounds = make(map[int][]byte)
+	// Parquet row groups carry no collation-aware bounds (collations are not a
+	// Parquet concept), so clear any left over from a prior file-level Eval on a
+	// reused evaluator. Collated columns therefore conservatively keep every row
+	// group; row-group-level collation pruning is out of scope for this prototype.
+	m.collationBounds = nil
 
 	for _, c := range colIndices {
 		colMeta, err := rgmeta.ColumnChunk(c)
@@ -798,6 +809,9 @@ func (m *inclusiveMetricsEval) Eval(file iceberg.DataFile) (bool, error) {
 	ev.valueCounts, ev.nullCounts = file.ValueCounts(), file.NullValueCounts()
 	ev.nanCounts = file.NaNValueCounts()
 	ev.lowerBounds, ev.upperBounds = file.LowerBoundValues(), file.UpperBoundValues()
+	if p, ok := file.(iceberg.CollationBoundsProvider); ok {
+		ev.collationBounds = p.CollationBounds()
+	}
 
 	return iceberg.VisitExpr(m.expr, &ev)
 }
@@ -816,10 +830,145 @@ func (m *inclusiveMetricsEval) VisitUnbound(iceberg.UnboundPredicate) bool {
 	panic("need bound predicate")
 }
 
+// collationOrderSensitive reports whether an operation compares string values
+// (ordering, equality, prefix, or set membership) and therefore must not be
+// pruned using a collated column's UTF-8 byte-order bounds. Every such op is
+// intercepted for collated columns: those collatedMightMatch understands
+// (LT/LTEQ/GT/GTEQ/EQ/IN) are pruned with collation-aware bounds, and the rest
+// (NEQ/NotIn/StartsWith/NotStartsWith) are conservatively kept — byte-order
+// prefix/inequality pruning is unsafe once comparison is collation-defined.
+// Spelled out rather than a range check so it can't silently drift if the
+// Operation enum is reordered or extended.
+func collationOrderSensitive(op iceberg.Operation) bool {
+	switch op {
+	case iceberg.OpLT, iceberg.OpLTEQ, iceberg.OpGT, iceberg.OpGTEQ,
+		iceberg.OpEQ, iceberg.OpNEQ, iceberg.OpStartsWith, iceberg.OpNotStartsWith,
+		iceberg.OpIn, iceberg.OpNotIn:
+		return true
+	default:
+		return false
+	}
+}
+
+// fieldHasNonBinaryCollation reports whether the field is a string column with a
+// collation whose order differs from UTF-8 byte order.
+func fieldHasNonBinaryCollation(f iceberg.NestedField) bool {
+	st, ok := f.Type.(iceberg.StringType)
+
+	return ok && !st.Collation().IsBinary()
+}
+
 func (m *inclusiveMetricsEval) VisitBound(pred iceberg.BoundPredicate) bool {
+	// Collation handling for ordering/equality predicates. The stored lower/upper
+	// bounds are UTF-8 byte-order bounds, but a collated column orders by its
+	// collation, which can disagree with byte order ('a' > 'B' in UTF-8 but
+	// 'a' < 'B' case-insensitively), so byte-order bounds must not be used to
+	// prune. Instead we use the column's collation-aware bounds when the data
+	// file carries them for a matching collation version; otherwise we keep the
+	// file. Null/NaN predicates are collation-independent and evaluate normally.
+	if collationOrderSensitive(pred.Op()) {
+		field := pred.Ref().Field()
+		if st, ok := field.Type.(iceberg.StringType); ok && !st.Collation().IsBinary() {
+			if result, decided := m.evalCollated(pred, field.ID, st.Collation()); decided {
+				return result
+			}
+
+			return rowsMightMatch
+		}
+	}
+
 	return iceberg.VisitBoundPredicate(pred, m)
 }
 
+// evalCollated attempts collation-aware pruning for a predicate on a collated
+// string column. It returns (result, true) when it could make a decision using
+// version-matching collation bounds, or (_, false) when no usable bounds exist
+// (the caller then conservatively keeps the file).
+func (m *inclusiveMetricsEval) evalCollated(pred iceberg.BoundPredicate, fieldID int, spec *collation.Spec) (bool, bool) {
+	entry, ok := m.collationBounds[fieldID]
+	if !ok || !entry.ValidFor(spec) {
+		return false, false
+	}
+
+	if m.containsNullsOnly(fieldID) {
+		return rowsCannotMatch, true
+	}
+
+	lower, err := iceberg.LiteralFromBytes(iceberg.PrimitiveTypes.String, entry.Lower)
+	if err != nil {
+		return false, false
+	}
+	upper, err := iceberg.LiteralFromBytes(iceberg.PrimitiveTypes.String, entry.Upper)
+	if err != nil {
+		return false, false
+	}
+
+	lowerVal, ok1 := stringLiteralValue(lower)
+	upperVal, ok2 := stringLiteralValue(upper)
+	if !ok1 || !ok2 {
+		return false, false
+	}
+
+	cmp := spec.Comparator()
+
+	return collatedMightMatch(pred, cmp, lowerVal, upperVal)
+}
+
+func stringLiteralValue(lit iceberg.Literal) (string, bool) {
+	sl, ok := lit.(iceberg.StringLiteral)
+
+	return string(sl), ok
+}
+
+// collatedMightMatch applies the inclusive-metrics logic for ordering/equality
+// predicates using the collation comparator cmp against original-value bounds
+// [lower, upper]. It returns (result, true) for handled ops; for ops that this
+// prototype does not prune (NEQ, NotIn, StartsWith, NotStartsWith) it returns
+// (rowsMightMatch, true).
+func collatedMightMatch(pred iceberg.BoundPredicate, cmp func(a, b string) int, lower, upper string) (bool, bool) {
+	switch pred.Op() {
+	case iceberg.OpLT, iceberg.OpLTEQ, iceberg.OpGT, iceberg.OpGTEQ, iceberg.OpEQ:
+		lp, ok := pred.(iceberg.BoundLiteralPredicate)
+		if !ok {
+			return rowsMightMatch, true
+		}
+		val, ok := stringLiteralValue(lp.Literal())
+		if !ok {
+			return rowsMightMatch, true
+		}
+		switch pred.Op() {
+		case iceberg.OpLT:
+			return cmp(lower, val) < 0, true
+		case iceberg.OpLTEQ:
+			return cmp(lower, val) <= 0, true
+		case iceberg.OpGT:
+			return cmp(upper, val) > 0, true
+		case iceberg.OpGTEQ:
+			return cmp(upper, val) >= 0, true
+		case iceberg.OpEQ:
+			return cmp(lower, val) <= 0 && cmp(upper, val) >= 0, true
+		default:
+			return rowsMightMatch, true
+		}
+	case iceberg.OpIn:
+		sp, ok := pred.(iceberg.BoundSetPredicate)
+		if !ok {
+			return rowsMightMatch, true
+		}
+		for _, lit := range sp.Literals().Members() {
+			val, ok := stringLiteralValue(lit)
+			if ok && cmp(lower, val) <= 0 && cmp(upper, val) >= 0 {
+				return rowsMightMatch, true
+			}
+		}
+
+		return rowsCannotMatch, true
+	}
+
+	// NEQ, NotIn, StartsWith, NotStartsWith: not pruned in this prototype.
+	return rowsMightMatch, true
+}
+
 func (m *inclusiveMetricsEval) VisitIsNull(t iceberg.BoundTerm) bool {
 	fieldID := t.Ref().Field().ID
 	if cnt, exists := m.nullCounts[fieldID]; exists && cnt == 0 {
@@ -1282,6 +1431,13 @@ func (m *strictMetricsEval) VisitUnbound(iceberg.UnboundPredicate) bool {
 }
 
 func (m *strictMetricsEval) VisitBound(pred iceberg.BoundPredicate) bool {
+	// A collated column orders by its collation, not byte order, so the stored
+	// byte-order bounds cannot prove that all rows satisfy an ordering/equality
+	// predicate. Conservatively report that some rows might not match.
+	if collationOrderSensitive(pred.Op()) && fieldHasNonBinaryCollation(pred.Ref().Field()) {
+		return rowsMightNotMatch
+	}
+
 	return iceberg.VisitBoundPredicate(pred, m)
 }
 
diff --git a/types.go b/types.go
index 3be85017f..14a2c1523 100644
--- a/types.go
+++ b/types.go
@@ -27,6 +27,7 @@ import (
 	"time"
 
 	"github.com/apache/arrow-go/v18/arrow/decimal"
+	"github.com/apache/iceberg-go/collation"
 	"github.com/apache/iceberg-go/internal"
 	"github.com/geoarrow/geoarrow-go"
 )
@@ -288,20 +289,50 @@ func (n *NestedField) Equals(other NestedField) bool {
 		n.Type.Equals(other.Type)
 }
 
+// collationSpecJSON is the on-disk form of a collation annotation, following
+// the Iceberg collation proposal's collation_spec object. The proposal's
+// collation_metrics_id (a pseudo-field holding collation-key bounds) is omitted:
+// this prototype uses Delta-style original-value bounds (see CollationBoundEntry)
+// rather than the sort-key pseudo-field.
+//
+// PROTOTYPE: this JSON shape (name + icu_collator_version) is provisional and
+// tracks the current proposal; a Java reference implementation may settle on
+// different key names, in which case stored schemas would need migration. The
+// annotation is only (de)serialized at the NestedField level — a collated string
+// nested inside a list/map element is not yet preserved on round-trip.
+type collationSpecJSON struct {
+	Name               string `json:"name"`
+	ICUCollatorVersion string `json:"icu_collator_version,omitempty"`
+}
+
 func (n NestedField) MarshalJSON() ([]byte, error) {
 	type Alias NestedField
 
-	return json.Marshal(struct {
-		Type *typeIFace `json:"type"`
+	aux := struct {
+		Type      *typeIFace         `json:"type"`
+		Collation *collationSpecJSON `json:"collation_spec,omitempty"`
 		*Alias
-	}{Type: &typeIFace{n.Type}, Alias: (*Alias)(&n)})
+	}{Type: &typeIFace{n.Type}, Alias: (*Alias)(&n)}
+
+	// A collated string keeps its Iceberg type name "string" and carries the
+	// collation as a sibling collation_spec field, so engines that don't
+	// understand collations still read it as a plain string.
+	if st, ok := n.Type.(StringType); ok && !st.Collation().IsBinary() {
+		aux.Collation = &collationSpecJSON{
+			Name:               st.Collation().String(),
+			ICUCollatorVersion: st.Collation().Version(),
+		}
+	}
+
+	return json.Marshal(aux)
 }
 
 func (n *NestedField) UnmarshalJSON(b []byte) error {
 	type Alias NestedField
 	aux := struct {
-		ID   *int      `json:"id"`
-		Type typeIFace `json:"type"`
+		ID        *int               `json:"id"`
+		Type      typeIFace          `json:"type"`
+		Collation *collationSpecJSON `json:"collation_spec,omitempty"`
 		*Alias
 	}{
 		Alias: (*Alias)(n),
@@ -326,6 +357,20 @@ func (n *NestedField) UnmarshalJSON(b []byte) error {
 		return fmt.Errorf("%w: field %q is missing required 'type' key in JSON", ErrInvalidSchema, n.Name)
 	}
 
+	if aux.Collation != nil {
+		if _, ok := n.Type.(StringType); !ok {
+			return fmt.Errorf("%w: field %q has collation_spec but is not a string type", ErrInvalidSchema, n.Name)
+		}
+		spec, err := collation.Parse(aux.Collation.Name)
+		if err != nil {
+			return fmt.Errorf("%w: field %q: %w", ErrInvalidSchema, n.Name, err)
+		}
+		if v := aux.Collation.ICUCollatorVersion; v != "" {
+			spec = spec.WithVersion(v)
+		}
+		n.Type = StringTypeWithCollation(spec)
+	}
+
 	return nil
 }
 
@@ -749,17 +794,52 @@ func (TimestampTzType) primitive()     {}
 func (TimestampTzType) Type() string   { return "timestamptz" }
 func (TimestampTzType) String() string { return "timestamptz" }
 
-type StringType struct{}
+// StringType is the Iceberg string type. It optionally carries a collation
+// annotation that changes how values are compared and ordered without changing
+// how they are stored. The zero value (StringType{}) is the default UTF-8
+// byte-order string.
+//
+// Note: StringType now holds a pointer field, so two collated string types must
+// be compared with Equals, not ==. (== compares the spec pointer, not the
+// collation it denotes, and is unsafe as a map key for collated strings.)
+type StringType struct {
+	collation *collation.Spec
+}
 
-func (StringType) Equals(other Type) bool {
-	_, ok := other.(StringType)
+// StringTypeWithCollation returns a string type annotated with the given
+// collation. A nil spec yields the default (binary) string type.
+func StringTypeWithCollation(spec *collation.Spec) StringType {
+	return StringType{collation: spec}
+}
 
-	return ok
+// Collation returns the collation annotation, or nil for the default UTF-8
+// byte-order comparison.
+func (s StringType) Collation() *collation.Spec { return s.collation }
+
+// Comparator returns the string comparator implied by this type's collation:
+// byte order for a plain string, or the collation-aware comparator otherwise.
+func (s StringType) Comparator() Comparator[string] {
+	return CollatedStringComparator(s.collation)
+}
+
+func (s StringType) Equals(other Type) bool {
+	o, ok := other.(StringType)
+	if !ok {
+		return false
+	}
+
+	return collation.Equal(s.collation, o.collation)
 }
 
-func (StringType) primitive()     {}
-func (StringType) Type() string   { return "string" }
-func (StringType) String() string { return "string" }
+func (StringType) primitive()   {}
+func (StringType) Type() string { return "string" }
+func (s StringType) String() string {
+	if !s.collation.IsBinary() {
+		return "string collate " + s.collation.String()
+	}
+
+	return "string"
+}
 
 type UUIDType struct{}
 

From ed255bd775f61146b431c0ab102dbdaf5e1992fc Mon Sep 17 00:00:00 2001
From: laskoviymishka <laskoviymishka@gmail.com>
Date: Fri, 26 Jun 2026 10:44:37 +0200
Subject: [PATCH 2/2] style: fix gofmt alignment in literals.go

The gofmt formatter (run by golangci-lint in CI) requires the single-line
String() methods on Float32Literal/Float64Literal to be column-aligned with
their sibling methods. Align them.
---
 literals.go | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/literals.go b/literals.go
index 985fcd3a3..c7205a7ac 100644
--- a/literals.go
+++ b/literals.go
@@ -604,7 +604,7 @@ func (Float32Literal) Comparator() Comparator[float32] { return cmp.Compare[floa
 func (f Float32Literal) Type() Type                    { return PrimitiveTypes.Float32 }
 func (f Float32Literal) Value() float32                { return float32(f) }
 func (f Float32Literal) Any() any                      { return f.Value() }
-func (f Float32Literal) String() string { return strconv.FormatFloat(float64(f), 'g', -1, 32) }
+func (f Float32Literal) String() string                { return strconv.FormatFloat(float64(f), 'g', -1, 32) }
 
 func (f Float32Literal) To(t Type) (Literal, error) {
 	switch t := t.(type) {
@@ -653,7 +653,7 @@ func (Float64Literal) Comparator() Comparator[float64] { return cmp.Compare[floa
 func (f Float64Literal) Type() Type                    { return PrimitiveTypes.Float64 }
 func (f Float64Literal) Value() float64                { return float64(f) }
 func (f Float64Literal) Any() any                      { return f.Value() }
-func (f Float64Literal) String() string { return strconv.FormatFloat(float64(f), 'g', -1, 64) }
+func (f Float64Literal) String() string                { return strconv.FormatFloat(float64(f), 'g', -1, 64) }
 
 func (f Float64Literal) To(t Type) (Literal, error) {
 	switch t := t.(type) {