From 59c8f4b7a9a6e31dd5033b3c0a37e8d483f4809b Mon Sep 17 00:00:00 2001 From: blackmwk Date: Mon, 13 Apr 2026 16:29:51 +0800 Subject: [PATCH 01/45] Fix zizmor workflow (#2324) ## Which issue does this PR close? - Closes #2323 . ## What changes are included in this PR? ## Are these changes tested? ci --- .github/workflows/bindings_python_ci.yml | 2 +- .github/workflows/release_python.yml | 8 ++++---- .github/workflows/release_python_nightly.yml | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/.github/workflows/bindings_python_ci.yml b/.github/workflows/bindings_python_ci.yml index a7abfcbeed..a02ae9f0af 100644 --- a/.github/workflows/bindings_python_ci.yml +++ b/.github/workflows/bindings_python_ci.yml @@ -95,7 +95,7 @@ jobs: - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: 3.12 - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1 + - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 with: working-directory: "bindings/python" command: build diff --git a/.github/workflows/release_python.yml b/.github/workflows/release_python.yml index b19fa165dc..c9817e064c 100644 --- a/.github/workflows/release_python.yml +++ b/.github/workflows/release_python.yml @@ -124,13 +124,13 @@ jobs: env: NEEDS_VALIDATE_RELEASE_TAG_OUTPUTS_CARGO_VERSION: ${{ needs.validate-release-tag.outputs.cargo-version }} - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1 + - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 with: working-directory: "bindings/python" command: sdist args: -o dist - name: Upload sdist - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: wheels-sdist path: bindings/python/dist @@ -184,7 +184,7 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: ${{ 
steps.get-msrv.outputs.msrv }} - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1 + - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 with: target: ${{ matrix.target }} manylinux: ${{ matrix.manylinux || 'auto' }} @@ -192,7 +192,7 @@ jobs: command: build args: --release -o dist -i python3.12 # Explicitly set interpreter; manylinux containers have multiple Pythons and maturin may pick an older one - name: Upload wheels - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: wheels-${{ matrix.os }}-${{ matrix.target }} path: bindings/python/dist diff --git a/.github/workflows/release_python_nightly.yml b/.github/workflows/release_python_nightly.yml index 66ae0e1db2..55695784e9 100644 --- a/.github/workflows/release_python_nightly.yml +++ b/.github/workflows/release_python_nightly.yml @@ -48,14 +48,14 @@ jobs: with: timestamp: ${{ needs.set-version.outputs.TIMESTAMP }} - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1 + - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 with: working-directory: "bindings/python" command: sdist args: -o dist - name: Upload sdist - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: wheels-sdist path: bindings/python/dist @@ -98,7 +98,7 @@ jobs: with: rust-version: ${{ steps.get-msrv.outputs.msrv }} - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1 + - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 with: target: ${{ matrix.target }} manylinux: ${{ matrix.manylinux || 'auto' }} @@ -107,7 +107,7 @@ jobs: args: --release -o dist -i python3.12 # Explicitly set interpreter; manylinux containers have multiple Pythons and maturin may pick an older one - 
name: Upload wheels - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7 + uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 with: name: wheels-${{ matrix.os }}-${{ matrix.target }} path: bindings/python/dist From c6c0ce7a1033c221f43eec74ca53a8b36f1bb627 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Apr 2026 16:59:43 +0800 Subject: [PATCH 02/45] chore(deps): Bump crate-ci/typos from 1.44.0 to 1.45.0 (#2318) Bumps [crate-ci/typos](https://github.com/crate-ci/typos) from 1.44.0 to 1.45.0.
Release notes

Sourced from crate-ci/typos's releases.

v1.45.0

[1.45.0] - 2026-04-01

Features

  • Updated the dictionary with the March 2026 changes
Changelog

Sourced from crate-ci/typos's changelog.

Change Log

All notable changes to this project will be documented in this file.

The format is based on Keep a Changelog and this project adheres to Semantic Versioning.

[Unreleased] - ReleaseDate

[1.45.0] - 2026-04-01

Features

  • Updated the dictionary with the March 2026 changes

[1.44.0] - 2026-02-27

Features

[1.43.5] - 2026-02-16

Fixes

  • (pypi) Hopefully fix the sdist build

[1.43.4] - 2026-02-09

Fixes

  • Don't correct pincher

[1.43.3] - 2026-02-06

Fixes

  • (action) Adjust how typos are reported to github

[1.43.2] - 2026-02-05

Fixes

  • Don't correct certifi in Python

[1.43.1] - 2026-02-03

Fixes

  • Don't correct consts

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=crate-ci/typos&package-manager=github_actions&previous-version=1.44.0&new-version=1.45.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: blackmwk --- .github/workflows/ci_typos.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_typos.yml b/.github/workflows/ci_typos.yml index 9373c7295d..089ddfe8e2 100644 --- a/.github/workflows/ci_typos.yml +++ b/.github/workflows/ci_typos.yml @@ -47,4 +47,4 @@ jobs: with: persist-credentials: false - name: Check typos - uses: crate-ci/typos@631208b7aac2daa8b707f55e7331f9112b0e062d # v1.44.0 + uses: crate-ci/typos@02ea592e44b3a53c302f697cddca7641cd051c3d # v1.45.0 From 37d19b895c6f767e13697222630cd7a48ecec896 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:26:54 +0800 Subject: [PATCH 03/45] chore(deps): Bump taiki-e/install-action from 2.70.0 to 2.73.0 (#2319) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [taiki-e/install-action](https://github.com/taiki-e/install-action) from 2.70.0 to 2.73.0.
Release notes

Sourced from taiki-e/install-action's releases.

2.73.0

  • Introduce dependency cooldown when installing with taiki-e/install-action@<tool_name>, tool: <tool_name>@latest, or tool: <tool_name>@<omitted_version> to mitigate the risk of supply chain attacks by default. (#1666)

    This action without this cooldown already takes a few hours to a few days for new releases to be reflected (as with other common package managers that verify checksums or signatures), so this should not affect most users.

    See the "Security" section in readme for more details.

  • Improve robustness for network failure.

  • Documentation improvements.

2.72.0

  • Support cargo-xwin. (#1659, thanks @​daxpedda)

  • Support trailing comma in tool input option.

  • Update tombi@latest to 0.9.14.

2.71.3

  • Update wasm-tools@latest to 1.246.2.

  • Update mise@latest to 2026.4.3.

2.71.2

  • Implement workaround for windows-11-arm runner bug which sometimes causes installation failure. (#1657)

    This addresses an issue that was attempted to be worked around in 2.71.0 but was insufficient.

  • Update mise@latest to 2026.4.1.

  • Update uv@latest to 0.11.3.

2.71.1

  • Fix a regression that caused an execution policy violation on self-hosted Windows runner due to use of non-default powershell shell, introduced in 2.71.0.

  • Update dprint@latest to 0.53.2.

2.71.0

  • Support wasm-tools. (#1642, thanks @​crepererum)

  • Support covgate. (#1613, thanks @​jesse-black)

  • Implement potential workaround for windows-11-arm runner bug which sometimes causes issue that the action successfully completes but the tool is not installed. (#1647)

  • Update typos@latest to 1.45.0.

  • Update mise@latest to 2026.4.0.

  • Update cargo-careful@latest to 0.4.10.

... (truncated)

Changelog

Sourced from taiki-e/install-action's changelog.

Changelog

All notable changes to this project will be documented in this file.

This project adheres to Semantic Versioning.

[Unreleased]

  • Update vacuum@latest to 0.25.6.

  • Update gungraun-runner@latest to 0.18.1.

[2.75.7] - 2026-04-11

  • Update covgate@latest to 0.1.4.

  • Update wasm-bindgen@latest to 0.2.118.

[2.75.6] - 2026-04-11

  • Update mise@latest to 2026.4.8.

  • Update cargo-deny@latest to 0.19.1.

[2.75.5] - 2026-04-10

  • Update biome@latest to 2.4.11.

  • Update wasmtime@latest to 43.0.1.

  • Update uv@latest to 0.11.6.

  • Update mise@latest to 2026.4.7.

  • Update gungraun-runner@latest to 0.18.0.

[2.75.4] - 2026-04-10

  • Enhance security when cargo-binstall fallback is enabled. (08a38582, ba626b4d)

  • Update martin@latest to 1.5.0.

  • Update uv@latest to 0.11.5.

  • Update syft@latest to 1.42.4.

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=taiki-e/install-action&package-manager=github_actions&previous-version=2.70.0&new-version=2.73.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: blackmwk --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 071d6dbcbf..1949015462 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -163,7 +163,7 @@ jobs: - name: Install cargo-nextest if: matrix.test-suite.name == 'default' - uses: taiki-e/install-action@0fde6d128a3d980ceac30be8c8b8739abd963b81 # v2.70.0 + uses: taiki-e/install-action@0abfcd587b70a713fdaa7fb502c885e2112acb15 # v2.75.7 with: tool: cargo-nextest From d472bf8e06a7fa39e8aec98ff6cdc209631188b4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 13 Apr 2026 17:53:34 +0800 Subject: [PATCH 04/45] chore(deps): Bump aws-sdk-s3tables from 1.53.0 to 1.54.0 (#2320) Bumps [aws-sdk-s3tables](https://github.com/awslabs/aws-sdk-rust) from 1.53.0 to 1.54.0.
Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=aws-sdk-s3tables&package-manager=cargo&previous-version=1.53.0&new-version=1.54.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: blackmwk --- Cargo.lock | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5110d5a480..607bc0e754 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -157,7 +157,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -168,7 +168,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -665,9 +665,9 @@ dependencies = [ [[package]] name = "aws-sdk-s3tables" -version = "1.53.0" +version = "1.54.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c91febb29f5287a7b723dbacca6d81b1086b8ac0af6b35b873539ee19c74827f" +checksum = "2e0ec266873694efc365debded01f44e27a0de3946a3ac15d24c489759e5ddf8" dependencies = [ "aws-credential-types", "aws-runtime", @@ -1310,7 +1310,7 @@ version = "3.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] @@ -2504,7 +2504,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -2657,7 +2657,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -3287,7 +3287,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.3", + "socket2 0.5.10", 
"tokio", "tower-service", "tracing", @@ -3852,7 +3852,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -4386,7 +4386,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -5155,7 +5155,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.6.3", + "socket2 0.5.10", "thiserror 2.0.18", "tokio", "tracing", @@ -5192,7 +5192,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.3", + "socket2 0.5.10", "tracing", "windows-sys 0.60.2", ] @@ -5657,7 +5657,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -6166,7 +6166,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -6602,10 +6602,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" dependencies = [ "fastrand", - "getrandom 0.3.4", + "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -7501,7 +7501,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] From 99ffd68b85399e753b1d78f3923fbe049de09ac3 Mon Sep 17 00:00:00 2001 From: blackmwk Date: Tue, 14 Apr 2026 23:27:48 +0800 Subject: [PATCH 05/45] Fix ci workflow failure (#2325) ## Which issue does this PR close? - Closes #. 
## What changes are included in this PR? Our current CI failed due to an audit issue (see https://github.com/apache/iceberg-rust/pull/2321). This PR fixes the CI failure. ## Are these changes tested? CI. --- .github/workflows/audit.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index 68731cbed3..3f9865ed8a 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -37,7 +37,10 @@ on: - cron: '0 0 * * *' permissions: + # All other permissions are set to none contents: read + checks: write + issues: write jobs: security_audit: From 575ebcde2f487a20b6bce2e37b5988121f048413 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 15 Apr 2026 16:45:12 +0800 Subject: [PATCH 06/45] chore(deps): Bump tokio from 1.50.0 to 1.51.0 (#2321) Bumps [tokio](https://github.com/tokio-rs/tokio) from 1.50.0 to 1.51.0.
Release notes

Sourced from tokio's releases.

Tokio v1.51.0

1.51.0 (April 3rd, 2026)

Added

  • net: implement get_peer_cred on Hurd (#7989)
  • runtime: add tokio::runtime::worker_index() (#7921)
  • runtime: add runtime name (#7924)
  • runtime: stabilize LocalRuntime (#7557)
  • wasm: add wasm32-wasip2 networking support (#7933)

Changed

  • runtime: steal tasks from the LIFO slot (#7431)

Fixed

  • docs: do not show "Available on non-loom only." doc label (#7977)
  • macros: improve overall macro hygiene (#7997)
  • sync: fix notify_waiters priority in Notify (#7996)
  • sync: fix panic in Chan::recv_many when called with non-empty vector on closed channel (#7991)

#7431: tokio-rs/tokio#7431 #7557: tokio-rs/tokio#7557 #7921: tokio-rs/tokio#7921 #7924: tokio-rs/tokio#7924 #7933: tokio-rs/tokio#7933 #7977: tokio-rs/tokio#7977 #7989: tokio-rs/tokio#7989 #7991: tokio-rs/tokio#7991 #7996: tokio-rs/tokio#7996 #7997: tokio-rs/tokio#7997

Commits

Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 607bc0e754..ee70ed2302 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -157,7 +157,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -168,7 +168,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -2504,7 +2504,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -2657,7 +2657,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -2752,7 +2752,7 @@ checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78" dependencies = [ "cfg-if", "rustix", - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -3852,7 +3852,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -4205,9 +4205,9 @@ dependencies = [ [[package]] name = "mio" -version = "1.1.1" +version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +checksum = "50b7e5b27aa02a74bac8c3f23f448f8d87ff11f92d3aac1a6ed369ee08cc56c1" dependencies = [ "libc", "wasi", @@ -4386,7 +4386,7 @@ version = "0.50.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -5194,7 +5194,7 @@ dependencies = [ "once_cell", "socket2 0.5.10", "tracing", - "windows-sys 0.60.2", + "windows-sys 0.52.0", ] [[package]] @@ -5657,7 +5657,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -6166,7 +6166,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.60.2", + "windows-sys 0.61.2", ] [[package]] @@ -6476,6 +6476,7 @@ dependencies = [ "cfg-if", "libc", "psm", + "windows-sys 0.52.0", "windows-sys 0.59.0", ] @@ -6605,7 +6606,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.59.0", + "windows-sys 0.52.0", ] [[package]] @@ -6741,9 +6742,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.50.0" +version = "1.51.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +checksum = "f66bf9585cda4b724d3e78ab34b73fb2bbaba9011b9bfdf69dc836382ea13b8c" dependencies = [ "bytes", "libc", @@ -6758,9 +6759,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", From 8e884efa41a2a416d212a7aae0e8946967d5dc4e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 15 Apr 2026 17:01:21 +0800 Subject: [PATCH 
07/45] chore(deps): Bump minijinja from 2.18.0 to 2.19.0 (#2322) Bumps [minijinja](https://github.com/mitsuhiko/minijinja) from 2.18.0 to 2.19.0.
Changelog

Sourced from minijinja's changelog.

2.19.0

  • Fixed strict undefined behavior for comparison operators (such as ==), string concatenation (~), and undefined needles in the in operator to better match Jinja2. #886 #888
  • Fixed the default filter in strict undefined mode so an explicitly passed undefined fallback argument errors instead of being treated like a missing argument. #887
Commits
  • f15dc1e chore(release): 2.19.0
  • e04d276 fix(undefined): align strict undefined behavior with Jinja2
  • See full diff in compare view

Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ee70ed2302..73a3f311bc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4185,9 +4185,9 @@ dependencies = [ [[package]] name = "minijinja" -version = "2.18.0" +version = "2.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "328251e58ad8e415be6198888fc207502727dc77945806421ab34f35bf012e7d" +checksum = "805bfd7352166bae857ee569628b52bcd85a1cecf7810861ebceb1686b72b75d" dependencies = [ "memo-map", "serde", From b06b5734b87f64062c33bf65f73062356ba87d91 Mon Sep 17 00:00:00 2001 From: "R. Conner Howell" <5731503+rchowell@users.noreply.github.com> Date: Wed, 15 Apr 2026 02:54:16 -0700 Subject: [PATCH 08/45] fix(s3tables): use 's3' as the default scheme (#2313) ## Which issue does this PR close? - Closes #2312 ## What changes are included in this PR? Changes the s3tables default scheme to "s3://" ## Are these changes tested? With this change, I can read files resolved through an s3table catalog now. 
--- crates/catalog/s3tables/src/catalog.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/catalog/s3tables/src/catalog.rs b/crates/catalog/s3tables/src/catalog.rs index b88bd77d29..27d7be76ac 100644 --- a/crates/catalog/s3tables/src/catalog.rs +++ b/crates/catalog/s3tables/src/catalog.rs @@ -202,7 +202,7 @@ impl S3TablesCatalog { // Use provided factory or default to OpenDalStorageFactory::S3 let factory = storage_factory.unwrap_or_else(|| { Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), + configured_scheme: "s3".to_string(), customized_credential_load: None, }) }); From d6692fc5d1e2e3cea8aaea3d2cb62cfd0f6eec98 Mon Sep 17 00:00:00 2001 From: Jiajia Li Date: Wed, 15 Apr 2026 18:00:47 +0800 Subject: [PATCH 09/45] fix(storage/s3): default to virtual-host-style addressing (#2330) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? N/A ## What changes are included in this PR? The Java Iceberg SDK defaults [`s3.path-style-access` to `false`](https://github.com/apache/iceberg/blob/main/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java#L238-L240) i.e. virtual-host-style addressing is the spec default. opendal's `S3Config::default()`, which the opendal storage backend inherits from, uses path-style. As a result, any user building an iceberg-rust `FileIO` against AWS S3 or an S3-compatible service has to explicitly set `s3.path-style-access=false` to get the same behavior they would get from Java out of the box — and several S3-compatible endpoints only accept virtual-host-style URLs and fail outright with `SecondLevelDomainForbidden` (or equivalent) under the current default. ## Are these changes tested? 
Yes: - `cargo test -p iceberg --lib io::storage::config::s3::tests` - `cargo test -p iceberg-storage-opendal --lib s3::tests` --- crates/catalog/hms/tests/hms_catalog_test.rs | 6 +++- crates/catalog/loader/tests/common/mod.rs | 6 ++-- crates/iceberg/src/io/storage/config/s3.rs | 31 +++++++++++++++++-- crates/integration_tests/src/lib.rs | 5 ++- crates/storage/opendal/src/s3.rs | 31 +++++++++++++++++++ .../storage/opendal/tests/file_io_s3_test.rs | 7 ++++- .../opendal/tests/resolving_storage_test.rs | 5 ++- 7 files changed, 82 insertions(+), 9 deletions(-) diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index f19cf7bff4..c1ae4db7ee 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -23,7 +23,10 @@ use std::collections::HashMap; use std::sync::Arc; -use iceberg::io::{FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY}; +use iceberg::io::{ + FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION, + S3_SECRET_ACCESS_KEY, +}; use iceberg::{Catalog, CatalogBuilder, Namespace, NamespaceIdent}; use iceberg_catalog_hms::{ HMS_CATALOG_PROP_THRIFT_TRANSPORT, HMS_CATALOG_PROP_URI, HMS_CATALOG_PROP_WAREHOUSE, @@ -56,6 +59,7 @@ async fn get_catalog() -> HmsCatalog { (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), (S3_REGION.to_string(), "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS.to_string(), "true".to_string()), ]); // Wait for bucket to actually exist diff --git a/crates/catalog/loader/tests/common/mod.rs b/crates/catalog/loader/tests/common/mod.rs index 600cd9b6f4..1a3fb8d8f1 100644 --- a/crates/catalog/loader/tests/common/mod.rs +++ b/crates/catalog/loader/tests/common/mod.rs @@ -24,8 +24,8 @@ use std::fmt; use std::sync::Arc; use iceberg::io::{ - FileIOBuilder, LocalFsStorageFactory, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, -
S3_SECRET_ACCESS_KEY, + FileIOBuilder, LocalFsStorageFactory, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, + S3_REGION, S3_SECRET_ACCESS_KEY, }; use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; use iceberg::spec::{NestedField, PrimitiveType, Schema, Type}; @@ -229,6 +229,7 @@ async fn glue_catalog() -> GlueCatalog { (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), (S3_REGION.to_string(), "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS.to_string(), "true".to_string()), ]); let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { @@ -280,6 +281,7 @@ async fn hms_catalog() -> HmsCatalog { (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), (S3_REGION.to_string(), "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS.to_string(), "true".to_string()), ]); let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { diff --git a/crates/iceberg/src/io/storage/config/s3.rs b/crates/iceberg/src/io/storage/config/s3.rs index fae3a14757..64db47084e 100644 --- a/crates/iceberg/src/io/storage/config/s3.rs +++ b/crates/iceberg/src/io/storage/config/s3.rs @@ -69,8 +69,14 @@ pub const S3_DISABLE_CONFIG_LOAD: &str = "s3.disable-config-load"; /// /// This struct contains all the configuration options for connecting to Amazon S3. /// Use the builder pattern via `S3Config::builder()` to construct instances. -/// ``` -#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize, TypedBuilder)] +/// +/// Defaults follow the Iceberg `S3FileIOProperties` spec (see +/// [`PATH_STYLE_ACCESS_DEFAULT = false`](https://github.com/apache/iceberg/blob/main/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java)), +/// i.e. virtual-host-style addressing is enabled unless +/// `s3.path-style-access=true` is explicitly set. 
This matches what +/// Java clients do out of the box and is required for a number of +/// S3-compatible stores that do not support path-style URLs. +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, TypedBuilder)] pub struct S3Config { /// S3 endpoint URL. #[builder(default, setter(strip_option, into))] @@ -88,7 +94,9 @@ pub struct S3Config { #[builder(default, setter(strip_option, into))] pub region: Option, /// Enable virtual host style (opposite of path style access). - #[builder(default)] + /// + /// Defaults to `true` to match Iceberg `S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false`. + #[builder(default = true)] pub enable_virtual_host_style: bool, /// Server side encryption type. #[builder(default, setter(strip_option, into))] @@ -125,6 +133,12 @@ pub struct S3Config { pub disable_config_load: bool, } +impl Default for S3Config { + fn default() -> Self { + Self::builder().build() + } +} + impl TryFrom<&StorageConfig> for S3Config { type Error = crate::Error; @@ -267,6 +281,17 @@ mod tests { assert_eq!(s3_config.region.as_deref(), Some("eu-west-1")); } + #[test] + fn test_s3_config_default_is_virtual_host_style() { + // Matches Iceberg S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false. 
+ assert!(S3Config::default().enable_virtual_host_style); + assert!( + S3Config::try_from(&StorageConfig::new()) + .unwrap() + .enable_virtual_host_style + ); + } + #[test] fn test_s3_config_path_style_access() { let storage_config = StorageConfig::new().with_prop(S3_PATH_STYLE_ACCESS, "true"); diff --git a/crates/integration_tests/src/lib.rs b/crates/integration_tests/src/lib.rs index 4bf8f4d19c..feafa3ae9f 100644 --- a/crates/integration_tests/src/lib.rs +++ b/crates/integration_tests/src/lib.rs @@ -18,7 +18,9 @@ use std::collections::HashMap; use std::sync::OnceLock; -use iceberg::io::{S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY}; +use iceberg::io::{ + S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION, S3_SECRET_ACCESS_KEY, +}; use iceberg_catalog_rest::REST_CATALOG_PROP_URI; use iceberg_test_utils::{get_minio_endpoint, get_rest_catalog_endpoint, set_up}; @@ -45,6 +47,7 @@ impl GlobalTestFixture { (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), (S3_REGION.to_string(), "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS.to_string(), "true".to_string()), ]); GlobalTestFixture { catalog_config } diff --git a/crates/storage/opendal/src/s3.rs b/crates/storage/opendal/src/s3.rs index 7db88d273f..2e21418606 100644 --- a/crates/storage/opendal/src/s3.rs +++ b/crates/storage/opendal/src/s3.rs @@ -37,6 +37,12 @@ use crate::utils::{from_opendal_error, is_truthy}; /// Parse iceberg props to s3 config. pub(crate) fn s3_config_parse(mut m: HashMap) -> Result { let mut cfg = S3Config::default(); + // Match Iceberg `S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false`: + // virtual-host-style addressing is the spec default. opendal's own + // default is path-style, which disagrees with the Java SDK and breaks + // S3-compatible stores that only accept virtual-hosted-style URLs. + // Any explicit `s3.path-style-access` property below overrides this. 
+ cfg.enable_virtual_host_style = true; if let Some(endpoint) = m.remove(S3_ENDPOINT) { cfg.endpoint = Some(endpoint); }; @@ -177,3 +183,28 @@ impl AwsCredentialLoad for CustomAwsCredentialLoader { self.0.load_credential(client).await } } + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use iceberg::io::S3_PATH_STYLE_ACCESS; + + use super::s3_config_parse; + + fn parse_with(prop: Option<&str>) -> bool { + let mut props = HashMap::new(); + if let Some(v) = prop { + props.insert(S3_PATH_STYLE_ACCESS.to_string(), v.to_string()); + } + s3_config_parse(props).unwrap().enable_virtual_host_style + } + + #[test] + fn s3_config_parse_path_style_access() { + // Match Iceberg S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false. + assert!(parse_with(None)); + assert!(parse_with(Some("false"))); + assert!(!parse_with(Some("true"))); + } +} diff --git a/crates/storage/opendal/tests/file_io_s3_test.rs b/crates/storage/opendal/tests/file_io_s3_test.rs index 207a4454d7..a27afb6996 100644 --- a/crates/storage/opendal/tests/file_io_s3_test.rs +++ b/crates/storage/opendal/tests/file_io_s3_test.rs @@ -26,7 +26,8 @@ mod tests { use async_trait::async_trait; use futures::StreamExt; use iceberg::io::{ - FileIO, FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY, + FileIO, FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION, + S3_SECRET_ACCESS_KEY, }; use iceberg_storage_opendal::{CustomAwsCredentialLoader, OpenDalStorageFactory}; use iceberg_test_utils::{get_minio_endpoint, normalize_test_name_with_parts, set_up}; @@ -47,6 +48,7 @@ mod tests { (S3_ACCESS_KEY_ID, "admin".to_string()), (S3_SECRET_ACCESS_KEY, "password".to_string()), (S3_REGION, "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS, "true".to_string()), ]) .build() } @@ -139,6 +141,7 @@ mod tests { (S3_ENDPOINT, "http://localhost:9000".to_string()), ("bucket", "test-bucket".to_string()), (S3_REGION, "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS, 
"true".to_string()), ]); } @@ -160,6 +163,7 @@ mod tests { .with_props(vec![ (S3_ENDPOINT, minio_endpoint), (S3_REGION, "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS, "true".to_string()), ]) .build(); @@ -188,6 +192,7 @@ mod tests { .with_props(vec![ (S3_ENDPOINT, minio_endpoint), (S3_REGION, "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS, "true".to_string()), ]) .build(); diff --git a/crates/storage/opendal/tests/resolving_storage_test.rs b/crates/storage/opendal/tests/resolving_storage_test.rs index 4572ad2c2d..c235089508 100644 --- a/crates/storage/opendal/tests/resolving_storage_test.rs +++ b/crates/storage/opendal/tests/resolving_storage_test.rs @@ -29,7 +29,8 @@ mod tests { use std::sync::Arc; use iceberg::io::{ - FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY, + FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION, + S3_SECRET_ACCESS_KEY, }; use iceberg_storage_opendal::OpenDalResolvingStorageFactory; use iceberg_test_utils::{get_minio_endpoint, normalize_test_name_with_parts, set_up}; @@ -45,6 +46,7 @@ mod tests { (S3_ACCESS_KEY_ID, "admin".to_string()), (S3_SECRET_ACCESS_KEY, "password".to_string()), (S3_REGION, "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS, "true".to_string()), ]) .build() } @@ -288,6 +290,7 @@ mod tests { .with_props(vec![ (S3_ENDPOINT, minio_endpoint), (S3_REGION, "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS, "true".to_string()), ]) .build(); From 3fe7be196cae9e119747de3bc3fcae5818e2e9b0 Mon Sep 17 00:00:00 2001 From: blackmwk Date: Thu, 16 Apr 2026 01:30:19 +0800 Subject: [PATCH 10/45] Fix RUSTSEC-2026-0097 (#2331) ## Which issue does this PR close? - Closes #2327 #2328 #2329 ## What changes are included in this PR? Upgrade rnd version, use recommended api. ## Are these changes tested? ut. 
--- .cargo/audit.toml | 5 ++ Cargo.lock | 57 +++++++++---------- Cargo.toml | 2 +- .../src/expr/visitors/page_index_evaluator.rs | 6 +- .../visitors/row_group_metrics_evaluator.rs | 6 +- .../src/writer/file_writer/rolling_writer.rs | 2 +- 6 files changed, 41 insertions(+), 37 deletions(-) diff --git a/.cargo/audit.toml b/.cargo/audit.toml index 09e2d35c50..71354ea3a5 100644 --- a/.cargo/audit.toml +++ b/.cargo/audit.toml @@ -33,4 +33,9 @@ ignore = [ # # Introduced by object_store, see https://github.com/apache/arrow-rs-object-store/issues/564 "RUSTSEC-2025-0134", + # `rand` unsoundness with custom logger using `rand::rng()` + # + # Direct dependency upgraded to 0.9.3+. Transitive rand 0.8.5 remains + # from reqsign/sqllogictest/rustc-hash — no 0.8.x patch exists. + "RUSTSEC-2026-0097", ] diff --git a/Cargo.lock b/Cargo.lock index 73a3f311bc..297b566f46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -193,7 +193,7 @@ dependencies = [ "miniz_oxide", "num-bigint", "quad-rand", - "rand 0.9.2", + "rand 0.9.4", "regex-lite", "serde", "serde_bytes", @@ -1310,7 +1310,7 @@ version = "3.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.61.2", ] [[package]] @@ -1673,7 +1673,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand 0.9.2", + "rand 0.9.4", "regex", "sqlparser", "tempfile", @@ -1825,7 +1825,7 @@ dependencies = [ "liblzma", "log", "object_store", - "rand 0.9.2", + "rand 0.9.4", "tokio", "tokio-util", "url", @@ -1978,7 +1978,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand 0.9.2", + "rand 0.9.4", "tempfile", "url", ] @@ -2044,7 +2044,7 @@ dependencies = [ "md-5", "memchr", "num-traits", - "rand 0.9.2", + "rand 0.9.4", "regex", "sha2", "unicode-segmentation", @@ -2344,7 +2344,7 @@ dependencies = [ "datafusion-functions-nested", "log", "percent-encoding", - "rand 0.9.2", + "rand 
0.9.4", "serde_json", "sha1", "sha2", @@ -2657,7 +2657,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -2752,7 +2752,7 @@ checksum = "0ce92ff622d6dadf7349484f42c93271a0d49b7cc4d466a936405bacbe10aa78" dependencies = [ "cfg-if", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -3287,7 +3287,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.5.10", + "socket2 0.6.3", "tokio", "tower-service", "tracing", @@ -3356,7 +3356,7 @@ dependencies = [ "ordered-float 4.6.0", "parquet", "pretty_assertions", - "rand 0.8.5", + "rand 0.9.4", "regex", "reqwest", "roaring", @@ -3852,7 +3852,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -4257,7 +4257,7 @@ dependencies = [ "hyper-util", "log", "pin-project-lite", - "rand 0.9.2", + "rand 0.9.4", "regex", "serde_json", "serde_urlencoded", @@ -4515,7 +4515,7 @@ dependencies = [ "parking_lot", "percent-encoding", "quick-xml 0.39.2", - "rand 0.10.0", + "rand 0.10.1", "reqwest", "ring", "rustls-pki-types", @@ -5155,7 +5155,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.5.10", + "socket2 0.6.3", "thiserror 2.0.18", "tokio", "tracing", @@ -5171,7 +5171,7 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.2", + "rand 0.9.4", "ring", "rustc-hash", "rustls", @@ -5192,9 +5192,9 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.5.10", + "socket2 0.6.3", "tracing", - "windows-sys 0.52.0", + "windows-sys 0.60.2", ] [[package]] @@ -5251,9 +5251,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.2" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.5", @@ -5261,9 +5261,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", "getrandom 0.4.2", @@ -5657,7 +5657,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -5699,9 +5699,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.10" +version = "0.103.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +checksum = "8279bb85272c9f10811ae6a6c547ff594d6a7f3c6c6b02ee9726d1d0dcfcdd06" dependencies = [ "aws-lc-rs", "ring", @@ -6476,7 +6476,6 @@ dependencies = [ "cfg-if", "libc", "psm", - "windows-sys 0.52.0", "windows-sys 0.59.0", ] @@ -6606,7 +6605,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.52.0", + "windows-sys 0.61.2", ] [[package]] @@ -7248,7 +7247,7 @@ dependencies = [ "nix 0.29.0", "once_cell", "pin-project", - "rand 0.9.2", + "rand 0.9.4", "socket2 0.5.10", "thiserror 2.0.18", "tokio", @@ -7502,7 +7501,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.48.0", + "windows-sys 0.61.2", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 778e69c9d9..2f5a515ef0 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -108,7 +108,7 @@ ordered-float = "4" parquet = "58" pilota = "0.11.10" pretty_assertions = "1.4" -rand = "0.8.5" +rand 
= "0.9.3" regex = "1.11.3" reqwest = { version = "0.12.12", default-features = false, features = ["json"] } roaring = { version = "0.11" } diff --git a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs index 96d1c651cd..4cd676dab1 100644 --- a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs @@ -793,7 +793,7 @@ mod tests { }; use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData}; use parquet::file::properties::WriterProperties; - use rand::{Rng, thread_rng}; + use rand::Rng; use tempfile::NamedTempFile; use super::PageIndexEvaluator; @@ -1284,13 +1284,13 @@ mod tests { #[test] fn eval_in_length_of_set_above_limit_all_rows() -> Result<()> { - let mut rng = thread_rng(); + let mut rng = rand::rng(); let (metadata, _temp_file) = create_test_parquet_file()?; let (column_index, offset_index, row_group_metadata) = get_test_metadata(&metadata); let (iceberg_schema_ref, field_id_map) = build_iceberg_schema_and_field_map()?; let filter = Reference::new("col_float") - .is_in(std::iter::repeat_with(|| Datum::float(rng.gen_range(0.0..10.0))).take(1000)) + .is_in(std::iter::repeat_with(|| Datum::float(rng.random_range(0.0..10.0))).take(1000)) .bind(iceberg_schema_ref.clone(), false)?; let result = PageIndexEvaluator::eval( diff --git a/crates/iceberg/src/expr/visitors/row_group_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/row_group_metrics_evaluator.rs index 0506b33af0..ad7e19f548 100644 --- a/crates/iceberg/src/expr/visitors/row_group_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/row_group_metrics_evaluator.rs @@ -528,7 +528,7 @@ mod tests { use parquet::schema::types::{ ColumnDescriptor, ColumnPath, SchemaDescriptor, Type as parquetSchemaType, }; - use rand::{Rng, thread_rng}; + use rand::Rng; use super::RowGroupMetricsEvaluator; use crate::Result; @@ -1617,7 +1617,7 @@ mod tests { #[test] fn 
eval_true_for_too_many_literals_filter_is_in() -> Result<()> { - let mut rng = thread_rng(); + let mut rng = rand::rng(); let row_group_metadata = create_row_group_metadata( 1, @@ -1636,7 +1636,7 @@ mod tests { let (iceberg_schema_ref, field_id_map) = build_iceberg_schema_and_field_map()?; let filter = Reference::new("col_float") - .is_in(std::iter::repeat_with(|| Datum::float(rng.gen_range(0.0..10.0))).take(1000)) + .is_in(std::iter::repeat_with(|| Datum::float(rng.random_range(0.0..10.0))).take(1000)) .bind(iceberg_schema_ref.clone(), false)?; let result = RowGroupMetricsEvaluator::eval( diff --git a/crates/iceberg/src/writer/file_writer/rolling_writer.rs b/crates/iceberg/src/writer/file_writer/rolling_writer.rs index b86f6a2ea7..b0b2d2f191 100644 --- a/crates/iceberg/src/writer/file_writer/rolling_writer.rs +++ b/crates/iceberg/src/writer/file_writer/rolling_writer.rs @@ -399,7 +399,7 @@ mod tests { "Kelly", "Larry", "Mallory", "Shawn", ]; - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); let batch_num = 10; let batch_rows = 100; let expected_rows = batch_num * batch_rows; From 5334dcb5ba670377c974b49c87c8c273f4c7181d Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Wed, 15 Apr 2026 11:43:46 -0700 Subject: [PATCH 11/45] chore: update comment tag to match the exact version to fix zizmor (#2333) ## Which issue does this PR close? - Closes #. ## What changes are included in this PR? - update codeql version from 4.35.1 to 4.35.2 - update comment tag ## Are these changes tested? 
--- .github/workflows/codeql.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 81bc6b16f8..7e9c8208c8 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -46,11 +46,11 @@ jobs: persist-credentials: false - name: Initialize CodeQL - uses: github/codeql-action/init@c10b8064de6f491fea524254123dbe5e09572f13 # v4 + uses: github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 with: languages: actions - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@c10b8064de6f491fea524254123dbe5e09572f13 # v4 + uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 with: category: "/language:actions" From e285c7957fc6860a654f7da10c92c6d93ac90531 Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Wed, 15 Apr 2026 12:08:48 -0700 Subject: [PATCH 12/45] ci: fix zizmor workflow (#2334) --- .github/workflows/asf-allowlist-check.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/asf-allowlist-check.yml b/.github/workflows/asf-allowlist-check.yml index d4e84c5922..65dbe8bcbe 100644 --- a/.github/workflows/asf-allowlist-check.yml +++ b/.github/workflows/asf-allowlist-check.yml @@ -43,5 +43,4 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: persist-credentials: false - # Intentionally unpinned to always use the latest allowlist from the ASF. 
- - uses: apache/infrastructure-actions/allowlist-check@main # zizmor: ignore[unpinned-uses] + - uses: apache/infrastructure-actions/allowlist-check@4e9c961f587f72b170874b6f5cd4ac15f7f26eb8 # main From d118aa84b863f463a1a7dda7fea5af7ee16c5cf6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 15 Apr 2026 14:08:22 -0700 Subject: [PATCH 13/45] chore(deps): Bump astral-sh/setup-uv from 7.3.1 to 8.0.0 (#2314) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [astral-sh/setup-uv](https://github.com/astral-sh/setup-uv) from 7.3.1 to 8.0.0.
Release notes

Sourced from astral-sh/setup-uv's releases.

v8.0.0 🌈 Immutable releases and secure tags

This is the first immutable release of setup-uv 🥳

All future releases are also immutable. If you want to know more about what this means, check out the docs.

This release also has two breaking changes

New format for manifest-file

The previously deprecated way of defining a custom version manifest to control which uv versions are available and where to download them from has been removed. The functionality is still there, but you have to use the new format.

No more major and minor tags

To increase security even more, we will stop publishing major and minor tags. You won't be able to use @v8 or @v8.0 any longer. We do this because pinning to major releases opens users up to supply chain attacks like the one that happened to tj-actions.

[!TIP] Use the immutable tag as a version: astral-sh/setup-uv@v8.0.0. Or, even better, use the git hash: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57

🚨 Breaking changes

🧰 Maintenance

v7.6.0 🌈 Fetch uv from Astral's mirror by default

Changes

We now download uv from releases.astral.sh by default. This means that, by default, we don't hit the GitHub API at all and shouldn't see any rate limits or timeouts any more.

🚀 Enhancements

🧰 Maintenance

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=astral-sh/setup-uv&package-manager=github_actions&previous-version=7.3.1&new-version=8.0.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Shawn Chang --- .github/workflows/bindings_python_ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/bindings_python_ci.yml b/.github/workflows/bindings_python_ci.yml index a02ae9f0af..916113e06f 100644 --- a/.github/workflows/bindings_python_ci.yml +++ b/.github/workflows/bindings_python_ci.yml @@ -63,7 +63,7 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: persist-credentials: false - - uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # v7.3.1 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 with: version: "0.9.3" enable-cache: true @@ -100,7 +100,7 @@ jobs: working-directory: "bindings/python" command: build args: --out dist -i python3.12 # Explicitly set interpreter; manylinux containers have multiple Pythons and maturin may pick an older one - - uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # v7.3.1 + - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 with: version: "0.9.3" enable-cache: true From 2890a6190eec612d231f34465133e8f1053f0f95 Mon Sep 17 00:00:00 2001 From: Daniel Carl Jones Date: Wed, 15 Apr 2026 22:10:42 +0100 Subject: [PATCH 14/45] tests(s3tables): add s3tables end-to-end test to verify table creation/loading plus reading/writing (#2315) ## Which issue does this PR close? - Extends #2313 with a test. In particular, I've written a test that verifies creating a table and writing to it, and then loading that table from the catalog and reading from it using the default storage backend. ## What changes are included in this PR? This adds a test that currently fails and will be passing once #2313 is merged. ## Are these changes tested? This change is only a new test. 
Signed-off-by: Daniel Carl Jones --- Cargo.lock | 5 ++ crates/catalog/s3tables/Cargo.toml | 5 ++ crates/catalog/s3tables/src/catalog.rs | 105 +++++++++++++++++++++++++ 3 files changed, 115 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 297b566f46..66390fd209 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3468,14 +3468,19 @@ name = "iceberg-catalog-s3tables" version = "0.9.0" dependencies = [ "anyhow", + "arrow-array", + "arrow-schema", "async-trait", "aws-config", "aws-sdk-s3tables", + "futures", "iceberg", "iceberg-storage-opendal", "iceberg_test_utils", "itertools 0.13.0", + "parquet", "tokio", + "uuid", ] [[package]] diff --git a/crates/catalog/s3tables/Cargo.toml b/crates/catalog/s3tables/Cargo.toml index 2fe096fec9..dc7be3027f 100644 --- a/crates/catalog/s3tables/Cargo.toml +++ b/crates/catalog/s3tables/Cargo.toml @@ -39,6 +39,11 @@ iceberg-storage-opendal = { workspace = true, features = ["opendal-s3"] } [dev-dependencies] +arrow-array = { workspace = true } +arrow-schema = { workspace = true } +futures = { workspace = true } iceberg_test_utils = { path = "../../test_utils", features = ["tests"] } itertools = { workspace = true } +parquet = { workspace = true } tokio = { workspace = true } +uuid = { workspace = true } diff --git a/crates/catalog/s3tables/src/catalog.rs b/crates/catalog/s3tables/src/catalog.rs index 27d7be76ac..e19f5ae092 100644 --- a/crates/catalog/s3tables/src/catalog.rs +++ b/crates/catalog/s3tables/src/catalog.rs @@ -707,6 +707,7 @@ where T: std::fmt::Debug { #[cfg(test)] mod tests { + use futures::TryStreamExt; use iceberg::spec::{NestedField, PrimitiveType, Schema, Type}; use iceberg::transaction::{ApplyTransactionAction, Transaction}; @@ -1175,4 +1176,108 @@ mod tests { assert_eq!(err.message(), "Catalog name cannot be empty"); } } + + /// Verify that an S3 Table catalog can create a table, write data, load the same table, and read from it. 
+ #[tokio::test] + async fn test_s3tables_create_table_write_load_table_read() { + use iceberg::writer::base_writer::data_file_writer::DataFileWriterBuilder; + use iceberg::writer::file_writer::ParquetWriterBuilder; + use iceberg::writer::file_writer::location_generator::{ + DefaultFileNameGenerator, DefaultLocationGenerator, + }; + use iceberg::writer::file_writer::rolling_writer::RollingFileWriterBuilder; + use iceberg::writer::{IcebergWriter, IcebergWriterBuilder}; + + let catalog = match load_s3tables_catalog_from_env().await { + Ok(Some(c)) => c, + Ok(None) => return, + Err(e) => panic!("Error loading catalog: {e}"), + }; + + let ns = NamespaceIdent::new(format!("test_rw_{}", uuid::Uuid::new_v4().simple())); + catalog.create_namespace(&ns, HashMap::new()).await.unwrap(); + + let table_name = String::from("table"); + + let schema = Schema::builder() + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(); + let creation = TableCreation::builder() + .name(table_name.clone()) + .schema(schema) + .build(); + + let table = catalog.create_table(&ns, creation).await.unwrap(); + + // Write one row. + let arrow_schema: Arc = Arc::new( + table + .metadata() + .current_schema() + .as_ref() + .try_into() + .unwrap(), + ); + let batch = arrow_array::RecordBatch::try_new(arrow_schema, vec![Arc::new( + arrow_array::Int32Array::from(vec![42]), + )]) + .unwrap(); + + // Locations will be generated based on the table metadata, which will be using `s3://` for Amazon S3 Tables. 
+ let location_generator = DefaultLocationGenerator::new(table.metadata().clone()).unwrap(); + let file_name_generator = DefaultFileNameGenerator::new( + "test".to_string(), + None, + iceberg::spec::DataFileFormat::Parquet, + ); + let parquet_writer_builder = ParquetWriterBuilder::new( + parquet::file::properties::WriterProperties::default(), + table.metadata().current_schema().clone(), + ); + let rw = RollingFileWriterBuilder::new_with_default_file_size( + parquet_writer_builder, + table.file_io().clone(), + location_generator, + file_name_generator, + ); + let mut writer = DataFileWriterBuilder::new(rw).build(None).await.unwrap(); + writer.write(batch.clone()).await.unwrap(); + let data_files = writer.close().await.unwrap(); + + let tx = Transaction::new(&table); + let tx = tx + .fast_append() + .add_data_files(data_files) + .apply(tx) + .unwrap(); + tx.commit(&catalog).await.unwrap(); + + // Reload from catalog and read back. + let table_ident = TableIdent::new(ns.clone(), table_name.clone()); + let reloaded = catalog.load_table(&table_ident).await.unwrap(); + let batches: Vec = reloaded + .scan() + .select_all() + .build() + .expect("scan to be valid (snapshot exists, schema is OK)") + .to_arrow() + .await + .expect("scan tasks should be OK") + .try_collect() + .await + .expect("scan should complete successfully"); + + assert_eq!(batches.len(), 1); + assert_eq!( + batches[0], batch, + "read records should match records written earlier" + ); + + // Clean up. + catalog.purge_table(&table_ident).await.ok(); + catalog.drop_namespace(&ns).await.ok(); + } } From 5ea6f4cfed6477a1b423a7513de02a4ff686a2fa Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Wed, 15 Apr 2026 21:30:40 -0400 Subject: [PATCH 15/45] fix: build_fallback_field_id_map produces incorrect column indices for schemas with nested types (#2307) ## Which issue does this PR close? - Closes #2306. 
- Downstream issue: https://github.com/apache/datafusion-comet/issues/3860 ## What changes are included in this PR? `build_fallback_field_id_map` iterated over Parquet leaf columns instead of top-level fields when building the field ID to column index mapping for migrated files (no embedded field IDs). When nested types (struct, list, map) precede a primitive column, they expand into multiple leaves, causing the mapping to diverge from `add_fallback_field_ids_to_arrow_schema` which correctly assigns ordinal IDs to top-level Arrow fields. This made predicates on columns after nested types resolve to a leaf inside the group, crashing with "Leaf column `id` in predicates isn't a root column in Parquet schema". The fix iterates `root_schema().get_fields()` directly, assigning ordinal IDs only to top-level fields. For non-primitive fields (struct/list/map), it uses `get_column_root_idx` to advance past their leaf columns. This mirrors iceberg-java's `ParquetSchemaUtil.addFallbackIds()`, which iterates `fileSchema.getFields()` assigning ordinal IDs to top-level fields. Also renames "Leave column" to "Leaf column" in error messages. ## Are these changes tested? - An integration test (`test_predicate_on_migrated_file_with_nested_types`) writes a Parquet file without field IDs containing struct, list, and map columns before an `id` column, then reads with a predicate on `id`. This reproduces the exact crash before the fix. Test data is constructed with `serde_arrow` for readability. 
- [Apache DataFusion Comet](https://github.com/apache/datafusion-comet) used the repro test in [apache/datafusion-comet#3860](https://github.com/apache/datafusion-comet/issues/3860) and it passes with this change: https://github.com/apache/datafusion-comet/pull/3872 --------- Co-authored-by: blackmwk --- Cargo.lock | 46 ++++++ crates/iceberg/Cargo.toml | 1 + crates/iceberg/src/arrow/reader.rs | 238 ++++++++++++++++++++++++++++- 3 files changed, 277 insertions(+), 8 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 66390fd209..3f80d0013e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1127,6 +1127,20 @@ name = "bytemuck" version = "1.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" +dependencies = [ + "bytemuck_derive", +] + +[[package]] +name = "bytemuck_derive" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "byteorder" @@ -3056,6 +3070,7 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ + "bytemuck", "cfg-if", "crunchy", "num-traits", @@ -3361,6 +3376,7 @@ dependencies = [ "reqwest", "roaring", "serde", + "serde_arrow", "serde_bytes", "serde_derive", "serde_json", @@ -4135,6 +4151,21 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "marrow" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5240d6977234968ff9ad254bfa73aa397fb51e41dcb22b1eb85835e9295485b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "bytemuck", + "half", + "serde", +] + [[package]] name = "md-5" version = "0.10.6" @@ -5898,6 +5929,21 @@ dependencies = [ "serde", ] +[[package]] 
+name = "serde_arrow" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2784e59a0315568e850cb01ddadf458f8c09e28d8cfc4880c2cc08f5dc3444e0" +dependencies = [ + "arrow-array", + "arrow-schema", + "bytemuck", + "chrono", + "half", + "marrow", + "serde", +] + [[package]] name = "serde_bytes" version = "0.11.19" diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml index aa1d0cd4a5..18729176dc 100644 --- a/crates/iceberg/Cargo.toml +++ b/crates/iceberg/Cargo.toml @@ -91,6 +91,7 @@ rand = { workspace = true } regex = { workspace = true } tempfile = { workspace = true } minijinja = { workspace = true } +serde_arrow = { version = "0.14", features = ["arrow-58"] } [package.metadata.cargo-machete] # These dependencies are added to ensure minimal dependency version diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 700ba69262..9ccf1ac3d7 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -1100,7 +1100,7 @@ fn build_field_id_map(parquet_schema: &SchemaDescriptor) -> Result Result usize { + if ty.is_primitive() { + 1 + } else { + ty.get_fields().iter().map(|f| leaf_count(f)).sum() + } +} + +/// Builds a mapping from fallback field IDs to leaf column indices for Parquet files +/// without embedded field IDs. Returns entries only for primitive top-level fields. +/// +/// Must use top-level field positions (not leaf column positions) to stay consistent +/// with `add_fallback_field_ids_to_arrow_schema`, which assigns ordinal IDs to +/// top-level Arrow fields. Using leaf positions instead would produce wrong indices +/// when nested types (struct/list/map) expand into multiple leaf columns. +/// +/// Mirrors iceberg-java's ParquetSchemaUtil.addFallbackIds() which iterates +/// fileSchema.getFields() assigning ordinal IDs to top-level fields. 
fn build_fallback_field_id_map(parquet_schema: &SchemaDescriptor) -> HashMap { let mut column_map = HashMap::new(); + let mut leaf_idx = 0; - // 1-indexed to match iceberg-java's convention - for (idx, _field) in parquet_schema.columns().iter().enumerate() { - let field_id = (idx + 1) as i32; - column_map.insert(field_id, idx); + for (top_pos, field) in parquet_schema.root_schema().get_fields().iter().enumerate() { + let field_id = (top_pos + 1) as i32; + if field.is_primitive() { + column_map.insert(field_id, leaf_idx); + } + leaf_idx += leaf_count(field); } column_map @@ -1409,7 +1431,7 @@ impl PredicateConverter<'_> { return Err(Error::new( ErrorKind::DataInvalid, format!( - "Leave column `{}` in predicates isn't a root column in Parquet schema.", + "Leaf column `{}` in predicates isn't a root column in Parquet schema.", reference.field().name ), )); @@ -1423,7 +1445,7 @@ impl PredicateConverter<'_> { .ok_or(Error::new( ErrorKind::DataInvalid, format!( - "Leave column `{}` in predicates cannot be found in the required column indices.", + "Leaf column `{}` in predicates cannot be found in the required column indices.", reference.field().name ), ))?; @@ -4667,4 +4689,204 @@ message schema { assert_eq!(result[1], expected_1); assert_eq!(result[2], expected_2); } + + /// Regression for : + /// predicate on a column after nested types in a migrated file (no field IDs). + /// Schema has struct, list, and map columns before the predicate target (`id`), + /// exercising the fallback field ID mapping across all nested type variants. 
+ #[tokio::test] + async fn test_predicate_on_migrated_file_with_nested_types() { + use serde::{Deserialize, Serialize}; + use serde_arrow::schema::{SchemaLike, TracingOptions}; + + #[derive(Serialize, Deserialize)] + struct Person { + name: String, + age: i32, + } + + #[derive(Serialize, Deserialize)] + struct Row { + person: Person, + people: Vec, + props: std::collections::BTreeMap, + id: i32, + } + + let rows = vec![ + Row { + person: Person { + name: "Alice".into(), + age: 30, + }, + people: vec![Person { + name: "Alice".into(), + age: 30, + }], + props: [("k1".into(), "v1".into())].into(), + id: 1, + }, + Row { + person: Person { + name: "Bob".into(), + age: 25, + }, + people: vec![Person { + name: "Bob".into(), + age: 25, + }], + props: [("k2".into(), "v2".into())].into(), + id: 2, + }, + Row { + person: Person { + name: "Carol".into(), + age: 40, + }, + people: vec![Person { + name: "Carol".into(), + age: 40, + }], + props: [("k3".into(), "v3".into())].into(), + id: 3, + }, + ]; + + let tracing_options = TracingOptions::default() + .map_as_struct(false) + .strings_as_large_utf8(false) + .sequence_as_large_list(false); + let fields = Vec::::from_type::(tracing_options).unwrap(); + let arrow_schema = Arc::new(ArrowSchema::new(fields.clone())); + let batch = serde_arrow::to_record_batch(&fields, &rows).unwrap(); + + // Fallback field IDs: person=1, people=2, props=3, id=4 + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required( + 1, + "person", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::required( + 5, + "name", + Type::Primitive(PrimitiveType::String), + ) + .into(), + NestedField::required(6, "age", Type::Primitive(PrimitiveType::Int)) + .into(), + ])), + ) + .into(), + NestedField::required( + 2, + "people", + Type::List(crate::spec::ListType { + element_field: NestedField::required( + 7, + "element", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::required( 
+ 8, + "name", + Type::Primitive(PrimitiveType::String), + ) + .into(), + NestedField::required( + 9, + "age", + Type::Primitive(PrimitiveType::Int), + ) + .into(), + ])), + ) + .into(), + }), + ) + .into(), + NestedField::required( + 3, + "props", + Type::Map(crate::spec::MapType { + key_field: NestedField::required( + 10, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::required( + 11, + "value", + Type::Primitive(PrimitiveType::String), + ) + .into(), + }), + ) + .into(), + NestedField::required(4, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/1.parquet"); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let file = File::create(&file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema, Some(props)).unwrap(); + writer.write(&batch).expect("Writing batch"); + writer.close().unwrap(); + + let predicate = Reference::new("id").greater_than(Datum::int(1)); + + let reader = ArrowReaderBuilder::new(FileIO::new_with_fs()) + .with_row_group_filtering_enabled(true) + .with_row_selection_enabled(true) + .build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), + start: 0, + length: 0, + record_count: None, + data_file_path: file_path, + data_file_format: DataFileFormat::Parquet, + schema: iceberg_schema.clone(), + project_field_ids: vec![4], + predicate: Some(predicate.bind(iceberg_schema, true).unwrap()), + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + let ids: Vec = 
result + .iter() + .flat_map(|b| { + b.column(0) + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect(); + assert_eq!(ids, vec![2, 3]); + } } From a2f067d0225d66ab88b8a18ec25b8a0953e35082 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Wed, 15 Apr 2026 22:13:56 -0400 Subject: [PATCH 16/45] fix: incorrect Parquet INT96 timestamp values from ArrowReader (#2301) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Which issue does this PR close? - Closes #2299. ## What changes are included in this PR? - Add `coerce_int96_timestamps()` to patch the Arrow schema before reading, using arrow-rs's schema hint mechanism (`ArrowReaderOptions::with_schema`) to read INT96 columns at the resolution specified by the Iceberg table schema - `timestamp`/`timestamptz` → microsecond, `timestamp_ns`/`timestamptz_ns` → nanosecond, per the [Iceberg spec](https://iceberg.apache.org/spec/#primitive-types) - Falls back to microsecond when no field ID is available (matching Iceberg Java's `TimestampInt96Reader` behavior) - Applied after all three schema resolution branches (with field IDs, name mapping, positional fallback) so the fix covers both native and migrated tables - Handles INT96 inside nested types (structs, lists, maps) via `ArrowSchemaVisitor` traversal - Visitor and tests live in a standalone `arrow/int96.rs` module to keep `reader.rs` manageable - Made `visit_schema` in `arrow/schema.rs` `pub(crate)` so the coercion visitor can reuse the existing traversal ## Are these changes tested? 
- `test_read_int96_timestamps_with_field_ids` — files with embedded field IDs (branch 1) - `test_read_int96_timestamps_without_field_ids` — migrated files without field IDs (branches 2/3) - `test_read_int96_timestamps_in_struct` — INT96 inside a struct field - `test_read_int96_timestamps_in_list` — INT96 inside a list field (3-level Parquet LIST encoding) - `test_read_int96_timestamps_in_map` — INT96 as map values - All tests use dates outside the i64 nanosecond range (~1677-2262) to confirm the overflow is avoided - [Apache DataFusion Comet](https://github.com/apache/datafusion-comet) used the repro test in [apache/datafusion-comet#3856](https://github.com/apache/datafusion-comet/issues/3856) and it passes with this change: https://github.com/apache/datafusion-comet/pull/3857 --- crates/iceberg/src/arrow/int96.rs | 578 +++++++++++++++++++++++++++++ crates/iceberg/src/arrow/mod.rs | 1 + crates/iceberg/src/arrow/reader.rs | 575 ++++++++++++++++++++++++++++ crates/iceberg/src/arrow/schema.rs | 5 +- 4 files changed, 1158 insertions(+), 1 deletion(-) create mode 100644 crates/iceberg/src/arrow/int96.rs diff --git a/crates/iceberg/src/arrow/int96.rs b/crates/iceberg/src/arrow/int96.rs new file mode 100644 index 0000000000..63a7a30f1a --- /dev/null +++ b/crates/iceberg/src/arrow/int96.rs @@ -0,0 +1,578 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! INT96 timestamp coercion for Parquet files. + +use std::sync::Arc; + +use arrow_schema::{ + DataType, Field, Fields, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, +}; +use parquet::arrow::PARQUET_FIELD_ID_META_KEY; + +use crate::arrow::schema::{ArrowSchemaVisitor, DEFAULT_MAP_FIELD_NAME, visit_schema}; +use crate::error::Result; +use crate::spec::{PrimitiveType, Schema, Type}; +use crate::{Error, ErrorKind}; + +/// Coerce Arrow schema types for INT96 columns to match the Iceberg table schema. +/// +/// arrow-rs defaults INT96 to `Timestamp(Nanosecond)`, which overflows i64 for dates outside +/// ~1677-2262. We use arrow-rs's schema hint mechanism to read INT96 at the resolution +/// specified by the Iceberg schema (`timestamp` → microsecond, `timestamp_ns` → nanosecond). +/// +/// Iceberg Java handles this differently: it bypasses parquet-mr with a custom column reader +/// (`GenericParquetReaders.TimestampInt96Reader`). We achieve the same result via schema hints. +/// +/// References: +/// - Iceberg spec primitive types: +/// - arrow-rs schema hint support: +pub(crate) fn coerce_int96_timestamps( + arrow_schema: &ArrowSchemaRef, + iceberg_schema: &Schema, +) -> Option> { + let mut visitor = Int96CoercionVisitor::new(iceberg_schema); + let coerced = visit_schema(arrow_schema, &mut visitor).ok()?; + if visitor.changed { + Some(Arc::new(coerced)) + } else { + None + } +} + +/// Visitor that coerces `Timestamp(Nanosecond)` Arrow fields to the resolution +/// indicated by the Iceberg schema. 
+struct Int96CoercionVisitor<'a> { + iceberg_schema: &'a Schema, + // TODO(#2310): use FieldRef (Arc) once ArrowSchemaVisitor passes FieldRef. + field_stack: Vec, + changed: bool, +} + +impl<'a> Int96CoercionVisitor<'a> { + fn new(iceberg_schema: &'a Schema) -> Self { + Self { + iceberg_schema, + field_stack: Vec::new(), + changed: false, + } + } + + /// Determine the target TimeUnit for a Timestamp(Nanosecond) field based on the + /// Iceberg schema. Falls back to microsecond when field IDs are unavailable, + /// matching Iceberg Java behavior. + fn target_unit(&self, field: &Field) -> Option { + if !matches!( + field.data_type(), + DataType::Timestamp(TimeUnit::Nanosecond, _) + ) { + return None; + } + + let target = field + .metadata() + .get(PARQUET_FIELD_ID_META_KEY) + .and_then(|id_str| id_str.parse::().ok()) + .and_then(|field_id| self.iceberg_schema.field_by_id(field_id)) + .and_then(|f| match &*f.field_type { + Type::Primitive(PrimitiveType::Timestamp | PrimitiveType::Timestamptz) => { + Some(TimeUnit::Microsecond) + } + Type::Primitive(PrimitiveType::TimestampNs | PrimitiveType::TimestamptzNs) => { + Some(TimeUnit::Nanosecond) + } + _ => None, + }) + // Iceberg Java reads INT96 as microseconds by default + .unwrap_or(TimeUnit::Microsecond); + + if target == TimeUnit::Nanosecond { + None + } else { + Some(target) + } + } +} + +impl ArrowSchemaVisitor for Int96CoercionVisitor<'_> { + type T = Field; + type U = ArrowSchema; + + fn before_field(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn after_field(&mut self, _field: &Field) -> Result<()> { + self.field_stack.pop(); + Ok(()) + } + + fn before_list_element(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn after_list_element(&mut self, _field: &Field) -> Result<()> { + self.field_stack.pop(); + Ok(()) + } + + fn before_map_key(&mut self, field: &Field) -> Result<()> { + 
self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn after_map_key(&mut self, _field: &Field) -> Result<()> { + self.field_stack.pop(); + Ok(()) + } + + fn before_map_value(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn after_map_value(&mut self, _field: &Field) -> Result<()> { + self.field_stack.pop(); + Ok(()) + } + + fn schema(&mut self, schema: &ArrowSchema, values: Vec) -> Result { + Ok(ArrowSchema::new_with_metadata( + values, + schema.metadata().clone(), + )) + } + + fn r#struct(&mut self, _fields: &Fields, results: Vec) -> Result { + let field_info = self + .field_stack + .last() + .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in struct"))?; + Ok(Field::new( + field_info.name(), + DataType::Struct(Fields::from(results)), + field_info.is_nullable(), + ) + .with_metadata(field_info.metadata().clone())) + } + + fn list(&mut self, list: &DataType, value: Field) -> Result { + let field_info = self + .field_stack + .last() + .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in list"))?; + let list_type = match list { + DataType::List(_) => DataType::List(Arc::new(value)), + DataType::LargeList(_) => DataType::LargeList(Arc::new(value)), + DataType::FixedSizeList(_, size) => DataType::FixedSizeList(Arc::new(value), *size), + _ => { + return Err(Error::new( + ErrorKind::Unexpected, + format!("Expected list type, got {list}"), + )); + } + }; + Ok( + Field::new(field_info.name(), list_type, field_info.is_nullable()) + .with_metadata(field_info.metadata().clone()), + ) + } + + fn map(&mut self, map: &DataType, key_value: Field, value: Field) -> Result { + let field_info = self + .field_stack + .last() + .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in map"))?; + let sorted = match map { + DataType::Map(_, sorted) => *sorted, + _ => { + return Err(Error::new( + ErrorKind::Unexpected, + format!("Expected map type, got 
{map}"), + )); + } + }; + let struct_field = Field::new( + DEFAULT_MAP_FIELD_NAME, + DataType::Struct(Fields::from(vec![key_value, value])), + false, + ); + Ok(Field::new( + field_info.name(), + DataType::Map(Arc::new(struct_field), sorted), + field_info.is_nullable(), + ) + .with_metadata(field_info.metadata().clone())) + } + + fn primitive(&mut self, p: &DataType) -> Result { + let field_info = self.field_stack.last().ok_or_else(|| { + Error::new(ErrorKind::Unexpected, "Field stack underflow in primitive") + })?; + + if let Some(target_unit) = self.target_unit(field_info) { + let tz = match field_info.data_type() { + DataType::Timestamp(_, tz) => tz.clone(), + _ => None, + }; + self.changed = true; + Ok(Field::new( + field_info.name(), + DataType::Timestamp(target_unit, tz), + field_info.is_nullable(), + ) + .with_metadata(field_info.metadata().clone())) + } else { + Ok( + Field::new(field_info.name(), p.clone(), field_info.is_nullable()) + .with_metadata(field_info.metadata().clone()), + ) + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + + use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + use parquet::arrow::PARQUET_FIELD_ID_META_KEY; + + use super::coerce_int96_timestamps; + use crate::spec::{ListType, MapType, NestedField, PrimitiveType, Schema, StructType, Type}; + + fn iceberg_schema_with_timestamp() -> Schema { + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)).into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap() + } + + fn field_id_meta(id: i32) -> HashMap { + HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), id.to_string())]) + } + + #[test] + fn test_coerce_timestamp_ns_to_us() { + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + 
.with_metadata(field_id_meta(1)), + Field::new("id", DataType::Int32, false).with_metadata(field_id_meta(2)), + ])); + let iceberg = iceberg_schema_with_timestamp(); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + // Non-timestamp field unchanged + assert_eq!(coerced.field(1).data_type(), &DataType::Int32); + } + + #[test] + fn test_coerce_timestamptz_ns_to_us() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamptz)).into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + true, + ) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())) + ); + } + + #[test] + fn test_no_coercion_when_iceberg_is_timestamp_ns() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::TimestampNs)).into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(1)), + ])); + + assert!(coerce_int96_timestamps(&arrow_schema, &iceberg).is_none()); + } + + #[test] + fn test_no_coercion_when_iceberg_is_timestamptz_ns() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::TimestamptzNs)) + .into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, 
Some("UTC".into())), + true, + ) + .with_metadata(field_id_meta(1)), + ])); + + assert!(coerce_int96_timestamps(&arrow_schema, &iceberg).is_none()); + } + + #[test] + fn test_no_coercion_when_already_microsecond() { + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Microsecond, None), true) + .with_metadata(field_id_meta(1)), + Field::new("id", DataType::Int32, false).with_metadata(field_id_meta(2)), + ])); + let iceberg = iceberg_schema_with_timestamp(); + + assert!(coerce_int96_timestamps(&arrow_schema, &iceberg).is_none()); + } + + // Without field IDs, the visitor can't look up the Iceberg type and falls back + // to microsecond to match Iceberg Java behavior. + #[test] + fn test_defaults_to_us_without_field_ids() { + let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )])); + let iceberg = iceberg_schema_with_timestamp(); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + // Field ID exists but points to a non-timestamp Iceberg type. The field_by_id + // lookup succeeds but the match arm returns None, so unwrap_or falls back to + // microsecond. 
+ #[test] + fn test_defaults_to_us_when_iceberg_type_is_not_timestamp() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + #[test] + fn test_coerce_preserves_field_metadata() { + let mut meta = field_id_meta(1); + meta.insert("custom_key".to_string(), "custom_value".to_string()); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(meta.clone()), + ])); + let iceberg = iceberg_schema_with_timestamp(); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!(coerced.field(0).metadata(), &meta); + } + + #[test] + fn test_coerce_timestamp_in_struct() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required( + 1, + "data", + Type::Struct(StructType::new(vec![ + NestedField::optional(2, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "data", + DataType::Struct( + vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(2)), + ] + .into(), + ), + false, + ) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + let inner = match coerced.field(0).data_type() { + DataType::Struct(fields) => fields, + other => panic!("Expected Struct, got {other}"), + }; + assert_eq!( 
+ inner[0].data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + #[test] + fn test_coerce_timestamp_in_list() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "timestamps", + Type::List(ListType { + element_field: NestedField::optional( + 2, + "element", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(); + + let element_field = Field::new( + "element", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ) + .with_metadata(field_id_meta(2)); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("timestamps", DataType::List(Arc::new(element_field)), true) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + let element_dt = match coerced.field(0).data_type() { + DataType::List(f) => f.data_type(), + other => panic!("Expected List, got {other}"), + }; + assert_eq!( + element_dt, + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + #[test] + fn test_coerce_timestamp_in_map_value() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "ts_map", + Type::Map(MapType { + key_field: NestedField::required( + 2, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::optional( + 3, + "value", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(); + + let key_field = Field::new("key", DataType::Utf8, false).with_metadata(field_id_meta(2)); + let value_field = Field::new( + "value", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ) + .with_metadata(field_id_meta(3)); + let entries_field = Field::new( + "key_value", + DataType::Struct(vec![key_field, value_field].into()), + false, + ); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "ts_map", + 
DataType::Map(Arc::new(entries_field), false), + true, + ) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + let value_dt = match coerced.field(0).data_type() { + DataType::Map(entries, _) => match entries.data_type() { + DataType::Struct(fields) => fields[1].data_type().clone(), + other => panic!("Expected Struct inside Map, got {other}"), + }, + other => panic!("Expected Map, got {other}"), + }; + assert_eq!(value_dt, DataType::Timestamp(TimeUnit::Microsecond, None)); + } +} diff --git a/crates/iceberg/src/arrow/mod.rs b/crates/iceberg/src/arrow/mod.rs index c091c45177..7823320452 100644 --- a/crates/iceberg/src/arrow/mod.rs +++ b/crates/iceberg/src/arrow/mod.rs @@ -27,6 +27,7 @@ pub(crate) mod caching_delete_file_loader; pub mod delete_file_loader; pub(crate) mod delete_filter; +mod int96; mod reader; /// RecordBatch projection utilities pub mod record_batch_projector; diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 9ccf1ac3d7..488f41cf20 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -46,6 +46,7 @@ use parquet::schema::types::{SchemaDescriptor, Type as ParquetType}; use typed_builder::TypedBuilder; use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; +use crate::arrow::int96::coerce_int96_timestamps; use crate::arrow::record_batch_transformer::RecordBatchTransformerBuilder; use crate::arrow::{arrow_schema_to_schema, get_arrow_datum}; use crate::delete_vector::DeleteVector; @@ -386,6 +387,27 @@ impl ArrowReader { arrow_metadata }; + // Coerce INT96 timestamp columns to the resolution specified by the Iceberg schema. + // This must happen before building the stream reader to avoid i64 overflow in arrow-rs. 
+ let arrow_metadata = if let Some(coerced_schema) = + coerce_int96_timestamps(arrow_metadata.schema(), &task.schema) + { + let options = ArrowReaderOptions::new().with_schema(Arc::clone(&coerced_schema)); + ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options).map_err( + |e| { + Error::new( + ErrorKind::Unexpected, + format!( + "Failed to create ArrowReaderMetadata with INT96-coerced schema: {coerced_schema}" + ), + ) + .with_source(e) + }, + )? + } else { + arrow_metadata + }; + // Build the stream reader, reusing the already-opened file reader let mut record_batch_stream_builder = ParquetRecordBatchStreamBuilder::new_with_metadata(parquet_file_reader, arrow_metadata); @@ -4889,4 +4911,557 @@ message schema { .collect(); assert_eq!(ids, vec![2, 3]); } + + // INT96 encoding: [nanos_low_u32, nanos_high_u32, julian_day_u32] + // Julian day 2_440_588 = Unix epoch (1970-01-01) + const UNIX_EPOCH_JULIAN: i64 = 2_440_588; + const MICROS_PER_DAY: i64 = 86_400_000_000; + // Noon on 3333-01-01 (Julian day 2_953_529) — outside the i64 nanosecond range (~1677-2262). 
+ const INT96_TEST_NANOS_WITHIN_DAY: u64 = 43_200_000_000_000; + const INT96_TEST_JULIAN_DAY: u32 = 2_953_529; + + fn make_int96_test_value() -> (parquet::data_type::Int96, i64) { + let mut val = parquet::data_type::Int96::new(); + val.set_data( + (INT96_TEST_NANOS_WITHIN_DAY & 0xFFFFFFFF) as u32, + (INT96_TEST_NANOS_WITHIN_DAY >> 32) as u32, + INT96_TEST_JULIAN_DAY, + ); + let expected_micros = (INT96_TEST_JULIAN_DAY as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (INT96_TEST_NANOS_WITHIN_DAY / 1_000) as i64; + (val, expected_micros) + } + + async fn read_int96_batches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + ) -> Vec { + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let file_size = std::fs::metadata(file_path).unwrap().len(); + let task = FileScanTask { + file_size_in_bytes: file_size, + start: 0, + length: file_size, + record_count: None, + data_file_path: file_path.to_string(), + data_file_format: DataFileFormat::Parquet, + schema, + project_field_ids, + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; + reader.read(tasks).unwrap().try_collect().await.unwrap() + } + + // ArrowWriter cannot write INT96, so we use SerializedFileWriter directly. 
+ fn write_int96_parquet_file( + table_location: &str, + filename: &str, + with_field_ids: bool, + ) -> (String, Vec) { + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::{Int32Type, Int96, Int96Type}; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let file_path = format!("{table_location}/{filename}"); + + let mut ts_builder = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL); + let mut id_builder = SchemaType::primitive_type_builder("id", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED); + + if with_field_ids { + ts_builder = ts_builder.with_id(Some(1)); + id_builder = id_builder.with_id(Some(2)); + } + + let schema = SchemaType::group_type_builder("schema") + .with_fields(vec![ + Arc::new(ts_builder.build().unwrap()), + Arc::new(id_builder.build().unwrap()), + ]) + .build() + .unwrap(); + + // Dates outside the i64 nanosecond range (~1677-2262) overflow without coercion. 
+ const NOON_NANOS: u64 = INT96_TEST_NANOS_WITHIN_DAY; + const JULIAN_3333: u32 = INT96_TEST_JULIAN_DAY; + const JULIAN_2100: u32 = 2_488_070; + + let test_data: Vec<(u32, u32, u32, i64)> = vec![ + // 3333-01-01 00:00:00 + ( + 0, + 0, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), + // 3333-01-01 12:00:00 + ( + (NOON_NANOS & 0xFFFFFFFF) as u32, + (NOON_NANOS >> 32) as u32, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (NOON_NANOS / 1_000) as i64, + ), + // 2100-01-01 00:00:00 + ( + 0, + 0, + JULIAN_2100, + (JULIAN_2100 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), + ]; + + let int96_values: Vec = test_data + .iter() + .map(|(lo, hi, day, _)| { + let mut v = Int96::new(); + v.set_data(*lo, *hi, *day); + v + }) + .collect(); + + let id_values: Vec = (0..test_data.len() as i32).collect(); + let expected_micros: Vec = test_data.iter().map(|(_, _, _, m)| *m).collect(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(schema), Default::default()).unwrap(); + + let mut row_group = writer.next_row_group().unwrap(); + { + // def=1: ts is OPTIONAL and present. No repetition levels (top-level columns). 
+ let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&int96_values, Some(&vec![1; test_data.len()]), None) + .unwrap(); + col.close().unwrap(); + } + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&id_values, None, None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + (file_path, expected_micros) + } + + async fn assert_int96_read_matches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + expected_micros: &[i64], + ) { + use arrow_array::TimestampMicrosecondArray; + + let batches = read_int96_batches(file_path, schema, project_field_ids).await; + + assert_eq!(batches.len(), 1); + let ts_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray"); + + for (i, expected) in expected_micros.iter().enumerate() { + assert_eq!( + ts_array.value(i), + *expected, + "Row {i}: got {}, expected {expected}", + ts_array.value(i) + ); + } + } + + #[tokio::test] + async fn test_read_int96_timestamps_with_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let (file_path, expected_micros) = + write_int96_parquet_file(&table_location, "with_ids.parquet", true); + + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + } + + #[tokio::test] + async fn test_read_int96_timestamps_without_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, 
"id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let (file_path, expected_micros) = + write_int96_parquet_file(&table_location, "no_ids.parquet", false); + + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_struct() { + use arrow_array::{StructArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::Int96Type; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/struct_int96.parquet"); + + let ts_type = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let struct_type = SchemaType::group_type_builder("data") + .with_repetition(Repetition::REQUIRED) + .with_id(Some(1)) + .with_fields(vec![Arc::new(ts_type)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(struct_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // def=1: struct is REQUIRED so no level, ts is OPTIONAL and present (1). + // No repetition levels needed (no repeated groups). 
+ let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[1]), None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required( + 1, + "data", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::optional( + 2, + "ts", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let struct_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected StructArray"); + let ts_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside struct"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in struct: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_list() { + use arrow_array::{ListArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::Int96Type; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/list_int96.parquet"); + + // 3-level LIST encoding: + // optional group timestamps (LIST) { + // repeated group list { + // optional int96 element; + // } + // } + let element_type = SchemaType::primitive_type_builder("element", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let list_group = 
SchemaType::group_type_builder("list") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(element_type)]) + .build() + .unwrap(); + + let list_type = SchemaType::group_type_builder("timestamps") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::List)) + .with_fields(vec![Arc::new(list_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(list_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a list containing one INT96 element. + // def=3: list present (1) + repeated group (2) + element present (3) + // rep=0: start of a new list + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "timestamps", + Type::List(crate::spec::ListType { + element_field: NestedField::optional( + 2, + "element", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let list_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected ListArray"); + let ts_array = list_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside list"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in list: 
got {}, expected {expected_micros}", + ts_array.value(0) + ); + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_map() { + use arrow_array::{MapArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::{ByteArrayType, Int96Type}; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/map_int96.parquet"); + + // MAP encoding: + // optional group ts_map (MAP) { + // repeated group key_value { + // required binary key (UTF8); + // optional int96 value; + // } + // } + let key_type = SchemaType::primitive_type_builder("key", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(Some(parquet::basic::LogicalType::String)) + .with_id(Some(2)) + .build() + .unwrap(); + + let value_type = SchemaType::primitive_type_builder("value", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(3)) + .build() + .unwrap(); + + let key_value_group = SchemaType::group_type_builder("key_value") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(key_type), Arc::new(value_type)]) + .build() + .unwrap(); + + let map_type = SchemaType::group_type_builder("ts_map") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::Map)) + .with_fields(vec![Arc::new(key_value_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(map_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a map 
containing one key-value pair. + // rep=0 for both columns: start of a new map. + // key def=2: map present (1) + key_value entry present (2), key is REQUIRED. + // value def=3: map present (1) + key_value entry present (2) + value present (3). + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch( + &[parquet::data_type::ByteArray::from("event_time")], + Some(&[2]), + Some(&[0]), + ) + .unwrap(); + col.close().unwrap(); + } + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "ts_map", + Type::Map(crate::spec::MapType { + key_field: NestedField::required( + 2, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::optional( + 3, + "value", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let map_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected MapArray"); + let ts_array = map_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray as map values"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in map: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } } diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index bd9e249f48..f96c29ab4a 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -199,7 +199,10 @@ fn visit_struct(fields: &Fields, visitor: &mut V) -> Resu } /// Visit 
schema in post order. -fn visit_schema(schema: &ArrowSchema, visitor: &mut V) -> Result { +pub(crate) fn visit_schema( + schema: &ArrowSchema, + visitor: &mut V, +) -> Result { let mut results = Vec::with_capacity(schema.fields().len()); for field in schema.fields() { visitor.before_field(field)?; From 4e8cf7f2b10206fca59cbc87b710f66d0f23fb21 Mon Sep 17 00:00:00 2001 From: Xander Date: Thu, 16 Apr 2026 20:00:13 +0100 Subject: [PATCH 17/45] feat(encryption) [2/N] Support encryption: Add streaming encryption/decryption (#2286) --- .typos.toml | 4 + .../iceberg/src/encryption/file_decryptor.rs | 156 ++ .../iceberg/src/encryption/file_encryptor.rs | 138 ++ crates/iceberg/src/encryption/mod.rs | 6 + crates/iceberg/src/encryption/stream.rs | 1249 +++++++++++++++++ 5 files changed, 1553 insertions(+) create mode 100644 crates/iceberg/src/encryption/file_decryptor.rs create mode 100644 crates/iceberg/src/encryption/file_encryptor.rs create mode 100644 crates/iceberg/src/encryption/stream.rs diff --git a/.typos.toml b/.typos.toml index 407ce8168c..36996a553a 100644 --- a/.typos.toml +++ b/.typos.toml @@ -18,5 +18,9 @@ [type.rust] extend-ignore-identifiers-re = ["^bimap$"] +[default.extend-words] +ags = "ags" +AGS = "AGS" + [files] extend-exclude = ["**/testdata", "CHANGELOG.md"] diff --git a/crates/iceberg/src/encryption/file_decryptor.rs b/crates/iceberg/src/encryption/file_decryptor.rs new file mode 100644 index 0000000000..e44c0e1d78 --- /dev/null +++ b/crates/iceberg/src/encryption/file_decryptor.rs @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! File-level decryption helper for AGS1 stream-encrypted files. + +use std::fmt; +use std::sync::Arc; + +use super::crypto::{AesGcmCipher, SecureKey}; +use super::stream::AesGcmFileRead; +use crate::Result; +use crate::io::FileRead; + +/// Holds the decryption material for a single encrypted file. +/// +/// Created from a plaintext DEK and AAD prefix, then used to wrap +/// an encrypted file reader for transparent decryption on read. +pub struct AesGcmFileDecryptor { + cipher: Arc, + aad_prefix: Box<[u8]>, +} + +impl AesGcmFileDecryptor { + /// Creates a new `AesGcmFileDecryptor` from a plaintext DEK and AAD prefix. + pub fn new(dek: &[u8], aad_prefix: impl Into>) -> Result { + let key = SecureKey::new(dek)?; + let cipher = Arc::new(AesGcmCipher::new(key)); + Ok(Self { + cipher, + aad_prefix: aad_prefix.into(), + }) + } + + /// Wraps a raw encrypted-file reader in a decrypting [`AesGcmFileRead`]. + pub fn wrap_reader( + &self, + reader: Box, + encrypted_file_length: u64, + ) -> Result> { + let decrypting = AesGcmFileRead::new( + reader, + Arc::clone(&self.cipher), + self.aad_prefix.clone(), + encrypted_file_length, + )?; + Ok(Box::new(decrypting)) + } + + /// Calculates the plaintext length from an encrypted file's total length. 
+ pub fn plaintext_length(&self, encrypted_file_length: u64) -> Result { + AesGcmFileRead::calculate_plaintext_length(encrypted_file_length) + } +} + +impl fmt::Debug for AesGcmFileDecryptor { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("AesGcmFileDecryptor") + .field("aad_prefix_len", &self.aad_prefix.len()) + .finish_non_exhaustive() + } +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use bytes::Bytes; + + use super::*; + use crate::encryption::AesGcmFileEncryptor; + use crate::io::FileWrite; + + struct MemoryFileRead(Bytes); + + #[async_trait::async_trait] + impl FileRead for MemoryFileRead { + async fn read(&self, range: Range) -> Result { + Ok(self.0.slice(range.start as usize..range.end as usize)) + } + } + + struct MemoryFileWrite { + buffer: std::sync::Arc>>, + } + + #[async_trait::async_trait] + impl FileWrite for MemoryFileWrite { + async fn write(&mut self, bs: Bytes) -> Result<()> { + self.buffer.lock().unwrap().extend_from_slice(&bs); + Ok(()) + } + + async fn close(&mut self) -> Result<()> { + Ok(()) + } + } + + #[tokio::test] + async fn test_wrap_reader_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + let plaintext = b"Hello from file decryptor!"; + + // Encrypt via the encryptor wrapper + let encryptor = AesGcmFileEncryptor::new(key.as_slice(), aad_prefix.as_slice()).unwrap(); + let buffer = std::sync::Arc::new(std::sync::Mutex::new(Vec::new())); + let mut writer = encryptor.wrap_writer(Box::new(MemoryFileWrite { + buffer: buffer.clone(), + })); + writer.write(Bytes::from(plaintext.to_vec())).await.unwrap(); + writer.close().await.unwrap(); + let encrypted = buffer.lock().unwrap().clone(); + let encrypted_len = encrypted.len() as u64; + + // Decrypt via the decryptor wrapper + let decryptor = AesGcmFileDecryptor::new(key.as_slice(), aad_prefix.as_slice()).unwrap(); + let reader = decryptor + .wrap_reader( + Box::new(MemoryFileRead(Bytes::from(encrypted))), + 
encrypted_len, + ) + .unwrap(); + + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], plaintext); + } + + #[tokio::test] + async fn test_invalid_key_length() { + let result = AesGcmFileDecryptor::new(b"too-short", b"aad".as_slice()); + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_plaintext_length() { + let decryptor = AesGcmFileDecryptor::new(b"0123456789abcdef", b"aad".as_slice()).unwrap(); + // header(8) + nonce(12) + 10 bytes ciphertext + tag(16) = 46 + let encrypted_len = 8 + 12 + 10 + 16; + let plain_len = decryptor.plaintext_length(encrypted_len).unwrap(); + assert_eq!(plain_len, 10); + } +} diff --git a/crates/iceberg/src/encryption/file_encryptor.rs b/crates/iceberg/src/encryption/file_encryptor.rs new file mode 100644 index 0000000000..773438ad80 --- /dev/null +++ b/crates/iceberg/src/encryption/file_encryptor.rs @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! File-level encryption helper for AGS1 stream-encrypted files. 
+ +use std::fmt; +use std::sync::Arc; + +use super::crypto::{AesGcmCipher, SecureKey}; +use super::stream::AesGcmFileWrite; +use crate::Result; +use crate::io::FileWrite; + +/// Holds the encryption material for a single encrypted file. +/// +/// This is the write-side counterpart to +/// [`AesGcmFileDecryptor`](super::AesGcmFileDecryptor). Created from +/// a plaintext DEK and AAD prefix, then used to wrap an output writer +/// for transparent encryption on write. +pub struct AesGcmFileEncryptor { + cipher: Arc, + aad_prefix: Box<[u8]>, +} + +impl AesGcmFileEncryptor { + /// Creates a new `AesGcmFileEncryptor` from a plaintext DEK and AAD prefix. + pub fn new(dek: &[u8], aad_prefix: impl Into>) -> Result { + let key = SecureKey::new(dek)?; + let cipher = Arc::new(AesGcmCipher::new(key)); + Ok(Self { + cipher, + aad_prefix: aad_prefix.into(), + }) + } + + /// Wraps a raw output writer in an encrypting [`AesGcmFileWrite`]. + pub fn wrap_writer(&self, writer: Box) -> Box { + Box::new(AesGcmFileWrite::new( + writer, + Arc::clone(&self.cipher), + self.aad_prefix.clone(), + )) + } +} + +impl fmt::Debug for AesGcmFileEncryptor { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("AesGcmFileEncryptor") + .field("aad_prefix_len", &self.aad_prefix.len()) + .finish_non_exhaustive() + } +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use bytes::Bytes; + + use super::*; + use crate::encryption::AesGcmFileDecryptor; + use crate::io::FileRead; + + struct MemoryFileRead(Bytes); + + #[async_trait::async_trait] + impl FileRead for MemoryFileRead { + async fn read(&self, range: Range) -> Result { + Ok(self.0.slice(range.start as usize..range.end as usize)) + } + } + + struct MemoryFileWrite { + buffer: std::sync::Arc>>, + } + + #[async_trait::async_trait] + impl FileWrite for MemoryFileWrite { + async fn write(&mut self, bs: Bytes) -> Result<()> { + self.buffer.lock().unwrap().extend_from_slice(&bs); + Ok(()) + } + + async fn close(&mut self) -> 
Result<()> { + Ok(()) + } + } + + #[tokio::test] + async fn test_wrap_writer_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + let plaintext = b"Hello from file encryptor!"; + + // Encrypt via the encryptor wrapper + let encryptor = AesGcmFileEncryptor::new(key.as_slice(), aad_prefix.as_slice()).unwrap(); + let buffer = std::sync::Arc::new(std::sync::Mutex::new(Vec::new())); + let mut writer = encryptor.wrap_writer(Box::new(MemoryFileWrite { + buffer: buffer.clone(), + })); + writer.write(Bytes::from(plaintext.to_vec())).await.unwrap(); + writer.close().await.unwrap(); + let encrypted = buffer.lock().unwrap().clone(); + let encrypted_len = encrypted.len() as u64; + + // Decrypt via the decryptor wrapper + let decryptor = AesGcmFileDecryptor::new(key.as_slice(), aad_prefix.as_slice()).unwrap(); + let reader = decryptor + .wrap_reader( + Box::new(MemoryFileRead(Bytes::from(encrypted))), + encrypted_len, + ) + .unwrap(); + + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], plaintext); + } + + #[tokio::test] + async fn test_invalid_key_length() { + let result = AesGcmFileEncryptor::new(b"bad-key", b"aad".as_slice()); + assert!(result.is_err()); + } +} diff --git a/crates/iceberg/src/encryption/mod.rs b/crates/iceberg/src/encryption/mod.rs index 097f4f24e3..9888a153c7 100644 --- a/crates/iceberg/src/encryption/mod.rs +++ b/crates/iceberg/src/encryption/mod.rs @@ -21,5 +21,11 @@ //! and decrypting data in Iceberg tables. 
mod crypto; +mod file_decryptor; +mod file_encryptor; +mod stream; pub use crypto::{AesGcmCipher, AesKeySize, SecureKey}; +pub use file_decryptor::AesGcmFileDecryptor; +pub use file_encryptor::AesGcmFileEncryptor; +pub use stream::{AesGcmFileRead, AesGcmFileWrite}; diff --git a/crates/iceberg/src/encryption/stream.rs b/crates/iceberg/src/encryption/stream.rs new file mode 100644 index 0000000000..130578f2b1 --- /dev/null +++ b/crates/iceberg/src/encryption/stream.rs @@ -0,0 +1,1249 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! AGS1 stream encryption/decryption for Iceberg. +//! +//! Implements the block-based AES-GCM stream format used by Iceberg for +//! encrypting manifest lists and manifest files. The format is +//! byte-compatible with Java's `AesGcmInputStream` / `AesGcmOutputStream`. +//! +//! # AGS1 File Format +//! +//! ```text +//! ┌─────────────────────────────────────────────┐ +//! │ Header (8 bytes) │ +//! │ Magic: "AGS1" (4 bytes, ASCII) │ +//! │ Plain block size: u32 LE (4 bytes) │ +//! │ Default: 1,048,576 (1 MiB) │ +//! ├─────────────────────────────────────────────┤ +//! │ Block 0 │ +//! │ Nonce (12 bytes) │ +//! │ Ciphertext (up to plain_block_size bytes) │ +//! 
│ GCM Tag (16 bytes) │ +//! ├─────────────────────────────────────────────┤ +//! │ Block 1..N (same structure) │ +//! ├─────────────────────────────────────────────┤ +//! │ Final block (may be shorter) │ +//! └─────────────────────────────────────────────┘ +//! ``` +//! +//! Each block's AAD is: `aad_prefix || block_index (4 bytes, LE)`. + +use std::ops::Range; +use std::sync::Arc; + +use bytes::{Bytes, BytesMut}; + +use super::AesGcmCipher; +use crate::io::{FileRead, FileWrite}; +use crate::{Error, ErrorKind, Result}; + +/// Default plaintext block size (1 MiB), matching Java's `Ciphers.PLAIN_BLOCK_SIZE`. +pub const PLAIN_BLOCK_SIZE: u32 = 1024 * 1024; + +/// AES-GCM nonce length in bytes. +pub const NONCE_LENGTH: u32 = 12; + +/// AES-GCM authentication tag length in bytes. +pub const GCM_TAG_LENGTH: u32 = 16; + +/// Cipher block size = plaintext block size + nonce + GCM tag. +pub const CIPHER_BLOCK_SIZE: u32 = PLAIN_BLOCK_SIZE + NONCE_LENGTH + GCM_TAG_LENGTH; + +/// AGS1 stream magic bytes. +pub const GCM_STREAM_MAGIC: [u8; 4] = *b"AGS1"; + +/// AGS1 stream header length (4-byte magic + 4-byte block size). +pub const GCM_STREAM_HEADER_LENGTH: u32 = 8; + +/// Minimum valid AGS1 stream length (header + one empty block). +#[cfg(test)] +pub const MIN_STREAM_LENGTH: u32 = GCM_STREAM_HEADER_LENGTH + NONCE_LENGTH + GCM_TAG_LENGTH; + +/// Constructs the per-block AAD for AGS1 stream encryption. +/// +/// Format: `aad_prefix || block_index (4 bytes, little-endian)` +/// +/// This matches Java's `Ciphers.streamBlockAAD()`. +pub(crate) fn stream_block_aad(aad_prefix: &[u8], block_index: u32) -> Vec { + let index_bytes = block_index.to_le_bytes(); + if aad_prefix.is_empty() { + index_bytes.to_vec() + } else { + let mut aad = Vec::with_capacity(aad_prefix.len() + 4); + aad.extend_from_slice(aad_prefix); + aad.extend_from_slice(&index_bytes); + aad + } +} + +/// Transparent decryption of AGS1 stream-encrypted files. 
+/// +/// Implements the [`FileRead`] trait, providing random-access reads over +/// encrypted data. Each `read()` call determines which encrypted blocks +/// overlap the requested plaintext range, reads and decrypts them, then +/// returns the requested plaintext bytes. +/// +/// # Usage +/// +/// ```ignore +/// // (ignored: requires async runtime and concrete FileRead/FileWrite impls) +/// let reader = AesGcmFileRead::new( +/// inner_reader, // Box for the encrypted file +/// cipher, // Arc with the DEK +/// aad_prefix.into(), +/// encrypted_file_length, +/// )?; +/// +/// // Read plaintext bytes transparently +/// let plaintext = reader.read(0..1024).await?; +/// ``` +pub struct AesGcmFileRead { + /// The underlying encrypted file reader. + inner: Box, + /// The AES-GCM cipher holding the DEK. + cipher: Arc, + /// AAD prefix from the key metadata. + aad_prefix: Box<[u8]>, + /// Total plaintext stream size in bytes. + plain_stream_size: u64, + /// Total number of encrypted blocks. + num_blocks: u64, + /// Size of the last cipher block (may be smaller than `CIPHER_BLOCK_SIZE`). + last_cipher_block_size: u32, +} + +impl AesGcmFileRead { + /// Creates a new `AesGcmFileRead` for decrypting an AGS1 stream. + /// + /// Computes the plaintext size and block layout from the encrypted file + /// length. No I/O is performed and the 8-byte header is never read, so its + /// magic and block size are not checked; corrupt blocks fail GCM authentication on read.
+ /// + /// # Arguments + /// + /// * `inner` - Reader for the underlying encrypted file + /// * `cipher` - AES-GCM cipher initialized with the file's DEK + /// * `aad_prefix` - AAD prefix from the file's `StandardKeyMetadata` + /// * `encrypted_file_length` - Total byte length of the encrypted file + pub fn new( + inner: Box, + cipher: Arc, + aad_prefix: Box<[u8]>, + encrypted_file_length: u64, + ) -> Result { + let plain_stream_size = Self::calculate_plaintext_length(encrypted_file_length)?; + let stream_length = encrypted_file_length - GCM_STREAM_HEADER_LENGTH as u64; + + if stream_length == 0 { + return Ok(Self { + inner, + cipher, + aad_prefix, + plain_stream_size: 0, + num_blocks: 0, + last_cipher_block_size: 0, + }); + } + + let num_full_blocks = stream_length / CIPHER_BLOCK_SIZE as u64; + let cipher_bytes_in_last_block = (stream_length % CIPHER_BLOCK_SIZE as u64) as u32; + let full_blocks_only = cipher_bytes_in_last_block == 0; + + let num_blocks = if full_blocks_only { + num_full_blocks + } else { + num_full_blocks + 1 + }; + + if num_blocks > u32::MAX as u64 { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "AGS1 format supports at most {} blocks (~4 PiB per file), but file requires {num_blocks} blocks", + u32::MAX + ), + )); + } + + let last_cipher_block_size = if full_blocks_only { + CIPHER_BLOCK_SIZE + } else { + cipher_bytes_in_last_block + }; + + Ok(Self { + inner, + cipher, + aad_prefix, + plain_stream_size, + num_blocks, + last_cipher_block_size, + }) + } + + /// Returns the plaintext stream size in bytes. + pub fn plaintext_length(&self) -> u64 { + self.plain_stream_size + } + + /// Calculates the plaintext length from an encrypted file's total length. + /// + /// This is a static calculation matching Java's + /// `AesGcmInputStream.calculatePlaintextLength()`.
+ pub fn calculate_plaintext_length(encrypted_file_length: u64) -> Result { + if encrypted_file_length < GCM_STREAM_HEADER_LENGTH as u64 { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Encrypted file too short: {encrypted_file_length} bytes (minimum {GCM_STREAM_HEADER_LENGTH})" + ), + )); + } + + let stream_length = encrypted_file_length - GCM_STREAM_HEADER_LENGTH as u64; + + if stream_length == 0 { + return Ok(0); + } + + let num_full_blocks = stream_length / CIPHER_BLOCK_SIZE as u64; + let cipher_bytes_in_last_block = stream_length % CIPHER_BLOCK_SIZE as u64; + let full_blocks_only = cipher_bytes_in_last_block == 0; + + let plain_bytes_in_last_block = if full_blocks_only { + 0 + } else { + if cipher_bytes_in_last_block < (NONCE_LENGTH + GCM_TAG_LENGTH) as u64 { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Truncated encrypted file: last block is {} bytes (minimum {})", + cipher_bytes_in_last_block, + NONCE_LENGTH + GCM_TAG_LENGTH + ), + )); + } + cipher_bytes_in_last_block - NONCE_LENGTH as u64 - GCM_TAG_LENGTH as u64 + }; + + Ok(num_full_blocks * PLAIN_BLOCK_SIZE as u64 + plain_bytes_in_last_block) + } + + /// Returns the encrypted byte offset for a given block index. + fn encrypted_block_offset(block_index: u64) -> u64 { + block_index * CIPHER_BLOCK_SIZE as u64 + GCM_STREAM_HEADER_LENGTH as u64 + } + + /// Returns the cipher block size for a given block index. + fn cipher_block_size(&self, block_index: u64) -> u32 { + if block_index == self.num_blocks - 1 { + self.last_cipher_block_size + } else { + CIPHER_BLOCK_SIZE + } + } +} + +#[async_trait::async_trait] +impl FileRead for AesGcmFileRead { + /// Reads and decrypts a plaintext byte range from the encrypted AGS1 stream. + /// + /// The caller specifies a range in **plaintext** coordinates (e.g. "bytes 0..1024 + /// of the original file"). This method translates that into the encrypted file + /// layout and performs the following steps: + /// + /// 1. 
**Map to blocks** — divides the plaintext range by `PLAIN_BLOCK_SIZE` to + /// find which encrypted blocks (`first_block..=last_block`) contain the + /// requested data. + /// + /// 2. **Single I/O read** — calculates the contiguous byte range in the + /// encrypted file that covers all needed blocks (including the 8-byte AGS1 + /// header offset, 12-byte nonces, and 16-byte GCM tags) and fetches them in + /// one call to the inner `FileRead`. + /// + /// 3. **Decrypt per block** — iterates over each cipher block in the response, + /// decrypts it with AES-GCM using the per-block AAD (`aad_prefix || block_index`), + /// and slices out only the plaintext bytes that overlap the requested range. + /// + /// 4. **Assemble result** — concatenates the slices into a single `Bytes` buffer + /// matching exactly `range.end - range.start` bytes. + /// + /// Because each block is independently encrypted with its own nonce and AAD, + /// arbitrary random-access reads are supported without decrypting the entire + /// file. GCM authentication is verified per-block, so any tampering is detected + /// at the granularity of individual blocks. 
+ async fn read(&self, range: Range) -> Result { + if range.start == range.end { + return Ok(Bytes::new()); + } + + if range.start > range.end { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Invalid read range: start ({}) is greater than end ({})", + range.start, range.end + ), + )); + } + + if range.end > self.plain_stream_size { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Read range {}..{} exceeds plaintext size {}", + range.start, range.end, self.plain_stream_size + ), + )); + } + + if self.num_blocks == 0 { + return Ok(Bytes::new()); + } + + let first_block = range.start / PLAIN_BLOCK_SIZE as u64; + let last_block = (range.end - 1) / PLAIN_BLOCK_SIZE as u64; + + // Read all needed encrypted blocks in a single I/O call + let encrypted_start = Self::encrypted_block_offset(first_block); + let encrypted_end = + Self::encrypted_block_offset(last_block) + self.cipher_block_size(last_block) as u64; + + let all_encrypted = self.inner.read(encrypted_start..encrypted_end).await?; + + // Decrypt each block and extract the requested plaintext range + let result_len = (range.end - range.start) as usize; + let mut result = BytesMut::with_capacity(result_len); + let mut encrypted_offset = 0usize; + + for block_idx in first_block..=last_block { + let block_size = self.cipher_block_size(block_idx) as usize; + let cipher_block = &all_encrypted[encrypted_offset..encrypted_offset + block_size]; + encrypted_offset += block_size; + + let aad = stream_block_aad(&self.aad_prefix, block_idx as u32); + let decrypted = self.cipher.decrypt(cipher_block, Some(&aad))?; + + // Calculate which slice of this decrypted block we need + let block_plain_start = block_idx * PLAIN_BLOCK_SIZE as u64; + let slice_start = if block_idx == first_block { + (range.start - block_plain_start) as usize + } else { + 0 + }; + let slice_end = if block_idx == last_block { + (range.end - block_plain_start) as usize + } else { + decrypted.len() + }; + + 
result.extend_from_slice(&decrypted[slice_start..slice_end]); + } + + Ok(result.freeze()) + } +} + +/// Transparent encryption of AGS1 stream-encrypted files. +/// +/// Implements the [`FileWrite`] trait, buffering plaintext and emitting +/// encrypted AGS1 blocks. This is the streaming write counterpart to +/// [`AesGcmFileRead`]. +/// +/// # Usage +/// +/// ```ignore +/// // (ignored: requires async runtime and concrete FileRead/FileWrite impls) +/// let writer = AesGcmFileWrite::new( +/// inner_writer, // Box for the output file +/// cipher, // Arc with the DEK +/// aad_prefix.to_vec(), +/// ); +/// +/// writer.write(plaintext_chunk).await?; +/// writer.close().await?; +/// ``` +pub struct AesGcmFileWrite { + /// The underlying output writer. + inner: Box, + /// The AES-GCM cipher holding the DEK. + cipher: Arc, + /// AAD prefix from the key metadata. + aad_prefix: Box<[u8]>, + /// Plaintext buffer accumulating data before block encryption. + buffer: Vec, + /// Current block index for AAD construction. + block_index: u32, + /// Whether the AGS1 header has been written. + header_written: bool, + /// Whether close() has been called. + closed: bool, + /// Whether the writer is in a poisoned state due to a failed inner write. + /// Once poisoned, all subsequent operations are rejected because the inner + /// writer may have received partial data. + poisoned: bool, +} + +impl AesGcmFileWrite { + /// Creates a new `AesGcmFileWrite` for encrypting to AGS1 format. + /// + /// No I/O is performed until `write()` or `close()` is called. + pub fn new( + inner: Box, + cipher: Arc, + aad_prefix: impl Into>, + ) -> Self { + Self { + inner, + cipher, + aad_prefix: aad_prefix.into(), + buffer: Vec::new(), + block_index: 0, + header_written: false, + closed: false, + poisoned: false, + } + } + + /// Writes the AGS1 header (magic + plain block size) to the inner writer. 
+ async fn write_header(&mut self) -> Result<()> { + let mut header = Vec::with_capacity(GCM_STREAM_HEADER_LENGTH as usize); + header.extend_from_slice(&GCM_STREAM_MAGIC); + header.extend_from_slice(&PLAIN_BLOCK_SIZE.to_le_bytes()); + if let Err(e) = self.inner.write(Bytes::from(header)).await { + self.poisoned = true; + return Err(e); + } + self.header_written = true; + Ok(()) + } + + /// Encrypts a plaintext block and writes it to the inner writer. + async fn encrypt_and_write_block(&mut self, block_data: &[u8]) -> Result<()> { + let aad = stream_block_aad(&self.aad_prefix, self.block_index); + let encrypted = self.cipher.encrypt(block_data, Some(&aad))?; + if let Err(e) = self.inner.write(Bytes::from(encrypted)).await { + self.poisoned = true; + return Err(e); + } + self.block_index = self.block_index.checked_add(1).ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "AGS1 block index overflow: file exceeds the maximum supported size (~4 TiB)", + ) + })?; + Ok(()) + } + + /// Encrypts the first `PLAIN_BLOCK_SIZE` bytes of the buffer in-place + /// and drains them, avoiding a 1 MiB temporary copy. 
+ async fn encrypt_and_drain_block(&mut self) -> Result<()> { + let aad = stream_block_aad(&self.aad_prefix, self.block_index); + let encrypted = self + .cipher + .encrypt(&self.buffer[..PLAIN_BLOCK_SIZE as usize], Some(&aad))?; + if let Err(e) = self.inner.write(Bytes::from(encrypted)).await { + self.poisoned = true; + return Err(e); + } + self.block_index = self.block_index.checked_add(1).ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "AGS1 block index overflow: file exceeds the maximum supported size (~4 TiB)", + ) + })?; + self.buffer.drain(..PLAIN_BLOCK_SIZE as usize); + Ok(()) + } +} + +#[async_trait::async_trait] +impl FileWrite for AesGcmFileWrite { + async fn write(&mut self, bs: Bytes) -> Result<()> { + if self.closed { + return Err(Error::new( + ErrorKind::Unexpected, + "Cannot write to a closed AesGcmFileWrite", + )); + } + if self.poisoned { + return Err(Error::new( + ErrorKind::Unexpected, + "AesGcmFileWrite is in a poisoned state due to a previous write failure", + )); + } + + if !self.header_written { + self.write_header().await?; + } + + self.buffer.extend_from_slice(&bs); + + // Flush full blocks + while self.buffer.len() >= PLAIN_BLOCK_SIZE as usize { + self.encrypt_and_drain_block().await?; + } + + Ok(()) + } + + async fn close(&mut self) -> Result<()> { + if self.closed { + return Err(Error::new( + ErrorKind::Unexpected, + "AesGcmFileWrite already closed", + )); + } + if self.poisoned { + return Err(Error::new( + ErrorKind::Unexpected, + "AesGcmFileWrite is in a poisoned state due to a previous write failure", + )); + } + + if !self.header_written { + self.write_header().await?; + } + + // Write the final block if there's remaining data, or if this is an empty file + // (block_index == 0). Skip writing a spurious empty block when the plaintext was + // exactly block-aligned (buffer empty, blocks already written). 
+ if !self.buffer.is_empty() || self.block_index == 0 { + let final_block = std::mem::take(&mut self.buffer); + self.encrypt_and_write_block(&final_block).await?; + } + self.closed = true; + + self.inner.close().await + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Encrypts plaintext into AGS1 format for testing. + /// + /// Mirrors Java's `AesGcmOutputStream` behavior: + /// - Always writes header + at least one block (even for empty input) + /// - Full blocks are `PLAIN_BLOCK_SIZE` bytes; last block may be shorter + fn encrypt_ags1(plaintext: &[u8], cipher: &AesGcmCipher, aad_prefix: &[u8]) -> Vec { + let mut result = Vec::new(); + + // Write header: "AGS1" + PLAIN_BLOCK_SIZE (LE) + result.extend_from_slice(&GCM_STREAM_MAGIC); + result.extend_from_slice(&PLAIN_BLOCK_SIZE.to_le_bytes()); + + // Write blocks + let mut offset = 0; + let mut block_index = 0u32; + + loop { + let remaining = plaintext.len() - offset; + let block_size = std::cmp::min(remaining, PLAIN_BLOCK_SIZE as usize); + + // Block 0 is always written (even if empty); subsequent empty blocks are skipped + if block_size == 0 && block_index > 0 { + break; + } + + let block_data = &plaintext[offset..offset + block_size]; + let aad = stream_block_aad(aad_prefix, block_index); + let encrypted = cipher.encrypt(block_data, Some(&aad)).unwrap(); + result.extend_from_slice(&encrypted); + + offset += block_size; + block_index += 1; + + // A partial block is always the last + if block_size < PLAIN_BLOCK_SIZE as usize { + break; + } + } + + result + } + + /// Helper to create an AesGcmCipher from raw key bytes. + fn make_cipher(key: &[u8]) -> AesGcmCipher { + use super::super::SecureKey; + let secure_key = SecureKey::new(key).unwrap(); + AesGcmCipher::new(secure_key) + } + + /// Helper to create an in-memory FileRead from bytes. + fn memory_reader(data: Vec) -> Box { + Box::new(MemoryFileRead(Bytes::from(data))) + } + + /// Simple in-memory FileRead for tests. 
+ struct MemoryFileRead(Bytes); + + #[async_trait::async_trait] + impl FileRead for MemoryFileRead { + async fn read(&self, range: Range) -> Result { + let start = range.start as usize; + let end = range.end as usize; + if end > self.0.len() { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Range {}..{} out of bounds for {} bytes", + start, + end, + self.0.len() + ), + )); + } + Ok(self.0.slice(start..end)) + } + } + + #[tokio::test] + async fn test_empty_file_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(b"", &cipher, aad_prefix); + + // Verify minimum length: header(8) + nonce(12) + tag(16) = 36 + assert_eq!(encrypted.len(), MIN_STREAM_LENGTH as usize); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), 0); + + // Reading empty range should return empty bytes + let result = reader.read(0..0).await.unwrap(); + assert!(result.is_empty()); + } + + #[tokio::test] + async fn test_small_file_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + let plaintext = b"Hello, Iceberg encryption!"; + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), plaintext.len() as u64); + + // Read entire file + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], plaintext); + } + + #[tokio::test] + async fn test_partial_read() { + let key = b"0123456789abcdef"; + let aad_prefix = b"aad-prefix-here!"; + let plaintext = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + let cipher 
= make_cipher(key); + + let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + // Read a slice from the middle + let result = reader.read(10..20).await.unwrap(); + assert_eq!(&result[..], &plaintext[10..20]); + + // Read first byte + let result = reader.read(0..1).await.unwrap(); + assert_eq!(&result[..], &plaintext[0..1]); + + // Read last byte + let last = plaintext.len() as u64; + let result = reader.read(last - 1..last).await.unwrap(); + assert_eq!(&result[..], &plaintext[plaintext.len() - 1..]); + } + + #[tokio::test] + async fn test_multi_block_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"multi-block-aad!"; + + // 1.5 blocks of data + let size = PLAIN_BLOCK_SIZE as usize + PLAIN_BLOCK_SIZE as usize / 2; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), plaintext.len() as u64); + + // Read entire file + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_cross_block_read() { + let key = b"0123456789abcdef"; + let aad_prefix = b"cross-block-aad!"; + + // 2.5 blocks of data + let size = PLAIN_BLOCK_SIZE as usize * 2 + PLAIN_BLOCK_SIZE as usize / 2; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + 
aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + // Read across block boundary (last 100 bytes of block 0 + first 100 bytes of block 1) + let boundary = PLAIN_BLOCK_SIZE as u64; + let result = reader.read(boundary - 100..boundary + 100).await.unwrap(); + assert_eq!( + &result[..], + &plaintext[(boundary - 100) as usize..(boundary + 100) as usize] + ); + + // Read across two block boundaries (spans blocks 0, 1, and 2) + let result = reader.read(boundary - 50..boundary * 2 + 50).await.unwrap(); + assert_eq!( + &result[..], + &plaintext[(boundary - 50) as usize..(boundary * 2 + 50) as usize] + ); + } + + #[tokio::test] + async fn test_exact_block_size() { + let key = b"0123456789abcdef"; + let aad_prefix = b"exact-block-aad!"; + + // Exactly 1 block + let plaintext: Vec = (0..PLAIN_BLOCK_SIZE as usize) + .map(|i| (i % 256) as u8) + .collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), PLAIN_BLOCK_SIZE as u64); + + let result = reader.read(0..PLAIN_BLOCK_SIZE as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_block_size_plus_one() { + let key = b"0123456789abcdef"; + let aad_prefix = b"block-plus-one!!"; + + // 1 block + 1 byte + let size = PLAIN_BLOCK_SIZE as usize + 1; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), size as u64); + + // Read the last byte (in block 1) + let 
result = reader.read(size as u64 - 1..size as u64).await.unwrap(); + assert_eq!(result[0], plaintext[size - 1]); + + // Read all + let result = reader.read(0..size as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_block_size_minus_one() { + let key = b"0123456789abcdef"; + let aad_prefix = b"block-minus-one!"; + + // 1 block - 1 byte + let size = PLAIN_BLOCK_SIZE as usize - 1; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), size as u64); + + let result = reader.read(0..size as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_wrong_aad_fails() { + let key = b"0123456789abcdef"; + let aad_prefix = b"correct-aad-here"; + let plaintext = b"sensitive data here"; + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix); + + // Try to decrypt with wrong AAD + let mut bad_aad = aad_prefix.to_vec(); + bad_aad[0] ^= 0xFF; + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + bad_aad.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + let result = reader.read(0..plaintext.len() as u64).await; + assert!(result.is_err(), "Decryption with wrong AAD should fail"); + } + + #[tokio::test] + async fn test_wrong_key_fails() { + let key = b"0123456789abcdef"; + let wrong_key = b"fedcba9876543210"; + let aad_prefix = b"test-aad-prefix!"; + let plaintext = b"sensitive data"; + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + 
memory_reader(encrypted.clone()), + Arc::new(make_cipher(wrong_key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + let result = reader.read(0..plaintext.len() as u64).await; + assert!(result.is_err(), "Decryption with wrong key should fail"); + } + + #[tokio::test] + async fn test_out_of_bounds_read() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + let plaintext = b"short data"; + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + let result = reader.read(0..plaintext.len() as u64 + 1).await; + assert!(result.is_err(), "Reading past end should fail"); + } + + #[tokio::test] + async fn test_calculate_plaintext_length() { + // Empty file: header only (not valid per Java, but handled) + assert_eq!( + AesGcmFileRead::calculate_plaintext_length(GCM_STREAM_HEADER_LENGTH as u64).unwrap(), + 0 + ); + + // Empty file with one empty block: header(8) + nonce(12) + tag(16) = 36 + assert_eq!( + AesGcmFileRead::calculate_plaintext_length(MIN_STREAM_LENGTH as u64).unwrap(), + 0 + ); + + // One full block: header(8) + cipher_block(1048604) = 1048612 + let one_full = GCM_STREAM_HEADER_LENGTH as u64 + CIPHER_BLOCK_SIZE as u64; + assert_eq!( + AesGcmFileRead::calculate_plaintext_length(one_full).unwrap(), + PLAIN_BLOCK_SIZE as u64 + ); + + // One full block + 1 byte: need partial second block + // Second block = nonce(12) + 1 byte ciphertext + tag(16) = 29 + let one_full_plus_one = one_full + NONCE_LENGTH as u64 + 1 + GCM_TAG_LENGTH as u64; + assert_eq!( + AesGcmFileRead::calculate_plaintext_length(one_full_plus_one).unwrap(), + PLAIN_BLOCK_SIZE as u64 + 1 + ); + } + + #[tokio::test] + async fn test_stream_block_aad() { + // With prefix + let aad = stream_block_aad(b"prefix", 0); + 
assert_eq!(&aad[..6], b"prefix"); + assert_eq!(&aad[6..], &0u32.to_le_bytes()); + + let aad = stream_block_aad(b"prefix", 1); + assert_eq!(&aad[..6], b"prefix"); + assert_eq!(&aad[6..], &1u32.to_le_bytes()); + + // Without prefix + let aad = stream_block_aad(b"", 42); + assert_eq!(&aad[..], &42u32.to_le_bytes()); + } + + #[tokio::test] + async fn test_encrypted_file_too_short() { + let result = AesGcmFileRead::new( + memory_reader(vec![0; 4]), + Arc::new(make_cipher(b"0123456789abcdef")), + [].into(), + 4, + ); + assert!(result.is_err()); + } + + // --- AesGcmFileWrite tests --- + + /// Shared-buffer FileWrite for testing AesGcmFileWrite output. + struct SharedMemoryWrite { + buffer: std::sync::Arc>>, + } + + /// FileWrite that fails after a configured number of successful writes. + struct FailingFileWrite { + writes_before_failure: usize, + write_count: usize, + } + + #[async_trait::async_trait] + impl FileWrite for FailingFileWrite { + async fn write(&mut self, _bs: Bytes) -> Result<()> { + if self.write_count >= self.writes_before_failure { + return Err(Error::new(ErrorKind::Unexpected, "simulated write failure")); + } + self.write_count += 1; + Ok(()) + } + + async fn close(&mut self) -> Result<()> { + Ok(()) + } + } + + #[async_trait::async_trait] + impl FileWrite for SharedMemoryWrite { + async fn write(&mut self, bs: Bytes) -> Result<()> { + self.buffer.lock().unwrap().extend_from_slice(&bs); + Ok(()) + } + + async fn close(&mut self) -> Result<()> { + Ok(()) + } + } + + /// Helper: one-shot encrypt through AesGcmFileWrite, return encrypted bytes. 
+ async fn write_through_ags1(plaintext: &[u8], key: &[u8], aad_prefix: &[u8]) -> Vec { + let buffer = std::sync::Arc::new(std::sync::Mutex::new(Vec::new())); + let inner: Box = Box::new(SharedMemoryWrite { + buffer: buffer.clone(), + }); + let cipher = Arc::new(make_cipher(key)); + let mut writer = AesGcmFileWrite::new(inner, cipher, aad_prefix.to_vec()); + + writer.write(Bytes::from(plaintext.to_vec())).await.unwrap(); + writer.close().await.unwrap(); + + buffer.lock().unwrap().clone() + } + + #[tokio::test] + async fn test_write_empty_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + + let encrypted = write_through_ags1(b"", key, aad_prefix).await; + + // Should produce header + one empty encrypted block + assert_eq!(encrypted.len(), MIN_STREAM_LENGTH as usize); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), 0); + } + + #[tokio::test] + async fn test_write_small_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + let plaintext = b"Hello, Iceberg encryption!"; + + let encrypted = write_through_ags1(plaintext, key, aad_prefix).await; + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), plaintext.len() as u64); + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], plaintext); + } + + #[tokio::test] + async fn test_write_multi_block_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"multi-block-aad!"; + + // 1.5 blocks of data + let size = PLAIN_BLOCK_SIZE as usize + PLAIN_BLOCK_SIZE as usize / 2; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + + let encrypted = 
write_through_ags1(&plaintext, key, aad_prefix).await; + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), plaintext.len() as u64); + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_cross_block_accumulation() { + let key = b"0123456789abcdef"; + let aad_prefix = b"cross-block-aad!"; + + let buffer = std::sync::Arc::new(std::sync::Mutex::new(Vec::new())); + let inner: Box = Box::new(SharedMemoryWrite { + buffer: buffer.clone(), + }); + let cipher = Arc::new(make_cipher(key)); + let mut writer = AesGcmFileWrite::new(inner, cipher, aad_prefix.to_vec()); + + // Write 1.5 blocks in 1000-byte chunks + let total_size = PLAIN_BLOCK_SIZE as usize + PLAIN_BLOCK_SIZE as usize / 2; + let plaintext: Vec = (0..total_size).map(|i| (i % 256) as u8).collect(); + let chunk_size = 1000; + for chunk in plaintext.chunks(chunk_size) { + writer.write(Bytes::from(chunk.to_vec())).await.unwrap(); + } + writer.close().await.unwrap(); + + let encrypted = buffer.lock().unwrap().clone(); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), plaintext.len() as u64); + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_exact_block_size() { + let key = b"0123456789abcdef"; + let aad_prefix = b"exact-block-aad!"; + + // Exactly 1 block + let plaintext: Vec = (0..PLAIN_BLOCK_SIZE as usize) + .map(|i| (i % 256) as u8) + .collect(); + + let encrypted = write_through_ags1(&plaintext, key, aad_prefix).await; + + let reader = AesGcmFileRead::new( + 
memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), PLAIN_BLOCK_SIZE as u64); + let result = reader.read(0..PLAIN_BLOCK_SIZE as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_block_aligned_no_spurious_empty_block() { + let key = b"0123456789abcdef"; + let aad_prefix = b"block-align-aad!"; + + // Write exactly one block of plaintext — close() should NOT add + // a trailing empty encrypted block (28 bytes: 12-byte nonce + 16-byte tag). + let plaintext: Vec = (0..PLAIN_BLOCK_SIZE as usize) + .map(|i| (i % 256) as u8) + .collect(); + + let encrypted_via_writer = write_through_ags1(&plaintext, key, aad_prefix).await; + let encrypted_via_reference = encrypt_ags1(&plaintext, &make_cipher(key), aad_prefix); + + // Both should be the same length — no extra 28-byte empty block + assert_eq!( + encrypted_via_writer.len(), + encrypted_via_reference.len(), + "Writer output should match reference encryption length (no spurious trailing block)" + ); + + // Verify roundtrip + let reader = AesGcmFileRead::new( + memory_reader(encrypted_via_writer.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted_via_writer.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), PLAIN_BLOCK_SIZE as u64); + let result = reader.read(0..PLAIN_BLOCK_SIZE as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_two_blocks_aligned_no_spurious_empty_block() { + let key = b"0123456789abcdef"; + let aad_prefix = b"2blk-align-aad!!"; + + // Exactly 2 blocks + let size = PLAIN_BLOCK_SIZE as usize * 2; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + + let encrypted_via_writer = write_through_ags1(&plaintext, key, aad_prefix).await; + let encrypted_via_reference = encrypt_ags1(&plaintext, 
&make_cipher(key), aad_prefix); + + assert_eq!( + encrypted_via_writer.len(), + encrypted_via_reference.len(), + "Writer output should match reference encryption length (no spurious trailing block)" + ); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted_via_writer.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted_via_writer.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), size as u64); + let result = reader.read(0..size as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_poisoned_after_inner_write_failure() { + let cipher = Arc::new(make_cipher(b"0123456789abcdef")); + // Fail on the second write (first write is the header, second is block data) + let inner: Box = Box::new(FailingFileWrite { + writes_before_failure: 1, + write_count: 0, + }); + let mut writer = AesGcmFileWrite::new(inner, cipher, b"aad-prefix-here!".to_vec()); + + // First write triggers header (succeeds) + block encrypt+write (fails) + let data = vec![0u8; PLAIN_BLOCK_SIZE as usize]; + let result = writer.write(Bytes::from(data)).await; + assert!(result.is_err()); + + // Subsequent write should be rejected as poisoned + let result = writer.write(Bytes::from(b"more data".to_vec())).await; + assert!(result.is_err()); + assert!( + result.unwrap_err().to_string().contains("poisoned"), + "expected poisoned error" + ); + + // Close should also be rejected + let result = writer.close().await; + assert!(result.is_err()); + assert!( + result.unwrap_err().to_string().contains("poisoned"), + "expected poisoned error on close" + ); + } +} From fda82a249461f8ecf09b6328f6db7272e238844a Mon Sep 17 00:00:00 2001 From: "R. Conner Howell" <5731503+rchowell@users.noreply.github.com> Date: Fri, 17 Apr 2026 16:59:33 -0700 Subject: [PATCH 18/45] refactor(storage): remove the configured_scheme parameter from storage impls (#2338) ## Which issue does this PR close? 
- Closes #2245 - Related #2231 ## What changes are included in this PR? - Remove configured_scheme field from OpenDalStorage::{S3,Azdls} - Make S3 storage use the scheme in the file paths, allowing for custom S3-compatible schemes like minio:// - Use `HashMap>` so aliases share a storage instance - Added new unit tests and removed some now-obsolete scheme-mismatch tests ## Break Change We are removing a struct field from public types, so this would need to be release in 0.10.0 ```rust // Before OpenDalStorageFactory::S3 { configured_scheme: "s3a".to_string(), customized_credential_load: None, } OpenDalStorageFactory::Azdls { configured_scheme: AzureStorageScheme::Abfss, } // After OpenDalStorageFactory::S3 { customized_credential_load: None } OpenDalStorageFactory::Azdls ``` ## Are these changes tested? Beyond the unit tests, I ran these integration tests. ```sh docker compose -f dev/docker-compose.yaml up -d --wait # requires unset on any AWS_ env vars cargo test -p iceberg-integration-tests cargo test -p iceberg-catalog-hms --test hms_catalog_test cargo test -p iceberg-catalog-loader cargo test -p iceberg-storage-opendal --features opendal-s3 --test file_io_s3_test ``` --- crates/catalog/glue/src/catalog.rs | 1 - crates/catalog/hms/tests/hms_catalog_test.rs | 2 - crates/catalog/loader/tests/common/mod.rs | 3 - crates/catalog/s3tables/src/catalog.rs | 1 - crates/iceberg/src/catalog/mod.rs | 1 - crates/integration_tests/tests/common/mod.rs | 1 - .../tests/conflict_commit_test.rs | 1 - .../tests/read_evolved_schema.rs | 1 - .../tests/read_positional_deletes.rs | 1 - crates/storage/opendal/README.md | 1 - crates/storage/opendal/src/azdls.rs | 53 +++---- crates/storage/opendal/src/lib.rs | 134 +++++------------- crates/storage/opendal/src/resolving.rs | 75 ++++++++-- .../storage/opendal/tests/file_io_s3_test.rs | 4 - 14 files changed, 116 insertions(+), 163 deletions(-) diff --git a/crates/catalog/glue/src/catalog.rs b/crates/catalog/glue/src/catalog.rs index 
a7e0171337..5b3ccf3b39 100644 --- a/crates/catalog/glue/src/catalog.rs +++ b/crates/catalog/glue/src/catalog.rs @@ -203,7 +203,6 @@ impl GlueCatalog { // Use provided factory or default to OpenDalStorageFactory::S3 let factory = storage_factory.unwrap_or_else(|| { Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, }) }); diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index c1ae4db7ee..d0e6486ad8 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -64,7 +64,6 @@ async fn get_catalog() -> HmsCatalog { // Wait for bucket to actually exist let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .with_props(props.clone()) @@ -83,7 +82,6 @@ async fn get_catalog() -> HmsCatalog { HmsCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .load("hms", props) diff --git a/crates/catalog/loader/tests/common/mod.rs b/crates/catalog/loader/tests/common/mod.rs index 1a3fb8d8f1..1d40fef357 100644 --- a/crates/catalog/loader/tests/common/mod.rs +++ b/crates/catalog/loader/tests/common/mod.rs @@ -233,7 +233,6 @@ async fn glue_catalog() -> GlueCatalog { ]); let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .with_props(props.clone()) @@ -285,7 +284,6 @@ async fn hms_catalog() -> HmsCatalog { ]); let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .with_props(props.clone()) @@ -302,7 +300,6 @@ async fn hms_catalog() -> HmsCatalog { HmsCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - 
configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .load("hms", props) diff --git a/crates/catalog/s3tables/src/catalog.rs b/crates/catalog/s3tables/src/catalog.rs index e19f5ae092..cc43446943 100644 --- a/crates/catalog/s3tables/src/catalog.rs +++ b/crates/catalog/s3tables/src/catalog.rs @@ -202,7 +202,6 @@ impl S3TablesCatalog { // Use provided factory or default to OpenDalStorageFactory::S3 let factory = storage_factory.unwrap_or_else(|| { Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, }) }); diff --git a/crates/iceberg/src/catalog/mod.rs b/crates/iceberg/src/catalog/mod.rs index f296cf2260..43102adec9 100644 --- a/crates/iceberg/src/catalog/mod.rs +++ b/crates/iceberg/src/catalog/mod.rs @@ -144,7 +144,6 @@ pub trait CatalogBuilder: Default + Debug + Send + Sync { /// /// let catalog = MyCatalogBuilder::default() /// .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - /// configured_scheme: "s3a".to_string(), /// customized_credential_load: None, /// })) /// .load("my_catalog", props) diff --git a/crates/integration_tests/tests/common/mod.rs b/crates/integration_tests/tests/common/mod.rs index e49a57465c..b7197a3a46 100644 --- a/crates/integration_tests/tests/common/mod.rs +++ b/crates/integration_tests/tests/common/mod.rs @@ -28,7 +28,6 @@ pub async fn random_ns() -> Namespace { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .load("rest", fixture.catalog_config.clone()) diff --git a/crates/integration_tests/tests/conflict_commit_test.rs b/crates/integration_tests/tests/conflict_commit_test.rs index 3b1362b95d..af2c7a7779 100644 --- a/crates/integration_tests/tests/conflict_commit_test.rs +++ b/crates/integration_tests/tests/conflict_commit_test.rs @@ -43,7 +43,6 @@ async fn 
test_append_data_file_conflict() { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .load("rest", fixture.catalog_config.clone()) diff --git a/crates/integration_tests/tests/read_evolved_schema.rs b/crates/integration_tests/tests/read_evolved_schema.rs index ae25a08987..f7416be2d4 100644 --- a/crates/integration_tests/tests/read_evolved_schema.rs +++ b/crates/integration_tests/tests/read_evolved_schema.rs @@ -34,7 +34,6 @@ async fn test_evolved_schema() { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .load("rest", fixture.catalog_config.clone()) diff --git a/crates/integration_tests/tests/read_positional_deletes.rs b/crates/integration_tests/tests/read_positional_deletes.rs index d4c4afeaf3..0f79596a12 100644 --- a/crates/integration_tests/tests/read_positional_deletes.rs +++ b/crates/integration_tests/tests/read_positional_deletes.rs @@ -30,7 +30,6 @@ async fn test_read_table_with_positional_deletes() { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .load("rest", fixture.catalog_config.clone()) diff --git a/crates/storage/opendal/README.md b/crates/storage/opendal/README.md index c5092eb97a..a4ad512e17 100644 --- a/crates/storage/opendal/README.md +++ b/crates/storage/opendal/README.md @@ -61,7 +61,6 @@ use iceberg_storage_opendal::OpenDalStorageFactory; async fn main() -> iceberg::Result<()> { let catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), 
customized_credential_load: None, })) .load( diff --git a/crates/storage/opendal/src/azdls.rs b/crates/storage/opendal/src/azdls.rs index 6251f8cdaa..b47c55d9e7 100644 --- a/crates/storage/opendal/src/azdls.rs +++ b/crates/storage/opendal/src/azdls.rs @@ -91,10 +91,9 @@ pub(crate) fn azdls_config_parse(mut properties: HashMap) -> Res pub(crate) fn azdls_create_operator<'a>( absolute_path: &'a str, config: &AzdlsConfig, - configured_scheme: &AzureStorageScheme, ) -> Result<(opendal::Operator, &'a str)> { let path = absolute_path.parse::()?; - match_path_with_config(&path, config, configured_scheme)?; + match_path_with_config(&path, config)?; let op = azdls_config_build(config, &path)?; @@ -160,18 +159,7 @@ impl FromStr for AzureStorageScheme { } /// Validates whether the given path matches what's configured for the backend. -pub(crate) fn match_path_with_config( - path: &AzureStoragePath, - config: &AzdlsConfig, - configured_scheme: &AzureStorageScheme, -) -> Result<()> { - ensure_data_valid!( - &path.scheme == configured_scheme, - "Storage::Azdls: Scheme mismatch: configured {}, passed {}", - configured_scheme, - path.scheme - ); - +pub(crate) fn match_path_with_config(path: &AzureStoragePath, config: &AzdlsConfig) -> Result<()> { if let Some(ref configured_account_name) = config.account_name { ensure_data_valid!( &path.account_name == configured_account_name, @@ -408,7 +396,6 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, ), Some(("myfs", "/path/to/file.parquet")), ), @@ -421,33 +408,19 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, - ), - None, - ), - ( - "different scheme", - ( - "wasbs://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet", - AzdlsConfig { - account_name: Some("myaccount".to_string()), - endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), - 
..Default::default() - }, - AzureStorageScheme::Abfss, ), None, ), ( "incompatible scheme for endpoint", ( - "abfs://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet", + // `abfss` implies https; configured endpoint is plain http. + "abfss://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet", AzdlsConfig { account_name: Some("myaccount".to_string()), endpoint: Some("http://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, ), None, ), @@ -460,7 +433,6 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.chinacloudapi.cn".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, ), None, ), @@ -474,14 +446,27 @@ mod tests { endpoint: None, ..Default::default() }, - AzureStorageScheme::Abfs, + ), + Some(("myfs", "/path/to/file.parquet")), + ), + ( + "scheme differs from a previously-configured one is accepted", + ( + // No configured scheme exists anymore; both abfss and wasbs + // should be accepted by the same storage. + "wasbs://myfs@myaccount.blob.core.windows.net/path/to/file.parquet", + AzdlsConfig { + account_name: Some("myaccount".to_string()), + endpoint: Some("https://myaccount.blob.core.windows.net".to_string()), + ..Default::default() + }, ), Some(("myfs", "/path/to/file.parquet")), ), ]; for (name, input, expected) in test_cases { - let result = azdls_create_operator(input.0, &input.1, &input.2); + let result = azdls_create_operator(input.0, &input.1); match expected { Some((expected_filesystem, expected_path)) => { assert!(result.is_ok(), "Test case {name} failed: {result:?}"); diff --git a/crates/storage/opendal/src/lib.rs b/crates/storage/opendal/src/lib.rs index 8160680523..a0336868e3 100644 --- a/crates/storage/opendal/src/lib.rs +++ b/crates/storage/opendal/src/lib.rs @@ -46,7 +46,6 @@ use utils::from_opendal_error; cfg_if! 
{ if #[cfg(feature = "opendal-azdls")] { mod azdls; - use azdls::AzureStorageScheme; use azdls::*; use opendal::services::AzdlsConfig; } @@ -108,9 +107,6 @@ pub enum OpenDalStorageFactory { /// S3 storage factory. #[cfg(feature = "opendal-s3")] S3 { - /// s3 storage could have `s3://` and `s3a://`. - /// Storing the scheme string here to return the correct path. - configured_scheme: String, /// Custom AWS credential loader. #[serde(skip)] customized_credential_load: Option, @@ -123,10 +119,7 @@ pub enum OpenDalStorageFactory { Oss, /// Azure Data Lake Storage factory. #[cfg(feature = "opendal-azdls")] - Azdls { - /// The configured Azure storage scheme. - configured_scheme: AzureStorageScheme, - }, + Azdls, } #[typetag::serde(name = "OpenDalStorageFactory")] @@ -142,10 +135,8 @@ impl StorageFactory for OpenDalStorageFactory { OpenDalStorageFactory::Fs => Ok(Arc::new(OpenDalStorage::LocalFs)), #[cfg(feature = "opendal-s3")] OpenDalStorageFactory::S3 { - configured_scheme, customized_credential_load, } => Ok(Arc::new(OpenDalStorage::S3 { - configured_scheme: configured_scheme.clone(), config: s3_config_parse(config.props().clone())?.into(), customized_credential_load: customized_credential_load.clone(), })), @@ -158,12 +149,9 @@ impl StorageFactory for OpenDalStorageFactory { config: oss_config_parse(config.props().clone())?.into(), })), #[cfg(feature = "opendal-azdls")] - OpenDalStorageFactory::Azdls { configured_scheme } => { - Ok(Arc::new(OpenDalStorage::Azdls { - configured_scheme: configured_scheme.clone(), - config: azdls_config_parse(config.props().clone())?.into(), - })) - } + OpenDalStorageFactory::Azdls => Ok(Arc::new(OpenDalStorage::Azdls { + config: azdls_config_parse(config.props().clone())?.into(), + })), #[cfg(all( not(feature = "opendal-memory"), not(feature = "opendal-fs"), @@ -196,11 +184,11 @@ pub enum OpenDalStorage { #[cfg(feature = "opendal-fs")] LocalFs, /// S3 storage variant. 
+ /// + /// Accepts any S3-family URL (`s3://`, `s3a://`, `s3n://`); the scheme is + /// derived from the path at call time. #[cfg(feature = "opendal-s3")] S3 { - /// s3 storage could have `s3://` and `s3a://`. - /// Storing the scheme string here to return the correct path. - configured_scheme: String, /// S3 configuration. config: Arc, /// Custom AWS credential loader. @@ -220,16 +208,13 @@ pub enum OpenDalStorage { config: Arc, }, /// Azure Data Lake Storage variant. - /// Expects paths of the form + /// + /// Accepts paths of the form /// `abfs[s]://@.dfs./` or /// `wasb[s]://@.blob./`. + /// The scheme is derived from the path at call time. #[cfg(feature = "opendal-azdls")] - #[allow(private_interfaces)] Azdls { - /// The configured Azure storage scheme. - /// Because Azdls accepts multiple possible schemes, we store the full - /// passed scheme here to later validate schemes passed via paths. - configured_scheme: AzureStorageScheme, /// Azure DLS configuration. config: Arc, }, @@ -274,15 +259,21 @@ impl OpenDalStorage { } #[cfg(feature = "opendal-s3")] OpenDalStorage::S3 { - configured_scheme, config, customized_credential_load, } => { let op = s3_config_build(config, customized_credential_load, path)?; let op_info = op.info(); - // Check prefix of s3 path. - let prefix = format!("{}://{}/", configured_scheme, op_info.name()); + // Use the URL scheme in the path for prefix matching. This enables + // use of S3-compatible storage backends using custom schemes (e.g., `minio://`, `r2://`). 
+ let url = url::Url::parse(path).map_err(|e| { + Error::new( + ErrorKind::DataInvalid, + format!("Invalid s3 url: {path}: {e}"), + ) + })?; + let prefix = format!("{}://{}/", url.scheme(), op_info.name()); if path.starts_with(&prefix) { (op, &path[prefix.len()..]) } else { @@ -319,10 +310,7 @@ impl OpenDalStorage { } } #[cfg(feature = "opendal-azdls")] - OpenDalStorage::Azdls { - configured_scheme, - config, - } => azdls_create_operator(path, config, configured_scheme)?, + OpenDalStorage::Azdls { config } => azdls_create_operator(path, config)?, #[cfg(all( not(feature = "opendal-s3"), not(feature = "opendal-fs"), @@ -357,9 +345,7 @@ impl OpenDalStorage { #[cfg(feature = "opendal-fs")] OpenDalStorage::LocalFs => Ok(path.strip_prefix("file:/").unwrap_or(&path[1..])), #[cfg(feature = "opendal-s3")] - OpenDalStorage::S3 { - configured_scheme, .. - } => { + OpenDalStorage::S3 { .. } => { let url = url::Url::parse(path)?; let bucket = url.host_str().ok_or_else(|| { Error::new( @@ -367,7 +353,7 @@ impl OpenDalStorage { format!("Invalid s3 url: {path}, missing bucket"), ) })?; - let prefix = format!("{}://{}/", configured_scheme, bucket); + let prefix = format!("{}://{}/", url.scheme(), bucket); if path.starts_with(&prefix) { Ok(&path[prefix.len()..]) } else { @@ -416,12 +402,9 @@ impl OpenDalStorage { } } #[cfg(feature = "opendal-azdls")] - OpenDalStorage::Azdls { - configured_scheme, - config, - } => { + OpenDalStorage::Azdls { config } => { let azure_path = path.parse::()?; - match_path_with_config(&azure_path, config, configured_scheme)?; + match_path_with_config(&azure_path, config)?; let relative_path_len = azure_path.path.len(); Ok(&path[path.len() - relative_path_len..]) } @@ -631,47 +614,21 @@ mod tests { #[test] fn test_relativize_path_s3() { let storage = OpenDalStorage::S3 { - configured_scheme: "s3".to_string(), config: Arc::new(S3Config::default()), customized_credential_load: None, }; - assert_eq!( - storage - 
.relativize_path("s3://my-bucket/path/to/file.parquet") - .unwrap(), - "path/to/file.parquet" - ); - - // s3a scheme - let storage_s3a = OpenDalStorage::S3 { - configured_scheme: "s3a".to_string(), - config: Arc::new(S3Config::default()), - customized_credential_load: None, - }; - assert_eq!( - storage_s3a - .relativize_path("s3a://my-bucket/path/to/file.parquet") - .unwrap(), - "path/to/file.parquet" - ); - } - - #[cfg(feature = "opendal-s3")] - #[test] - fn test_relativize_path_s3_scheme_mismatch() { - let storage = OpenDalStorage::S3 { - configured_scheme: "s3".to_string(), - config: Arc::new(S3Config::default()), - customized_credential_load: None, - }; - - // Scheme mismatch should error - assert!( - storage - .relativize_path("s3a://my-bucket/path/to/file.parquet") - .is_err() - ); + // All S3-family schemes are accepted by the same storage instance. + // Custom schemes for S3-compatible stores (e.g., `minio://`) are also + // accepted because the path's scheme is used as-is for prefix matching. 
+ for scheme in ["s3", "s3a", "s3n", "minio"] { + assert_eq!( + storage + .relativize_path(&format!("{scheme}://my-bucket/path/to/file.parquet")) + .unwrap(), + "path/to/file.parquet" + ); + } } #[cfg(feature = "opendal-gcs")] @@ -736,7 +693,6 @@ mod tests { #[test] fn test_relativize_path_azdls() { let storage = OpenDalStorage::Azdls { - configured_scheme: AzureStorageScheme::Abfss, config: Arc::new(AzdlsConfig { account_name: Some("myaccount".to_string()), endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), @@ -751,24 +707,4 @@ mod tests { "/path/to/file.parquet" ); } - - #[cfg(feature = "opendal-azdls")] - #[test] - fn test_relativize_path_azdls_scheme_mismatch() { - let storage = OpenDalStorage::Azdls { - configured_scheme: AzureStorageScheme::Abfss, - config: Arc::new(AzdlsConfig { - account_name: Some("myaccount".to_string()), - endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), - ..Default::default() - }), - }; - - // wasbs scheme doesn't match configured abfss - assert!( - storage - .relativize_path("wasbs://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet") - .is_err() - ); - } } diff --git a/crates/storage/opendal/src/resolving.rs b/crates/storage/opendal/src/resolving.rs index 7c06cf96a5..64a16b18d2 100644 --- a/crates/storage/opendal/src/resolving.rs +++ b/crates/storage/opendal/src/resolving.rs @@ -70,29 +70,28 @@ fn parse_scheme(scheme: &str) -> Result { } } -/// Extract the scheme string from a path URL. -fn extract_scheme(path: &str) -> Result { +/// Extract the [`Scheme`] family from a path URL. +fn extract_scheme(path: &str) -> Result { let url = Url::parse(path).map_err(|e| { Error::new( ErrorKind::DataInvalid, format!("Invalid path: {path}, failed to parse URL: {e}"), ) })?; - Ok(url.scheme().to_string()) + parse_scheme(url.scheme()) } /// Build an [`OpenDalStorage`] variant for the given scheme and config properties. 
fn build_storage_for_scheme( - scheme: &str, + scheme: Scheme, props: &HashMap, #[cfg(feature = "opendal-s3")] customized_credential_load: &Option, ) -> Result { - match parse_scheme(scheme)? { + match scheme { #[cfg(feature = "opendal-s3")] Scheme::S3 => { let config = crate::s3::s3_config_parse(props.clone())?; Ok(OpenDalStorage::S3 { - configured_scheme: scheme.to_string(), config: Arc::new(config), customized_credential_load: customized_credential_load.clone(), }) @@ -113,10 +112,8 @@ fn build_storage_for_scheme( } #[cfg(feature = "opendal-azdls")] Scheme::Azdls => { - let configured_scheme: crate::azdls::AzureStorageScheme = scheme.parse()?; let config = crate::azdls::azdls_config_parse(props.clone())?; Ok(OpenDalStorage::Azdls { - configured_scheme, config: Arc::new(config), }) } @@ -196,14 +193,15 @@ impl StorageFactory for OpenDalResolvingStorageFactory { /// to the appropriate [`OpenDalStorage`] variant. /// /// Sub-storages are lazily created on first use for each scheme and cached -/// for subsequent operations. +/// for subsequent operations. Scheme aliases like `s3`/`s3a`/`s3n` map to +/// the same [`Scheme`] variant, so they share a storage instance. #[derive(Debug, Serialize, Deserialize)] pub struct OpenDalResolvingStorage { /// Configuration properties shared across all backends. props: HashMap, - /// Cache of scheme → storage mappings. + /// Cache of scheme to storage mappings. #[serde(skip, default)] - storages: RwLock>>, + storages: RwLock>>, /// Custom AWS credential loader for S3 storage. 
#[cfg(feature = "opendal-s3")] #[serde(skip)] @@ -239,7 +237,7 @@ impl OpenDalResolvingStorage { } let storage = build_storage_for_scheme( - &scheme, + scheme, &self.props, #[cfg(feature = "opendal-s3")] &self.customized_credential_load, @@ -288,7 +286,7 @@ impl Storage for OpenDalResolvingStorage { async fn delete_stream(&self, mut paths: BoxStream<'static, String>) -> Result<()> { // Group paths by scheme so each resolved storage receives a batch, // avoiding repeated operator creation per path. - let mut grouped: HashMap> = HashMap::new(); + let mut grouped: HashMap> = HashMap::new(); while let Some(path) = paths.next().await { let scheme = extract_scheme(&path)?; grouped.entry(scheme).or_default().push(path); @@ -317,3 +315,54 @@ impl Storage for OpenDalResolvingStorage { )) } } + +#[cfg(test)] +mod tests { + use super::*; + + /// Builds a resolving storage with empty props, suitable for `resolve()` + /// calls that don't actually hit any backend. + fn empty_resolving_storage() -> OpenDalResolvingStorage { + OpenDalResolvingStorage { + props: HashMap::new(), + storages: RwLock::new(HashMap::new()), + #[cfg(feature = "opendal-s3")] + customized_credential_load: None, + } + } + + #[cfg(feature = "opendal-s3")] + #[test] + fn test_resolve_s3_aliases_share_instance() { + let storage = empty_resolving_storage(); + + // All three S3-family schemes must collapse to a single cached + // `Arc` so that catalogs handing the resolver a mix + // of `s3://`, `s3a://`, `s3n://` paths don't rebuild operators. 
+ let a = storage.resolve("s3://bucket/key").unwrap(); + let b = storage.resolve("s3a://bucket/key").unwrap(); + let c = storage.resolve("s3n://bucket/key").unwrap(); + + assert!(Arc::ptr_eq(&a, &b), "s3 and s3a should share one instance"); + assert!(Arc::ptr_eq(&a, &c), "s3 and s3n should share one instance"); + } + + #[cfg(feature = "opendal-azdls")] + #[test] + fn test_resolve_azdls_aliases_share_instance() { + let storage = empty_resolving_storage(); + + let path_for = |scheme: &str| { + format!("{scheme}://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet") + }; + + // All Azure schemes collapse onto one cached instance. + let abfss = storage.resolve(&path_for("abfss")).unwrap(); + let abfs = storage.resolve(&path_for("abfs")).unwrap(); + + assert!( + Arc::ptr_eq(&abfss, &abfs), + "abfss and abfs should share one instance" + ); + } +} diff --git a/crates/storage/opendal/tests/file_io_s3_test.rs b/crates/storage/opendal/tests/file_io_s3_test.rs index a27afb6996..d6dd8a3b45 100644 --- a/crates/storage/opendal/tests/file_io_s3_test.rs +++ b/crates/storage/opendal/tests/file_io_s3_test.rs @@ -40,7 +40,6 @@ mod tests { let minio_endpoint = get_minio_endpoint(); FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .with_props(vec![ @@ -134,7 +133,6 @@ mod tests { // Test that the loader can be used in FileIOBuilder with OpenDalStorageFactory let _builder = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: Some(custom_loader), })) .with_props(vec![ @@ -157,7 +155,6 @@ mod tests { // Build FileIO with custom credential loader via OpenDalStorageFactory let file_io_with_custom_creds = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: Some(custom_loader), })) .with_props(vec![ @@ -186,7 +183,6 @@ mod tests { // Build FileIO with custom 
credential loader via OpenDalStorageFactory let file_io_with_custom_creds = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: Some(custom_loader), })) .with_props(vec![ From 1c7eb65068b46863bc83edc827844e519f746401 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 20 Apr 2026 10:52:02 +0800 Subject: [PATCH 19/45] chore(deps): Bump PyO3/maturin-action from 1.50.1 to 1.51.0 (#2346) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [PyO3/maturin-action](https://github.com/pyo3/maturin-action) from 1.50.1 to 1.51.0.
Release notes

Sourced from PyO3/maturin-action's releases.

v1.51.0

What's Changed

New Contributors

Full Changelog: https://github.com/PyO3/maturin-action/compare/v1.50.1...v1.51.0

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=PyO3/maturin-action&package-manager=github_actions&previous-version=1.50.1&new-version=1.51.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/bindings_python_ci.yml | 2 +- .github/workflows/release_python.yml | 4 ++-- .github/workflows/release_python_nightly.yml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/bindings_python_ci.yml b/.github/workflows/bindings_python_ci.yml index 916113e06f..4483a53310 100644 --- a/.github/workflows/bindings_python_ci.yml +++ b/.github/workflows/bindings_python_ci.yml @@ -95,7 +95,7 @@ jobs: - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: 3.12 - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: working-directory: "bindings/python" command: build diff --git a/.github/workflows/release_python.yml b/.github/workflows/release_python.yml index c9817e064c..1514cbacc8 100644 --- a/.github/workflows/release_python.yml +++ b/.github/workflows/release_python.yml @@ -124,7 +124,7 @@ jobs: env: NEEDS_VALIDATE_RELEASE_TAG_OUTPUTS_CARGO_VERSION: ${{ needs.validate-release-tag.outputs.cargo-version }} - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: working-directory: "bindings/python" command: sdist @@ -184,7 +184,7 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: ${{ steps.get-msrv.outputs.msrv }} - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: target: ${{ matrix.target }} manylinux: ${{ matrix.manylinux || 'auto' }} diff --git a/.github/workflows/release_python_nightly.yml b/.github/workflows/release_python_nightly.yml index 55695784e9..115d5d7955 100644 --- 
a/.github/workflows/release_python_nightly.yml +++ b/.github/workflows/release_python_nightly.yml @@ -48,7 +48,7 @@ jobs: with: timestamp: ${{ needs.set-version.outputs.TIMESTAMP }} - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: working-directory: "bindings/python" command: sdist @@ -98,7 +98,7 @@ jobs: with: rust-version: ${{ steps.get-msrv.outputs.msrv }} - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: target: ${{ matrix.target }} manylinux: ${{ matrix.manylinux || 'auto' }} From 02496d3f20d2796437272234ee42fbb406c6efdd Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 20 Apr 2026 10:52:34 +0800 Subject: [PATCH 20/45] chore(deps): Bump actions/upload-artifact from 7.0.0 to 7.0.1 (#2345) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [actions/upload-artifact](https://github.com/actions/upload-artifact) from 7.0.0 to 7.0.1.
Release notes

Sourced from actions/upload-artifact's releases.

v7.0.1

What's Changed

Full Changelog: https://github.com/actions/upload-artifact/compare/v7...v7.0.1

Commits
  • 043fb46 Merge pull request #797 from actions/yacaovsnc/update-dependency
  • 634250c Include changes in typespec/ts-http-runtime 0.3.5
  • e454baa Readme: bump all the example versions to v7 (#796)
  • 74fad66 Update the readme with direct upload details (#795)
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=actions/upload-artifact&package-manager=github_actions&previous-version=7.0.0&new-version=7.0.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/release_python.yml | 4 ++-- .github/workflows/release_python_nightly.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/release_python.yml b/.github/workflows/release_python.yml index 1514cbacc8..0638cff6df 100644 --- a/.github/workflows/release_python.yml +++ b/.github/workflows/release_python.yml @@ -130,7 +130,7 @@ jobs: command: sdist args: -o dist - name: Upload sdist - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-sdist path: bindings/python/dist @@ -192,7 +192,7 @@ jobs: command: build args: --release -o dist -i python3.12 # Explicitly set interpreter; manylinux containers have multiple Pythons and maturin may pick an older one - name: Upload wheels - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-${{ matrix.os }}-${{ matrix.target }} path: bindings/python/dist diff --git a/.github/workflows/release_python_nightly.yml b/.github/workflows/release_python_nightly.yml index 115d5d7955..26b034554c 100644 --- a/.github/workflows/release_python_nightly.yml +++ b/.github/workflows/release_python_nightly.yml @@ -55,7 +55,7 @@ jobs: args: -o dist - name: Upload sdist - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-sdist path: bindings/python/dist @@ -107,7 +107,7 @@ jobs: args: --release -o dist -i python3.12 # Explicitly set interpreter; manylinux containers have multiple Pythons and maturin may pick an older one - name: Upload wheels - uses: 
actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-${{ matrix.os }}-${{ matrix.target }} path: bindings/python/dist From c1538de36dd53e491299b62ad89286f2db496bc7 Mon Sep 17 00:00:00 2001 From: jeff-sqds Date: Sun, 19 Apr 2026 23:48:37 -0400 Subject: [PATCH 21/45] support fixedbinary(n) (#2348) ## Which issue does this PR close? - Closes #2347 . ## What changes are included in this PR? This change performs datum conversion for FixedSizeBinaryArray types from fixed binary primitive types. It closely follows the previously added support for Uuid types. ## Are these changes tested? Yes --- crates/iceberg/src/arrow/schema.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index f96c29ab4a..9b504421ae 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -762,6 +762,11 @@ pub(crate) fn get_arrow_datum(datum: &Datum) -> Result { + let array = FixedSizeBinaryArray::try_from_iter(std::iter::once(value.as_slice())) + .map_err(|e| Error::new(ErrorKind::DataInvalid, e.to_string()))?; + Ok(Arc::new(Scalar::new(array))) + } (primitive_type, _) => Err(Error::new( ErrorKind::FeatureUnsupported, @@ -2154,6 +2159,18 @@ mod tests { assert!(is_scalar); assert_eq!(array.value(0), [66u8; 16]); } + { + let datum = Datum::fixed(vec![1u8, 2, 3, 4, 5, 6, 7, 8]); + let arrow_datum = get_arrow_datum(&datum).unwrap(); + let (array, is_scalar) = arrow_datum.get(); + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + assert!(is_scalar); + assert_eq!(array.value_length(), 8); + assert_eq!(array.value(0), &[1u8, 2, 3, 4, 5, 6, 7, 8]); + } } #[test] From 1b6400956da02d6b564db527edf5d7ea8feca0d3 Mon Sep 17 00:00:00 2001 From: Xander Date: Tue, 21 Apr 2026 02:08:37 +0100 Subject: [PATCH 22/45] chore: bump datafusion to 53.1.0 (#2350)
## Which issue does this PR close? Keep up with latest release from datafusion: https://github.com/apache/datafusion/issues/21079 - Closes #. ## What changes are included in this PR? chore: bump datafusion to 53.1.0 ## Are these changes tested? --- Cargo.lock | 128 ++++++++++++++++++++++++++--------------------------- Cargo.toml | 2 +- 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f80d0013e..a33bf4c5d8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1643,9 +1643,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de9f8117889ba9503440f1dd79ebab32ba52ccf1720bb83cd718a29d4edc0d16" +checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ "arrow", "arrow-schema", @@ -1699,9 +1699,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be893b73a13671f310ffcc8da2c546b81efcc54c22e0382c0a28aa3537017137" +checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ "arrow", "async-trait", @@ -1724,9 +1724,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830487b51ed83807d6b32d6325f349c3144ae0c9bf772cf2a712db180c31d5e6" +checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ "arrow", "async-trait", @@ -1775,9 +1775,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d7663f3af955292f8004e74bcaf8f7ea3d66cc38438749615bb84815b61a293" +checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies 
= [ "ahash", "apache-avro", @@ -1802,9 +1802,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f590205c7e32fe1fea48dd53ffb406e56ae0e7a062213a3ac848db8771641bd" +checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def" dependencies = [ "futures", "log", @@ -1813,9 +1813,9 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fde1e030a9dc87b743c806fbd631f5ecfa2ccaa4ffb61fa19144a07fea406b79" +checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ "arrow", "async-compression", @@ -1848,9 +1848,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331ebae7055dc108f9b54994b93dff91f3a17445539efe5b74e89264f7b36e15" +checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ "arrow", "arrow-ipc", @@ -1872,9 +1872,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-avro" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49dda81c79b6ba57b1853a9158abc66eb85a3aa1cede0c517dabec6d8a4ed3aa" +checksum = "a579c3bd290c66ea4b269493e75e8a3ed42c9c895a651f10210a29538aee50c4" dependencies = [ "apache-avro", "arrow", @@ -1892,9 +1892,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e0d475088325e2986876aa27bb30d0574f72a22955a527d202f454681d55c5c" +checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ "arrow", "async-trait", @@ -1915,9 +1915,9 @@ 
dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea1520d81f31770f3ad6ee98b391e75e87a68a5bb90de70064ace5e0a7182fe8" +checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ "arrow", "async-trait", @@ -1939,9 +1939,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95be805d0742ab129720f4c51ad9242cd872599cdb076098b03f061fcdc7f946" +checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" dependencies = [ "arrow", "async-trait", @@ -1969,15 +1969,15 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c93ad9e37730d2c7196e68616f3f2dd3b04c892e03acd3a8eeca6e177f3c06a" +checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee" [[package]] name = "datafusion-execution" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9437d3cd5d363f9319f8122182d4d233427de79c7eb748f23054c9aaa0fdd8df" +checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ "arrow", "arrow-buffer", @@ -1999,9 +1999,9 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67164333342b86521d6d93fa54081ee39839894fb10f7a700c099af96d7552cf" +checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ "arrow", "async-trait", @@ -2022,9 +2022,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "53.0.0" +version = "53.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab05fdd00e05d5a6ee362882546d29d6d3df43a6c55355164a7fbee12d163bc9" +checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ "arrow", "datafusion-common", @@ -2035,9 +2035,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04fb863482d987cf938db2079e07ab0d3bb64595f28907a6c2f8671ad71cca7e" +checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ "arrow", "arrow-buffer", @@ -2067,9 +2067,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "829856f4e14275fb376c104f27cbf3c3b57a9cfe24885d98677525f5e43ce8d6" +checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash", "arrow", @@ -2089,9 +2089,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08af79cc3d2aa874a362fb97decfcbd73d687190cb096f16a6c85a7780cce311" +checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash", "arrow", @@ -2102,9 +2102,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "465ae3368146d49c2eda3e2c0ef114424c87e8a6b509ab34c1026ace6497e790" +checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" dependencies = [ "arrow", "arrow-ord", @@ -2127,9 +2127,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "53.0.0" +version = "53.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "6156e6b22fcf1784112fc0173f3ae6e78c8fdb4d3ed0eace9543873b437e2af6" +checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ "arrow", "async-trait", @@ -2143,9 +2143,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca7baec14f866729012efb89011a6973f3a346dc8090c567bfcd328deff551c1" +checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ "arrow", "datafusion-common", @@ -2161,9 +2161,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "159228c3280d342658466bb556dc24de30047fe1d7e559dc5d16ccc5324166f9" +checksum = "8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -2171,9 +2171,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5427e5da5edca4d21ea1c7f50e1c9421775fe33d7d5726e5641a833566e7578" +checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", @@ -2182,9 +2182,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89099eefcd5b223ec685c36a41d35c69239236310d71d339f2af0fa4383f3f46" +checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ "arrow", "chrono", @@ -2202,9 +2202,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "53.0.0" +version = "53.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f222df5195d605d79098ef37bdd5323bff0131c9d877a24da6ec98dfca9fe36" +checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash", "arrow", @@ -2226,9 +2226,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-adapter" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40838625d63d9c12549d81979db3dd675d159055eb9135009ba272ab0e8d0f64" +checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ "arrow", "datafusion-common", @@ -2241,9 +2241,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eacbcc4cfd502558184ed58fa3c72e775ec65bf077eef5fd2b3453db676f893c" +checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash", "arrow", @@ -2258,9 +2258,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d501d0e1d0910f015677121601ac177ec59272ef5c9324d1147b394988f40941" +checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ "arrow", "datafusion-common", @@ -2277,9 +2277,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "463c88ad6f1ecab1810f4c9f046898bee035b370137eb79b2b2db925e270631d" +checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash", "arrow", @@ -2309,9 +2309,9 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "53.0.0" +version = "53.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "2857618a0ecbd8cd0cf29826889edd3a25774ec26b2995fc3862095c95d88fc6" +checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ "arrow", "datafusion-common", @@ -2326,9 +2326,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef8637e35022c5c775003b3ab1debc6b4a8f0eb41b069bdd5475dd3aa93f6eba" +checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e" dependencies = [ "async-trait", "datafusion-common", @@ -2367,9 +2367,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12d9e9f16a1692a11c94bcc418191fa15fd2b4d72a0c1a0c607db93c0b84dd81" +checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ "arrow", "bigdecimal", diff --git a/Cargo.toml b/Cargo.toml index 2f5a515ef0..7f612c44bf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,7 +66,7 @@ cfg-if = "1" chrono = "0.4.41" clap = { version = "4.5.48", features = ["derive", "cargo"] } dashmap = "6" -datafusion = "53.0.0" +datafusion = "53.1.0" datafusion-cli = "53.0.0" datafusion-sqllogictest = "53.0.0" derive_builder = "0.20" From ad44fc3b99f941ecb6d6d7e5a40647077068361e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 21 Apr 2026 15:33:41 +0800 Subject: [PATCH 23/45] chore(deps): Bump rustls-webpki from 0.103.10 to 0.103.12 in /bindings/python (#2352) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [rustls-webpki](https://github.com/rustls/webpki) from 0.103.10 to 0.103.12.
Release notes

Sourced from rustls-webpki's releases.

0.103.12

This release fixes two bugs in name constraint enforcement:

  • GHSA-965h-392x-2mh5: name constraints for URI names were ignored and therefore accepted. URI name constraints are now rejected unconditionally. Note this library does not provide an API for asserting URI names, and URI name constraints are otherwise not implemented.
  • GHSA-xgp8-3hg3-c2mh: permitted subtree name constraints for DNS names were accepted for certificates asserting a wildcard name. This was incorrect because, given a name constraint of accept.example.com, *.example.com could feasibly allow a name of reject.example.com which is outside the constraint. This is very similar to CVE-2025-61727.

Since name constraints are restrictions on otherwise properly-issued certificates, these bugs are reachable only after signature verification and require misissuance to exploit.

What's Changed

Full Changelog: https://github.com/rustls/webpki/compare/v/0.103.11...v/0.103.12

0.103.11

In response to #464, we've slightly relaxed requirements for anchor_from_trust_cert() to ignore unknown extensions even if they're marked as critical. This only affects parsing a TrustAnchor from DER, for which most extensions are ignored anyway.

What's Changed

Commits
  • 27131d4 Bump version to 0.103.12
  • 6ecb876 Clean up stuttery enum variant names
  • 318b3e6 Ignore wildcard labels when matching name constraints
  • 1219622 Rewrite constraint matching to avoid permissive catch-all branch
  • 57bc62c Bump version to 0.103.11
  • d0fa01e Allow parsing trust anchors with unknown critical extensions
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=rustls-webpki&package-manager=cargo&previous-version=0.103.10&new-version=0.103.12)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/apache/iceberg-rust/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- bindings/python/Cargo.lock | 148 ++++++++++++++++++------------------- 1 file changed, 74 insertions(+), 74 deletions(-) diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index 1b5c06f492..21ddcefc58 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -163,7 +163,7 @@ dependencies = [ "miniz_oxide", "num-bigint", "quad-rand", - "rand 0.9.2", + "rand 0.9.4", "regex-lite", "serde", "serde_bytes", @@ -1052,9 +1052,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de9f8117889ba9503440f1dd79ebab32ba52ccf1720bb83cd718a29d4edc0d16" +checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ "arrow", "arrow-schema", @@ -1095,7 +1095,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand 0.9.2", + "rand 0.9.4", "regex", "sqlparser", "tempfile", @@ -1107,9 +1107,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be893b73a13671f310ffcc8da2c546b81efcc54c22e0382c0a28aa3537017137" +checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ "arrow", "async-trait", @@ -1132,9 +1132,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830487b51ed83807d6b32d6325f349c3144ae0c9bf772cf2a712db180c31d5e6" +checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ "arrow", "async-trait", @@ -1155,9 +1155,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "53.0.0" +version = "53.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d7663f3af955292f8004e74bcaf8f7ea3d66cc38438749615bb84815b61a293" +checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash", "arrow", @@ -1180,9 +1180,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f590205c7e32fe1fea48dd53ffb406e56ae0e7a062213a3ac848db8771641bd" +checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def" dependencies = [ "futures", "log", @@ -1191,9 +1191,9 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fde1e030a9dc87b743c806fbd631f5ecfa2ccaa4ffb61fa19144a07fea406b79" +checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ "arrow", "async-compression", @@ -1217,7 +1217,7 @@ dependencies = [ "liblzma", "log", "object_store", - "rand 0.9.2", + "rand 0.9.4", "tokio", "tokio-util", "url", @@ -1226,9 +1226,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331ebae7055dc108f9b54994b93dff91f3a17445539efe5b74e89264f7b36e15" +checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ "arrow", "arrow-ipc", @@ -1250,9 +1250,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e0d475088325e2986876aa27bb30d0574f72a22955a527d202f454681d55c5c" +checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ "arrow", "async-trait", @@ -1273,9 +1273,9 @@ dependencies = [ [[package]] name = 
"datafusion-datasource-json" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea1520d81f31770f3ad6ee98b391e75e87a68a5bb90de70064ace5e0a7182fe8" +checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ "arrow", "async-trait", @@ -1297,9 +1297,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95be805d0742ab129720f4c51ad9242cd872599cdb076098b03f061fcdc7f946" +checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" dependencies = [ "arrow", "async-trait", @@ -1327,15 +1327,15 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c93ad9e37730d2c7196e68616f3f2dd3b04c892e03acd3a8eeca6e177f3c06a" +checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee" [[package]] name = "datafusion-execution" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9437d3cd5d363f9319f8122182d4d233427de79c7eb748f23054c9aaa0fdd8df" +checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ "arrow", "arrow-buffer", @@ -1349,16 +1349,16 @@ dependencies = [ "log", "object_store", "parking_lot", - "rand 0.9.2", + "rand 0.9.4", "tempfile", "url", ] [[package]] name = "datafusion-expr" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67164333342b86521d6d93fa54081ee39839894fb10f7a700c099af96d7552cf" +checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ "arrow", "async-trait", @@ -1379,9 +1379,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "53.0.0" +version 
= "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab05fdd00e05d5a6ee362882546d29d6d3df43a6c55355164a7fbee12d163bc9" +checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ "arrow", "datafusion-common", @@ -1422,9 +1422,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04fb863482d987cf938db2079e07ab0d3bb64595f28907a6c2f8671ad71cca7e" +checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ "arrow", "arrow-buffer", @@ -1445,7 +1445,7 @@ dependencies = [ "md-5", "memchr", "num-traits", - "rand 0.9.2", + "rand 0.9.4", "regex", "sha2", "unicode-segmentation", @@ -1454,9 +1454,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "829856f4e14275fb376c104f27cbf3c3b57a9cfe24885d98677525f5e43ce8d6" +checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash", "arrow", @@ -1476,9 +1476,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08af79cc3d2aa874a362fb97decfcbd73d687190cb096f16a6c85a7780cce311" +checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash", "arrow", @@ -1489,9 +1489,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "465ae3368146d49c2eda3e2c0ef114424c87e8a6b509ab34c1026ace6497e790" +checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" dependencies = [ "arrow", "arrow-ord", @@ -1514,9 
+1514,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6156e6b22fcf1784112fc0173f3ae6e78c8fdb4d3ed0eace9543873b437e2af6" +checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ "arrow", "async-trait", @@ -1530,9 +1530,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca7baec14f866729012efb89011a6973f3a346dc8090c567bfcd328deff551c1" +checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ "arrow", "datafusion-common", @@ -1548,9 +1548,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "159228c3280d342658466bb556dc24de30047fe1d7e559dc5d16ccc5324166f9" +checksum = "8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1558,9 +1558,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5427e5da5edca4d21ea1c7f50e1c9421775fe33d7d5726e5641a833566e7578" +checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", @@ -1569,9 +1569,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89099eefcd5b223ec685c36a41d35c69239236310d71d339f2af0fa4383f3f46" +checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ "arrow", "chrono", @@ -1589,9 +1589,9 @@ 
dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f222df5195d605d79098ef37bdd5323bff0131c9d877a24da6ec98dfca9fe36" +checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash", "arrow", @@ -1613,9 +1613,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-adapter" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40838625d63d9c12549d81979db3dd675d159055eb9135009ba272ab0e8d0f64" +checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ "arrow", "datafusion-common", @@ -1628,9 +1628,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eacbcc4cfd502558184ed58fa3c72e775ec65bf077eef5fd2b3453db676f893c" +checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash", "arrow", @@ -1645,9 +1645,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d501d0e1d0910f015677121601ac177ec59272ef5c9324d1147b394988f40941" +checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ "arrow", "datafusion-common", @@ -1664,9 +1664,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "463c88ad6f1ecab1810f4c9f046898bee035b370137eb79b2b2db925e270631d" +checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash", "arrow", @@ -1719,7 +1719,7 @@ dependencies = [ 
"datafusion-proto-common", "object_store", "prost", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] @@ -1735,9 +1735,9 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2857618a0ecbd8cd0cf29826889edd3a25774ec26b2995fc3862095c95d88fc6" +checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ "arrow", "datafusion-common", @@ -1752,9 +1752,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef8637e35022c5c775003b3ab1debc6b4a8f0eb41b069bdd5475dd3aa93f6eba" +checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e" dependencies = [ "async-trait", "datafusion-common", @@ -1766,9 +1766,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12d9e9f16a1692a11c94bcc418191fa15fd2b4d72a0c1a0c607db93c0b84dd81" +checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ "arrow", "bigdecimal", @@ -2435,7 +2435,7 @@ dependencies = [ "once_cell", "ordered-float 4.6.0", "parquet", - "rand 0.8.5", + "rand 0.9.4", "reqwest", "roaring", "serde", @@ -3587,7 +3587,7 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.2", + "rand 0.9.4", "ring", "rustc-hash", "rustls", @@ -3647,9 +3647,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.2" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.5", @@ -3968,9 +3968,9 @@ dependencies = [ [[package]] name = 
"rustls-webpki" -version = "0.103.10" +version = "0.103.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +checksum = "8279bb85272c9f10811ae6a6c547ff594d6a7f3c6c6b02ee9726d1d0dcfcdd06" dependencies = [ "ring", "rustls-pki-types", From 1bed7b6bf18dbf6f56be21559a3c0c2066ed5f90 Mon Sep 17 00:00:00 2001 From: Xander Date: Wed, 22 Apr 2026 09:41:45 +0100 Subject: [PATCH 24/45] fix: NaN pushdown correctly pushes down NaNs correctness issue (#2351) ## Which issue does this PR close? Not tied to a specific issue - found during an audit of pushdown filter gaps. ## What changes are included in this PR? `PredicateConverter::is_nan` and `not_nan` were never actually implemented - `is_nan` returned `always_true` (matches every row) and `not_nan` returned `always_false` (matches no rows). Every other predicate in `PredicateConverter` projects the column from the batch and runs an arrow compute kernel, but these two just returned constants. This adds a `compute_is_nan` helper that views the column as a `Float32`/`Float64` primitive array and checks each value with `is_nan()`; null slots become `false` instead of staying null, so the result carries no null bitmap. Any other data type hits `unreachable!`, since `is_nan` is only valid for float types. `is_nan` and `not_nan` now use it the same way `is_null`/`not_null` use `arrow::is_null`/`is_not_null`. ## Are these changes tested?
Yes, test added --- crates/iceberg/src/arrow/reader.rs | 126 +++++++++++++++++++++++++++-- 1 file changed, 121 insertions(+), 5 deletions(-) diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 488f41cf20..5cd97410ea 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -23,7 +23,10 @@ use std::str::FromStr; use std::sync::Arc; use arrow_arith::boolean::{and, and_kleene, is_not_null, is_null, not, or, or_kleene}; +use arrow_array::cast::AsArray; +use arrow_array::types::{Float32Type, Float64Type}; use arrow_array::{Array, ArrayRef, BooleanArray, Datum as ArrowDatum, RecordBatch, Scalar}; +use arrow_buffer::BooleanBuffer; use arrow_cast::cast::cast; use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq}; use arrow_schema::{ @@ -1509,6 +1512,35 @@ fn project_column( } } +fn compute_is_nan(array: &ArrayRef) -> std::result::Result<BooleanArray, ArrowError> { + // Compute NaN over the contiguous values slice, then fold the null bitmap + // in with a single bitwise AND so that null slots become false. + let (is_nan, nulls) = match array.data_type() { + DataType::Float32 => { + let arr = array.as_primitive::<Float32Type>(); + ( + BooleanBuffer::from_iter(arr.values().iter().map(|v| v.is_nan())), + arr.nulls(), + ) + } + DataType::Float64 => { + let arr = array.as_primitive::<Float64Type>(); + ( + BooleanBuffer::from_iter(arr.values().iter().map(|v| v.is_nan())), + arr.nulls(), + ) + } + _ => unreachable!("is_nan is only valid for float types"), + }; + + let values = match nulls { + Some(nulls) => &is_nan & nulls.inner(), + None => is_nan, + }; + + Ok(BooleanArray::new(values, None)) +} + type PredicateResult = dyn FnMut(RecordBatch) -> std::result::Result<BooleanArray, ArrowError> + Send + 'static; @@ -1591,8 +1623,11 @@ impl BoundPredicateVisitor for PredicateConverter<'_> { reference: &BoundReference, _predicate: &BoundPredicate, ) -> Result<Box<PredicateResult>> { - if self.bound_reference(reference)?.is_some() { - self.build_always_true() + if let Some(idx) = self.bound_reference(reference)?
{ + Ok(Box::new(move |batch| { + let column = project_column(&batch, idx)?; + compute_is_nan(&column) + })) } else { // A missing column, treating it as null. self.build_always_false() @@ -1604,8 +1639,12 @@ impl BoundPredicateVisitor for PredicateConverter<'_> { reference: &BoundReference, _predicate: &BoundPredicate, ) -> Result> { - if self.bound_reference(reference)?.is_some() { - self.build_always_false() + if let Some(idx) = self.bound_reference(reference)? { + Ok(Box::new(move |batch| { + let column = project_column(&batch, idx)?; + let is_nan = compute_is_nan(&column)?; + not(&is_nan) + })) } else { // A missing column, treating it as null. self.build_always_true() @@ -2002,7 +2041,7 @@ mod tests { use std::sync::Arc; use arrow_array::cast::AsArray; - use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, StringArray}; + use arrow_array::{Array, ArrayRef, BooleanArray, LargeStringArray, RecordBatch, StringArray}; use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; use futures::TryStreamExt; use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; @@ -5464,4 +5503,81 @@ message schema { ts_array.value(0) ); } + + fn apply_predicate_to_batch( + predicate: Predicate, + schema: SchemaRef, + batch: RecordBatch, + ) -> BooleanArray { + use super::PredicateConverter; + + let bound = predicate.bind(schema, true).unwrap(); + + // Build a trivial Parquet schema with one float column at field id 4 + let message_type = " + message schema { + optional float qux = 4; + } + "; + let parquet_type = parse_message_type(message_type).expect("parse schema"); + let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_type)); + + let column_map = HashMap::from([(4i32, 0usize)]); + let column_indices = vec![0usize]; + + let mut converter = PredicateConverter { + parquet_schema: &parquet_schema, + column_map: &column_map, + column_indices: &column_indices, + }; + + let mut predicate_fn = visit(&mut converter, &bound).unwrap(); + 
predicate_fn(batch).unwrap() + } + + #[test] + fn test_predicate_converter_nan() { + use arrow_array::Float32Array; + + let schema = table_schema_simple(); + let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "qux", + DataType::Float32, + true, + )])); + let values = vec![Some(1.0f32), Some(f32::NAN), None, Some(0.0f32)]; + + // is_nan: non-null-propagating per Java's implementation - NULL → false + let batch = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Float32Array::from( + values.clone(), + ))]) + .unwrap(); + let result = + apply_predicate_to_batch(Reference::new("qux").is_nan(), schema.clone(), batch); + assert_eq!( + [ + result.value(0), + result.value(1), + result.value(2), + result.value(3) + ], + [false, true, false, false] + ); + assert!(!result.is_null(2)); + + // not_nan: non-null-propagating per Java's implementation - NULL → true + let batch = + RecordBatch::try_new(arrow_schema, vec![Arc::new(Float32Array::from(values))]).unwrap(); + let result = apply_predicate_to_batch(Reference::new("qux").is_not_nan(), schema, batch); + assert_eq!( + [ + result.value(0), + result.value(1), + result.value(2), + result.value(3) + ], + [true, false, true, true] + ); + assert!(!result.is_null(2)); + } } From 4b4ffd0a6a7e3d52f7ea219647407290b0d46e93 Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Wed, 22 Apr 2026 23:22:33 -0700 Subject: [PATCH 25/45] fix: Bump rustls-webpki from 0.103.12 to 0.103.13 to fix RUSTSEC-2026-0104 (#2356) ## Which issue does this PR close? - Closes #. ## What changes are included in this PR? ## Are these changes tested? 
--- Cargo.lock | 4 ++-- bindings/python/Cargo.lock | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a33bf4c5d8..528aaf023f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5735,9 +5735,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.12" +version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8279bb85272c9f10811ae6a6c547ff594d6a7f3c6c6b02ee9726d1d0dcfcdd06" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ "aws-lc-rs", "ring", diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index 21ddcefc58..72ea322d7b 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -3968,9 +3968,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.12" +version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8279bb85272c9f10811ae6a6c547ff594d6a7f3c6c6b02ee9726d1d0dcfcdd06" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ "ring", "rustls-pki-types", From c82c42eb20f4bb2b33bec7c53d52742404ac0b55 Mon Sep 17 00:00:00 2001 From: Charles-Antoine Leger Date: Thu, 23 Apr 2026 11:40:46 +0200 Subject: [PATCH 26/45] feat(datafusion): show pushed-down limit in IcebergTableScan EXPLAIN output (#2360) ## Which issue does this PR close? - Closes #2359. ## What changes are included in this PR? Emit ` limit:[N]` in `IcebergTableScan`'s `DisplayAs` output when a `LIMIT` is pushed down to the scan. When no limit is pushed down, the output is unchanged. - Before (unchanged): `IcebergTableScan projection:[id,name] predicate:[...]` - After (new, only when a limit reaches the scan): `IcebergTableScan projection:[...] predicate:[] limit:[3]` ## Are these changes tested? Yes: new `EXPLAIN ... LIMIT 3` assertion in `crates/sqllogictest/testdata/slts/df_test/basic_queries.slt`. 
Existing snapshots are unchanged, which confirms the additive-only behavior. --- .../datafusion/src/physical_plan/scan.rs | 6 +++++- .../testdata/slts/df_test/basic_queries.slt | 12 ++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index 234ab26470..36539ae503 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -196,7 +196,11 @@ impl DisplayAs for IcebergTableScan { self.predicates .clone() .map_or(String::from(""), |p| format!("{p}")) - ) + )?; + if let Some(limit) = self.limit { + write!(f, " limit:[{limit}]")?; + } + Ok(()) } } diff --git a/crates/sqllogictest/testdata/slts/df_test/basic_queries.slt b/crates/sqllogictest/testdata/slts/df_test/basic_queries.slt index 5d8889f158..a5ca4de46a 100644 --- a/crates/sqllogictest/testdata/slts/df_test/basic_queries.slt +++ b/crates/sqllogictest/testdata/slts/df_test/basic_queries.slt @@ -43,6 +43,18 @@ INSERT INTO default.default.query_test_table VALUES ---- 10 +# Verify EXPLAIN shows limit is pushed down to IcebergTableScan +query TT +EXPLAIN SELECT * FROM default.default.query_test_table LIMIT 3 +---- +logical_plan +01)Limit: skip=0, fetch=3 +02)--TableScan: default.default.query_test_table projection=[id, name, score, category], fetch=3 +physical_plan +01)GlobalLimitExec: skip=0, fetch=3 +02)--CooperativeExec +03)----IcebergTableScan projection:[id,name,score,category] predicate:[] limit:[3] + # Test SELECT * with ORDER BY and LIMIT query ITRT SELECT * FROM default.default.query_test_table ORDER BY id LIMIT 3 From 4b0b35255469039d9877218049f93b07556124b5 Mon Sep 17 00:00:00 2001 From: blackmwk Date: Fri, 24 Apr 2026 05:30:28 +0800 Subject: [PATCH 27/45] Split arrow reader into smaller modules (#2358) ## Which issue does this PR close? - Closes #2309 ## What changes are included in this PR? 
Split arrow reader module into smaller ones so that it is easier to maintain. I deliberately made no other changes, to keep the PR easy to review. ## Are these changes tested? Unit tests. --------- Co-authored-by: Claude Opus 4.7 (1M context) --- crates/iceberg/src/arrow/reader.rs | 5583 ----------------- .../iceberg/src/arrow/reader/file_reader.rs | 368 ++ crates/iceberg/src/arrow/reader/mod.rs | 154 + crates/iceberg/src/arrow/reader/options.rs | 84 + crates/iceberg/src/arrow/reader/pipeline.rs | 1174 ++++ .../src/arrow/reader/positional_deletes.rs | 931 +++ .../src/arrow/reader/predicate_visitor.rs | 820 +++ crates/iceberg/src/arrow/reader/projection.rs | 1718 +++++ crates/iceberg/src/arrow/reader/row_filter.rs | 616 ++ 9 files changed, 5865 insertions(+), 5583 deletions(-) delete mode 100644 crates/iceberg/src/arrow/reader.rs create mode 100644 crates/iceberg/src/arrow/reader/file_reader.rs create mode 100644 crates/iceberg/src/arrow/reader/mod.rs create mode 100644 crates/iceberg/src/arrow/reader/options.rs create mode 100644 crates/iceberg/src/arrow/reader/pipeline.rs create mode 100644 crates/iceberg/src/arrow/reader/positional_deletes.rs create mode 100644 crates/iceberg/src/arrow/reader/predicate_visitor.rs create mode 100644 crates/iceberg/src/arrow/reader/projection.rs create mode 100644 crates/iceberg/src/arrow/reader/row_filter.rs diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs deleted file mode 100644 index 5cd97410ea..0000000000 --- a/crates/iceberg/src/arrow/reader.rs +++ /dev/null @@ -1,5583 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License.
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Parquet file data reader - -use std::collections::{HashMap, HashSet}; -use std::ops::Range; -use std::str::FromStr; -use std::sync::Arc; - -use arrow_arith::boolean::{and, and_kleene, is_not_null, is_null, not, or, or_kleene}; -use arrow_array::cast::AsArray; -use arrow_array::types::{Float32Type, Float64Type}; -use arrow_array::{Array, ArrayRef, BooleanArray, Datum as ArrowDatum, RecordBatch, Scalar}; -use arrow_buffer::BooleanBuffer; -use arrow_cast::cast::cast; -use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq}; -use arrow_schema::{ - ArrowError, DataType, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, -}; -use arrow_string::like::starts_with; -use bytes::Bytes; -use fnv::FnvHashSet; -use futures::future::BoxFuture; -use futures::{FutureExt, StreamExt, TryFutureExt, TryStreamExt}; -use parquet::arrow::arrow_reader::{ - ArrowPredicateFn, ArrowReaderMetadata, ArrowReaderOptions, RowFilter, RowSelection, RowSelector, -}; -use parquet::arrow::async_reader::AsyncFileReader; -use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, ParquetRecordBatchStreamBuilder, ProjectionMask}; -use parquet::file::metadata::{ - PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData, -}; -use parquet::schema::types::{SchemaDescriptor, Type as ParquetType}; -use typed_builder::TypedBuilder; - -use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; -use crate::arrow::int96::coerce_int96_timestamps; -use crate::arrow::record_batch_transformer::RecordBatchTransformerBuilder; -use crate::arrow::{arrow_schema_to_schema, 
get_arrow_datum}; -use crate::delete_vector::DeleteVector; -use crate::error::Result; -use crate::expr::visitors::bound_predicate_visitor::{BoundPredicateVisitor, visit}; -use crate::expr::visitors::page_index_evaluator::PageIndexEvaluator; -use crate::expr::visitors::row_group_metrics_evaluator::RowGroupMetricsEvaluator; -use crate::expr::{BoundPredicate, BoundReference}; -use crate::io::{FileIO, FileMetadata, FileRead}; -use crate::metadata_columns::{RESERVED_FIELD_ID_FILE, is_metadata_field}; -use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream}; -use crate::spec::{Datum, NameMapping, NestedField, PrimitiveType, Schema, Type}; -use crate::util::available_parallelism; -use crate::{Error, ErrorKind}; - -/// Default gap between byte ranges below which they are coalesced into a -/// single request. Matches object_store's `OBJECT_STORE_COALESCE_DEFAULT`. -const DEFAULT_RANGE_COALESCE_BYTES: u64 = 1024 * 1024; - -/// Default maximum number of coalesced byte ranges fetched concurrently. -/// Matches object_store's `OBJECT_STORE_COALESCE_PARALLEL`. -const DEFAULT_RANGE_FETCH_CONCURRENCY: usize = 10; - -/// Default number of bytes to prefetch when parsing Parquet footer metadata. -/// Matches DataFusion's default `ParquetOptions::metadata_size_hint`. -const DEFAULT_METADATA_SIZE_HINT: usize = 512 * 1024; - -/// Options for tuning Parquet file I/O. -#[derive(Clone, Copy, Debug, TypedBuilder)] -#[builder(field_defaults(setter(prefix = "with_")))] -pub(crate) struct ParquetReadOptions { - /// Number of bytes to prefetch for parsing the Parquet metadata. - /// - /// This hint can help reduce the number of fetch requests. For more details see the - /// [ParquetMetaDataReader documentation](https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html#method.with_prefetch_hint). - /// - /// Defaults to 512 KiB, matching DataFusion's default `ParquetOptions::metadata_size_hint`. 
- #[builder(default = Some(DEFAULT_METADATA_SIZE_HINT))] - pub(crate) metadata_size_hint: Option, - /// Gap threshold for merging nearby byte ranges into a single request. - /// Ranges with gaps smaller than this value will be coalesced. - /// - /// Defaults to 1 MiB, matching object_store's `OBJECT_STORE_COALESCE_DEFAULT`. - #[builder(default = DEFAULT_RANGE_COALESCE_BYTES)] - pub(crate) range_coalesce_bytes: u64, - /// Maximum number of merged byte ranges to fetch concurrently. - /// - /// Defaults to 10, matching object_store's `OBJECT_STORE_COALESCE_PARALLEL`. - #[builder(default = DEFAULT_RANGE_FETCH_CONCURRENCY)] - pub(crate) range_fetch_concurrency: usize, - /// Whether to preload the column index when reading Parquet metadata. - #[builder(default = true)] - pub(crate) preload_column_index: bool, - /// Whether to preload the offset index when reading Parquet metadata. - #[builder(default = true)] - pub(crate) preload_offset_index: bool, - /// Whether to preload the page index when reading Parquet metadata. 
- #[builder(default = false)] - pub(crate) preload_page_index: bool, -} - -impl ParquetReadOptions { - pub(crate) fn metadata_size_hint(&self) -> Option { - self.metadata_size_hint - } - - pub(crate) fn range_coalesce_bytes(&self) -> u64 { - self.range_coalesce_bytes - } - - pub(crate) fn range_fetch_concurrency(&self) -> usize { - self.range_fetch_concurrency - } - - pub(crate) fn preload_column_index(&self) -> bool { - self.preload_column_index - } - - pub(crate) fn preload_offset_index(&self) -> bool { - self.preload_offset_index - } - - pub(crate) fn preload_page_index(&self) -> bool { - self.preload_page_index - } -} - -/// Builder to create ArrowReader -pub struct ArrowReaderBuilder { - batch_size: Option, - file_io: FileIO, - concurrency_limit_data_files: usize, - row_group_filtering_enabled: bool, - row_selection_enabled: bool, - parquet_read_options: ParquetReadOptions, -} - -impl ArrowReaderBuilder { - /// Create a new ArrowReaderBuilder - pub fn new(file_io: FileIO) -> Self { - let num_cpus = available_parallelism().get(); - - ArrowReaderBuilder { - batch_size: None, - file_io, - concurrency_limit_data_files: num_cpus, - row_group_filtering_enabled: true, - row_selection_enabled: false, - parquet_read_options: ParquetReadOptions::builder().build(), - } - } - - /// Sets the max number of in flight data files that are being fetched - pub fn with_data_file_concurrency_limit(mut self, val: usize) -> Self { - self.concurrency_limit_data_files = val; - self - } - - /// Sets the desired size of batches in the response - /// to something other than the default - pub fn with_batch_size(mut self, batch_size: usize) -> Self { - self.batch_size = Some(batch_size); - self - } - - /// Determines whether to enable row group filtering. - pub fn with_row_group_filtering_enabled(mut self, row_group_filtering_enabled: bool) -> Self { - self.row_group_filtering_enabled = row_group_filtering_enabled; - self - } - - /// Determines whether to enable row selection. 
- pub fn with_row_selection_enabled(mut self, row_selection_enabled: bool) -> Self { - self.row_selection_enabled = row_selection_enabled; - self - } - - /// Provide a hint as to the number of bytes to prefetch for parsing the Parquet metadata - /// - /// This hint can help reduce the number of fetch requests. For more details see the - /// [ParquetMetaDataReader documentation](https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html#method.with_prefetch_hint). - pub fn with_metadata_size_hint(mut self, metadata_size_hint: usize) -> Self { - self.parquet_read_options.metadata_size_hint = Some(metadata_size_hint); - self - } - - /// Sets the gap threshold for merging nearby byte ranges into a single request. - /// Ranges with gaps smaller than this value will be coalesced. - /// - /// Defaults to 1 MiB, matching object_store's OBJECT_STORE_COALESCE_DEFAULT. - pub fn with_range_coalesce_bytes(mut self, range_coalesce_bytes: u64) -> Self { - self.parquet_read_options.range_coalesce_bytes = range_coalesce_bytes; - self - } - - /// Sets the maximum number of merged byte ranges to fetch concurrently. - /// - /// Defaults to 10, matching object_store's OBJECT_STORE_COALESCE_PARALLEL. - pub fn with_range_fetch_concurrency(mut self, range_fetch_concurrency: usize) -> Self { - self.parquet_read_options.range_fetch_concurrency = range_fetch_concurrency; - self - } - - /// Build the ArrowReader. 
- pub fn build(self) -> ArrowReader { - ArrowReader { - batch_size: self.batch_size, - file_io: self.file_io.clone(), - delete_file_loader: CachingDeleteFileLoader::new( - self.file_io.clone(), - self.concurrency_limit_data_files, - ), - concurrency_limit_data_files: self.concurrency_limit_data_files, - row_group_filtering_enabled: self.row_group_filtering_enabled, - row_selection_enabled: self.row_selection_enabled, - parquet_read_options: self.parquet_read_options, - } - } -} - -/// Reads data from Parquet files -#[derive(Clone)] -pub struct ArrowReader { - batch_size: Option, - file_io: FileIO, - delete_file_loader: CachingDeleteFileLoader, - - /// the maximum number of data files that can be fetched at the same time - concurrency_limit_data_files: usize, - - row_group_filtering_enabled: bool, - row_selection_enabled: bool, - parquet_read_options: ParquetReadOptions, -} - -impl ArrowReader { - /// Take a stream of FileScanTasks and reads all the files. - /// Returns a stream of Arrow RecordBatches containing the data from the files - pub fn read(self, tasks: FileScanTaskStream) -> Result { - let file_io = self.file_io.clone(); - let batch_size = self.batch_size; - let concurrency_limit_data_files = self.concurrency_limit_data_files; - let row_group_filtering_enabled = self.row_group_filtering_enabled; - let row_selection_enabled = self.row_selection_enabled; - let parquet_read_options = self.parquet_read_options; - - // Fast-path for single concurrency to avoid overhead of try_flatten_unordered - let stream: ArrowRecordBatchStream = if concurrency_limit_data_files == 1 { - Box::pin( - tasks - .and_then(move |task| { - let file_io = file_io.clone(); - - Self::process_file_scan_task( - task, - batch_size, - file_io, - self.delete_file_loader.clone(), - row_group_filtering_enabled, - row_selection_enabled, - parquet_read_options, - ) - }) - .map_err(|err| { - Error::new(ErrorKind::Unexpected, "file scan task generate failed") - .with_source(err) - }) - 
.try_flatten(), - ) - } else { - Box::pin( - tasks - .map_ok(move |task| { - let file_io = file_io.clone(); - - Self::process_file_scan_task( - task, - batch_size, - file_io, - self.delete_file_loader.clone(), - row_group_filtering_enabled, - row_selection_enabled, - parquet_read_options, - ) - }) - .map_err(|err| { - Error::new(ErrorKind::Unexpected, "file scan task generate failed") - .with_source(err) - }) - .try_buffer_unordered(concurrency_limit_data_files) - .try_flatten_unordered(concurrency_limit_data_files), - ) - }; - - Ok(stream) - } - - async fn process_file_scan_task( - task: FileScanTask, - batch_size: Option, - file_io: FileIO, - delete_file_loader: CachingDeleteFileLoader, - row_group_filtering_enabled: bool, - row_selection_enabled: bool, - parquet_read_options: ParquetReadOptions, - ) -> Result { - let should_load_page_index = - (row_selection_enabled && task.predicate.is_some()) || !task.deletes.is_empty(); - let mut parquet_read_options = parquet_read_options; - parquet_read_options.preload_page_index = should_load_page_index; - - let delete_filter_rx = - delete_file_loader.load_deletes(&task.deletes, Arc::clone(&task.schema)); - - // Open the Parquet file once, loading its metadata - let (parquet_file_reader, arrow_metadata) = Self::open_parquet_file( - &task.data_file_path, - &file_io, - task.file_size_in_bytes, - parquet_read_options, - ) - .await?; - - // Check if Parquet file has embedded field IDs - // Corresponds to Java's ParquetSchemaUtil.hasIds() - // Reference: parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java:118 - let missing_field_ids = arrow_metadata - .schema() - .fields() - .iter() - .next() - .is_some_and(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()); - - // Three-branch schema resolution strategy matching Java's ReadConf constructor - // - // Per Iceberg spec Column Projection rules: - // "Columns in Iceberg data files are selected by field id. 
The table schema's column - // names and order may change after a data file is written, and projection must be done - // using field ids." - // https://iceberg.apache.org/spec/#column-projection - // - // When Parquet files lack field IDs (e.g., Hive/Spark migrations via add_files), - // we must assign field IDs BEFORE reading data to enable correct projection. - // - // Java's ReadConf determines field ID strategy: - // - Branch 1: hasIds(fileSchema) → trust embedded field IDs, use pruneColumns() - // - Branch 2: nameMapping present → applyNameMapping(), then pruneColumns() - // - Branch 3: fallback → addFallbackIds(), then pruneColumnsFallback() - let arrow_metadata = if missing_field_ids { - // Parquet file lacks field IDs - must assign them before reading - let arrow_schema = if let Some(name_mapping) = &task.name_mapping { - // Branch 2: Apply name mapping to assign correct Iceberg field IDs - // Per spec rule #2: "Use schema.name-mapping.default metadata to map field id - // to columns without field id" - // Corresponds to Java's ParquetSchemaUtil.applyNameMapping() - apply_name_mapping_to_arrow_schema( - Arc::clone(arrow_metadata.schema()), - name_mapping, - )? - } else { - // Branch 3: No name mapping - use position-based fallback IDs - // Corresponds to Java's ParquetSchemaUtil.addFallbackIds() - add_fallback_field_ids_to_arrow_schema(arrow_metadata.schema()) - }; - - let options = ArrowReaderOptions::new().with_schema(arrow_schema); - ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options).map_err( - |e| { - Error::new( - ErrorKind::Unexpected, - "Failed to create ArrowReaderMetadata with field ID schema", - ) - .with_source(e) - }, - )? - } else { - // Branch 1: File has embedded field IDs - trust them - arrow_metadata - }; - - // Coerce INT96 timestamp columns to the resolution specified by the Iceberg schema. - // This must happen before building the stream reader to avoid i64 overflow in arrow-rs. 
- let arrow_metadata = if let Some(coerced_schema) = - coerce_int96_timestamps(arrow_metadata.schema(), &task.schema) - { - let options = ArrowReaderOptions::new().with_schema(Arc::clone(&coerced_schema)); - ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options).map_err( - |e| { - Error::new( - ErrorKind::Unexpected, - format!( - "Failed to create ArrowReaderMetadata with INT96-coerced schema: {coerced_schema}" - ), - ) - .with_source(e) - }, - )? - } else { - arrow_metadata - }; - - // Build the stream reader, reusing the already-opened file reader - let mut record_batch_stream_builder = - ParquetRecordBatchStreamBuilder::new_with_metadata(parquet_file_reader, arrow_metadata); - - // Filter out metadata fields for Parquet projection (they don't exist in files) - let project_field_ids_without_metadata: Vec = task - .project_field_ids - .iter() - .filter(|&&id| !is_metadata_field(id)) - .copied() - .collect(); - - // Create projection mask based on field IDs - // - If file has embedded IDs: field-ID-based projection (missing_field_ids=false) - // - If name mapping applied: field-ID-based projection (missing_field_ids=true but IDs now match) - // - If fallback IDs: position-based projection (missing_field_ids=true) - let projection_mask = Self::get_arrow_projection_mask( - &project_field_ids_without_metadata, - &task.schema, - record_batch_stream_builder.parquet_schema(), - record_batch_stream_builder.schema(), - missing_field_ids, // Whether to use position-based (true) or field-ID-based (false) projection - )?; - - record_batch_stream_builder = - record_batch_stream_builder.with_projection(projection_mask.clone()); - - // RecordBatchTransformer performs any transformations required on the RecordBatches - // that come back from the file, such as type promotion, default column insertion, - // column re-ordering, partition constants, and virtual field addition (like _file) - let mut record_batch_transformer_builder = - 
RecordBatchTransformerBuilder::new(task.schema_ref(), task.project_field_ids()); - - // Add the _file metadata column if it's in the projected fields - if task.project_field_ids().contains(&RESERVED_FIELD_ID_FILE) { - let file_datum = Datum::string(task.data_file_path.clone()); - record_batch_transformer_builder = - record_batch_transformer_builder.with_constant(RESERVED_FIELD_ID_FILE, file_datum); - } - - if let (Some(partition_spec), Some(partition_data)) = - (task.partition_spec.clone(), task.partition.clone()) - { - record_batch_transformer_builder = - record_batch_transformer_builder.with_partition(partition_spec, partition_data)?; - } - - let mut record_batch_transformer = record_batch_transformer_builder.build(); - - if let Some(batch_size) = batch_size { - record_batch_stream_builder = record_batch_stream_builder.with_batch_size(batch_size); - } - - let delete_filter = delete_filter_rx.await.unwrap()?; - let delete_predicate = delete_filter.build_equality_delete_predicate(&task).await?; - - // In addition to the optional predicate supplied in the `FileScanTask`, - // we also have an optional predicate resulting from equality delete files. - // If both are present, we logical-AND them together to form a single filter - // predicate that we can pass to the `RecordBatchStreamBuilder`. - let final_predicate = match (&task.predicate, delete_predicate) { - (None, None) => None, - (Some(predicate), None) => Some(predicate.clone()), - (None, Some(ref predicate)) => Some(predicate.clone()), - (Some(filter_predicate), Some(delete_predicate)) => { - Some(filter_predicate.clone().and(delete_predicate)) - } - }; - - // There are three possible sources for potential lists of selected RowGroup indices, - // and two for `RowSelection`s. 
- // Selected RowGroup index lists can come from three sources: - // * When task.start and task.length specify a byte range (file splitting); - // * When there are equality delete files that are applicable; - // * When there is a scan predicate and row_group_filtering_enabled = true. - // `RowSelection`s can be created in either or both of the following cases: - // * When there are positional delete files that are applicable; - // * When there is a scan predicate and row_selection_enabled = true - // Note that row group filtering from predicates only happens when - // there is a scan predicate AND row_group_filtering_enabled = true, - // but we perform row selection filtering if there are applicable - // equality delete files OR (there is a scan predicate AND row_selection_enabled), - // since the only implemented method of applying positional deletes is - // by using a `RowSelection`. - let mut selected_row_group_indices = None; - let mut row_selection = None; - - // Filter row groups based on byte range from task.start and task.length. - // If both start and length are 0, read the entire file (backwards compatibility). 
- if task.start != 0 || task.length != 0 { - let byte_range_filtered_row_groups = Self::filter_row_groups_by_byte_range( - record_batch_stream_builder.metadata(), - task.start, - task.length, - )?; - selected_row_group_indices = Some(byte_range_filtered_row_groups); - } - - if let Some(predicate) = final_predicate { - let (iceberg_field_ids, field_id_map) = Self::build_field_id_set_and_map( - record_batch_stream_builder.parquet_schema(), - &predicate, - )?; - - let row_filter = Self::get_row_filter( - &predicate, - record_batch_stream_builder.parquet_schema(), - &iceberg_field_ids, - &field_id_map, - )?; - record_batch_stream_builder = record_batch_stream_builder.with_row_filter(row_filter); - - if row_group_filtering_enabled { - let predicate_filtered_row_groups = Self::get_selected_row_group_indices( - &predicate, - record_batch_stream_builder.metadata(), - &field_id_map, - &task.schema, - )?; - - // Merge predicate-based filtering with byte range filtering (if present) - // by taking the intersection of both filters - selected_row_group_indices = match selected_row_group_indices { - Some(byte_range_filtered) => { - // Keep only row groups that are in both filters - let intersection: Vec = byte_range_filtered - .into_iter() - .filter(|idx| predicate_filtered_row_groups.contains(idx)) - .collect(); - Some(intersection) - } - None => Some(predicate_filtered_row_groups), - }; - } - - if row_selection_enabled { - row_selection = Some(Self::get_row_selection_for_filter_predicate( - &predicate, - record_batch_stream_builder.metadata(), - &selected_row_group_indices, - &field_id_map, - &task.schema, - )?); - } - } - - let positional_delete_indexes = delete_filter.get_delete_vector(&task); - - if let Some(positional_delete_indexes) = positional_delete_indexes { - let delete_row_selection = { - let positional_delete_indexes = positional_delete_indexes.lock().unwrap(); - - Self::build_deletes_row_selection( - record_batch_stream_builder.metadata().row_groups(), - 
&selected_row_group_indices, - &positional_delete_indexes, - ) - }?; - - // merge the row selection from the delete files with the row selection - // from the filter predicate, if there is one from the filter predicate - row_selection = match row_selection { - None => Some(delete_row_selection), - Some(filter_row_selection) => { - Some(filter_row_selection.intersection(&delete_row_selection)) - } - }; - } - - if let Some(row_selection) = row_selection { - record_batch_stream_builder = - record_batch_stream_builder.with_row_selection(row_selection); - } - - if let Some(selected_row_group_indices) = selected_row_group_indices { - record_batch_stream_builder = - record_batch_stream_builder.with_row_groups(selected_row_group_indices); - } - - // Build the batch stream and send all the RecordBatches that it generates - // to the requester. - let record_batch_stream = - record_batch_stream_builder - .build()? - .map(move |batch| match batch { - Ok(batch) => { - // Process the record batch (type promotion, column reordering, virtual fields, etc.) - record_batch_transformer.process_record_batch(batch) - } - Err(err) => Err(err.into()), - }); - - Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream) - } - - /// Opens a Parquet file and loads its metadata, returning both the reader and metadata. - /// The reader can be reused to build a `ParquetRecordBatchStreamBuilder` without - /// reopening the file. 
- pub(crate) async fn open_parquet_file( - data_file_path: &str, - file_io: &FileIO, - file_size_in_bytes: u64, - parquet_read_options: ParquetReadOptions, - ) -> Result<(ArrowFileReader, ArrowReaderMetadata)> { - let parquet_file = file_io.new_input(data_file_path)?; - let parquet_reader = parquet_file.reader().await?; - let mut reader = ArrowFileReader::new( - FileMetadata { - size: file_size_in_bytes, - }, - parquet_reader, - ) - .with_parquet_read_options(parquet_read_options); - - let arrow_metadata = ArrowReaderMetadata::load_async(&mut reader, Default::default()) - .await - .map_err(|e| { - Error::new(ErrorKind::Unexpected, "Failed to load Parquet metadata").with_source(e) - })?; - - Ok((reader, arrow_metadata)) - } - - /// computes a `RowSelection` from positional delete indices. - /// - /// Using the Parquet page index, we build a `RowSelection` that rejects rows that are indicated - /// as having been deleted by a positional delete, taking into account any row groups that have - /// been skipped entirely by the filter predicate - fn build_deletes_row_selection( - row_group_metadata_list: &[RowGroupMetaData], - selected_row_groups: &Option>, - positional_deletes: &DeleteVector, - ) -> Result { - let mut results: Vec = Vec::new(); - let mut selected_row_groups_idx = 0; - let mut current_row_group_base_idx: u64 = 0; - let mut delete_vector_iter = positional_deletes.iter(); - let mut next_deleted_row_idx_opt = delete_vector_iter.next(); - - for (idx, row_group_metadata) in row_group_metadata_list.iter().enumerate() { - let row_group_num_rows = row_group_metadata.num_rows() as u64; - let next_row_group_base_idx = current_row_group_base_idx + row_group_num_rows; - - // if row group selection is enabled, - if let Some(selected_row_groups) = selected_row_groups { - // if we've consumed all the selected row groups, we're done - if selected_row_groups_idx == selected_row_groups.len() { - break; - } - - if idx == selected_row_groups[selected_row_groups_idx] { - // 
we're in a selected row group. Increment selected_row_groups_idx - // so that next time around the for loop we're looking for the next - // selected row group - selected_row_groups_idx += 1; - } else { - // Advance iterator past all deletes in the skipped row group. - // advance_to() positions the iterator to the first delete >= next_row_group_base_idx. - // However, if our cached next_deleted_row_idx_opt is in the skipped range, - // we need to call next() to update the cache with the newly positioned value. - delete_vector_iter.advance_to(next_row_group_base_idx); - // Only update the cache if the cached value is stale (in the skipped range) - if let Some(cached_idx) = next_deleted_row_idx_opt - && cached_idx < next_row_group_base_idx - { - next_deleted_row_idx_opt = delete_vector_iter.next(); - } - - // still increment the current page base index but then skip to the next row group - // in the file - current_row_group_base_idx += row_group_num_rows; - continue; - } - } - - let mut next_deleted_row_idx = match next_deleted_row_idx_opt { - Some(next_deleted_row_idx) => { - // if the index of the next deleted row is beyond this row group, add a selection for - // the remainder of this row group and skip to the next row group - if next_deleted_row_idx >= next_row_group_base_idx { - results.push(RowSelector::select(row_group_num_rows as usize)); - current_row_group_base_idx += row_group_num_rows; - continue; - } - - next_deleted_row_idx - } - - // If there are no more pos deletes, add a selector for the entirety of this row group. 
- _ => { - results.push(RowSelector::select(row_group_num_rows as usize)); - current_row_group_base_idx += row_group_num_rows; - continue; - } - }; - - let mut current_idx = current_row_group_base_idx; - 'chunks: while next_deleted_row_idx < next_row_group_base_idx { - // `select` all rows that precede the next delete index - if current_idx < next_deleted_row_idx { - let run_length = next_deleted_row_idx - current_idx; - results.push(RowSelector::select(run_length as usize)); - current_idx += run_length; - } - - // `skip` all consecutive deleted rows in the current row group - let mut run_length = 0; - while next_deleted_row_idx == current_idx - && next_deleted_row_idx < next_row_group_base_idx - { - run_length += 1; - current_idx += 1; - - next_deleted_row_idx_opt = delete_vector_iter.next(); - next_deleted_row_idx = match next_deleted_row_idx_opt { - Some(next_deleted_row_idx) => next_deleted_row_idx, - _ => { - // We've processed the final positional delete. - // Conclude the skip and then break so that we select the remaining - // rows in the row group and move on to the next row group - results.push(RowSelector::skip(run_length)); - break 'chunks; - } - }; - } - if run_length > 0 { - results.push(RowSelector::skip(run_length)); - } - } - - if current_idx < next_row_group_base_idx { - results.push(RowSelector::select( - (next_row_group_base_idx - current_idx) as usize, - )); - } - - current_row_group_base_idx += row_group_num_rows; - } - - Ok(results.into()) - } - - fn build_field_id_set_and_map( - parquet_schema: &SchemaDescriptor, - predicate: &BoundPredicate, - ) -> Result<(HashSet, HashMap)> { - // Collects all Iceberg field IDs referenced in the filter predicate - let mut collector = CollectFieldIdVisitor { - field_ids: HashSet::default(), - }; - visit(&mut collector, predicate)?; - - let iceberg_field_ids = collector.field_ids(); - - // Without embedded field IDs, we fall back to position-based mapping for compatibility - let field_id_map = match 
build_field_id_map(parquet_schema)? { - Some(map) => map, - None => build_fallback_field_id_map(parquet_schema), - }; - - Ok((iceberg_field_ids, field_id_map)) - } - - /// Recursively extract leaf field IDs because Parquet projection works at the leaf column level. - /// Nested types (struct/list/map) are flattened in Parquet's columnar format. - fn include_leaf_field_id(field: &NestedField, field_ids: &mut Vec) { - match field.field_type.as_ref() { - Type::Primitive(_) => { - field_ids.push(field.id); - } - Type::Struct(struct_type) => { - for nested_field in struct_type.fields() { - Self::include_leaf_field_id(nested_field, field_ids); - } - } - Type::List(list_type) => { - Self::include_leaf_field_id(&list_type.element_field, field_ids); - } - Type::Map(map_type) => { - Self::include_leaf_field_id(&map_type.key_field, field_ids); - Self::include_leaf_field_id(&map_type.value_field, field_ids); - } - } - } - - fn get_arrow_projection_mask( - field_ids: &[i32], - iceberg_schema_of_task: &Schema, - parquet_schema: &SchemaDescriptor, - arrow_schema: &ArrowSchemaRef, - use_fallback: bool, // Whether file lacks embedded field IDs (e.g., migrated from Hive/Spark) - ) -> Result { - fn type_promotion_is_valid( - file_type: Option<&PrimitiveType>, - projected_type: Option<&PrimitiveType>, - ) -> bool { - match (file_type, projected_type) { - (Some(lhs), Some(rhs)) if lhs == rhs => true, - (Some(PrimitiveType::Int), Some(PrimitiveType::Long)) => true, - (Some(PrimitiveType::Float), Some(PrimitiveType::Double)) => true, - ( - Some(PrimitiveType::Decimal { - precision: file_precision, - scale: file_scale, - }), - Some(PrimitiveType::Decimal { - precision: requested_precision, - scale: requested_scale, - }), - ) if requested_precision >= file_precision && file_scale == requested_scale => true, - // Uuid will be store as Fixed(16) in parquet file, so the read back type will be Fixed(16). 
- (Some(PrimitiveType::Fixed(16)), Some(PrimitiveType::Uuid)) => true, - _ => false, - } - } - - if field_ids.is_empty() { - return Ok(ProjectionMask::all()); - } - - if use_fallback { - // Position-based projection necessary because file lacks embedded field IDs - Self::get_arrow_projection_mask_fallback(field_ids, parquet_schema) - } else { - // Field-ID-based projection using embedded field IDs from Parquet metadata - - // Parquet's columnar format requires leaf-level (not top-level struct/list/map) projection - let mut leaf_field_ids = vec![]; - for field_id in field_ids { - let field = iceberg_schema_of_task.field_by_id(*field_id); - if let Some(field) = field { - Self::include_leaf_field_id(field, &mut leaf_field_ids); - } - } - - Self::get_arrow_projection_mask_with_field_ids( - &leaf_field_ids, - iceberg_schema_of_task, - parquet_schema, - arrow_schema, - type_promotion_is_valid, - ) - } - } - - /// Standard projection using embedded field IDs from Parquet metadata. - /// For iceberg-java compatibility with ParquetSchemaUtil.pruneColumns(). - fn get_arrow_projection_mask_with_field_ids( - leaf_field_ids: &[i32], - iceberg_schema_of_task: &Schema, - parquet_schema: &SchemaDescriptor, - arrow_schema: &ArrowSchemaRef, - type_promotion_is_valid: fn(Option<&PrimitiveType>, Option<&PrimitiveType>) -> bool, - ) -> Result { - let mut column_map = HashMap::new(); - let fields = arrow_schema.fields(); - - // Pre-project only the fields that have been selected, possibly avoiding converting - // some Arrow types that are not yet supported. 
- let mut projected_fields: HashMap = HashMap::new(); - let projected_arrow_schema = ArrowSchema::new_with_metadata( - fields.filter_leaves(|_, f| { - f.metadata() - .get(PARQUET_FIELD_ID_META_KEY) - .and_then(|field_id| i32::from_str(field_id).ok()) - .is_some_and(|field_id| { - projected_fields.insert((*f).clone(), field_id); - leaf_field_ids.contains(&field_id) - }) - }), - arrow_schema.metadata().clone(), - ); - let iceberg_schema = arrow_schema_to_schema(&projected_arrow_schema)?; - - fields.filter_leaves(|idx, field| { - let Some(field_id) = projected_fields.get(field).cloned() else { - return false; - }; - - let iceberg_field = iceberg_schema_of_task.field_by_id(field_id); - let parquet_iceberg_field = iceberg_schema.field_by_id(field_id); - - if iceberg_field.is_none() || parquet_iceberg_field.is_none() { - return false; - } - - if !type_promotion_is_valid( - parquet_iceberg_field - .unwrap() - .field_type - .as_primitive_type(), - iceberg_field.unwrap().field_type.as_primitive_type(), - ) { - return false; - } - - column_map.insert(field_id, idx); - true - }); - - // Schema evolution: New columns may not exist in old Parquet files. - // We only project existing columns; RecordBatchTransformer adds default/NULL values. - let mut indices = vec![]; - for field_id in leaf_field_ids { - if let Some(col_idx) = column_map.get(field_id) { - indices.push(*col_idx); - } - } - - if indices.is_empty() { - // Edge case: All requested columns are new (don't exist in file). - // Project all columns so RecordBatchTransformer has a batch to transform. - Ok(ProjectionMask::all()) - } else { - Ok(ProjectionMask::leaves(parquet_schema, indices)) - } - } - - /// Fallback projection for Parquet files without field IDs. - /// Uses position-based matching: field ID N → column position N-1. - /// Projects entire top-level columns (including nested content) for iceberg-java compatibility. 
- fn get_arrow_projection_mask_fallback( - field_ids: &[i32], - parquet_schema: &SchemaDescriptor, - ) -> Result { - // Position-based: field_id N → column N-1 (field IDs are 1-indexed) - let parquet_root_fields = parquet_schema.root_schema().get_fields(); - let mut root_indices = vec![]; - - for field_id in field_ids.iter() { - let parquet_pos = (*field_id - 1) as usize; - - if parquet_pos < parquet_root_fields.len() { - root_indices.push(parquet_pos); - } - // RecordBatchTransformer adds missing columns with NULL values - } - - if root_indices.is_empty() { - Ok(ProjectionMask::all()) - } else { - Ok(ProjectionMask::roots(parquet_schema, root_indices)) - } - } - - fn get_row_filter( - predicates: &BoundPredicate, - parquet_schema: &SchemaDescriptor, - iceberg_field_ids: &HashSet, - field_id_map: &HashMap, - ) -> Result { - // Collect Parquet column indices from field ids. - // If the field id is not found in Parquet schema, it will be ignored due to schema evolution. - let mut column_indices = iceberg_field_ids - .iter() - .filter_map(|field_id| field_id_map.get(field_id).cloned()) - .collect::>(); - column_indices.sort(); - - // The converter that converts `BoundPredicates` to `ArrowPredicates` - let mut converter = PredicateConverter { - parquet_schema, - column_map: field_id_map, - column_indices: &column_indices, - }; - - // After collecting required leaf column indices used in the predicate, - // creates the projection mask for the Arrow predicates. 
- let projection_mask = ProjectionMask::leaves(parquet_schema, column_indices.clone()); - let predicate_func = visit(&mut converter, predicates)?; - let arrow_predicate = ArrowPredicateFn::new(projection_mask, predicate_func); - Ok(RowFilter::new(vec![Box::new(arrow_predicate)])) - } - - fn get_selected_row_group_indices( - predicate: &BoundPredicate, - parquet_metadata: &Arc, - field_id_map: &HashMap, - snapshot_schema: &Schema, - ) -> Result> { - let row_groups_metadata = parquet_metadata.row_groups(); - let mut results = Vec::with_capacity(row_groups_metadata.len()); - - for (idx, row_group_metadata) in row_groups_metadata.iter().enumerate() { - if RowGroupMetricsEvaluator::eval( - predicate, - row_group_metadata, - field_id_map, - snapshot_schema, - )? { - results.push(idx); - } - } - - Ok(results) - } - - fn get_row_selection_for_filter_predicate( - predicate: &BoundPredicate, - parquet_metadata: &Arc, - selected_row_groups: &Option>, - field_id_map: &HashMap, - snapshot_schema: &Schema, - ) -> Result { - let Some(column_index) = parquet_metadata.column_index() else { - return Err(Error::new( - ErrorKind::Unexpected, - "Parquet file metadata does not contain a column index", - )); - }; - - let Some(offset_index) = parquet_metadata.offset_index() else { - return Err(Error::new( - ErrorKind::Unexpected, - "Parquet file metadata does not contain an offset index", - )); - }; - - // If all row groups were filtered out, return an empty RowSelection (select no rows) - if let Some(selected_row_groups) = selected_row_groups - && selected_row_groups.is_empty() - { - return Ok(RowSelection::from(Vec::new())); - } - - let mut selected_row_groups_idx = 0; - - let page_index = column_index - .iter() - .enumerate() - .zip(offset_index) - .zip(parquet_metadata.row_groups()); - - let mut results = Vec::new(); - for (((idx, column_index), offset_index), row_group_metadata) in page_index { - if let Some(selected_row_groups) = selected_row_groups { - // skip row groups that 
aren't present in selected_row_groups - if idx == selected_row_groups[selected_row_groups_idx] { - selected_row_groups_idx += 1; - } else { - continue; - } - } - - let selections_for_page = PageIndexEvaluator::eval( - predicate, - column_index, - offset_index, - row_group_metadata, - field_id_map, - snapshot_schema, - )?; - - results.push(selections_for_page); - - if let Some(selected_row_groups) = selected_row_groups - && selected_row_groups_idx == selected_row_groups.len() - { - break; - } - } - - Ok(results.into_iter().flatten().collect::>().into()) - } - - /// Filters row groups by byte range to support Iceberg's file splitting. - /// - /// Iceberg splits large files at row group boundaries, so we only read row groups - /// whose byte ranges overlap with [start, start+length). - fn filter_row_groups_by_byte_range( - parquet_metadata: &Arc, - start: u64, - length: u64, - ) -> Result> { - let row_groups = parquet_metadata.row_groups(); - let mut selected = Vec::new(); - let end = start + length; - - // Row groups are stored sequentially after the 4-byte magic header. - let mut current_byte_offset = 4u64; - - for (idx, row_group) in row_groups.iter().enumerate() { - let row_group_size = row_group.compressed_size() as u64; - let row_group_end = current_byte_offset + row_group_size; - - if current_byte_offset < end && start < row_group_end { - selected.push(idx); - } - - current_byte_offset = row_group_end; - } - - Ok(selected) - } -} - -/// Build the map of parquet field id to Parquet column index in the schema. -/// Returns None if the Parquet file doesn't have field IDs embedded (e.g., migrated tables). -fn build_field_id_map(parquet_schema: &SchemaDescriptor) -> Result>> { - let mut column_map = HashMap::new(); - - for (idx, field) in parquet_schema.columns().iter().enumerate() { - let field_type = field.self_type(); - match field_type { - ParquetType::PrimitiveType { basic_info, .. 
} => { - if !basic_info.has_id() { - return Ok(None); - } - column_map.insert(basic_info.id(), idx); - } - ParquetType::GroupType { .. } => { - return Err(Error::new( - ErrorKind::DataInvalid, - format!( - "Leaf column in schema should be primitive type but got {field_type:?}" - ), - )); - } - }; - } - - Ok(Some(column_map)) -} - -/// Build a fallback field ID map for Parquet files without embedded field IDs. -/// -/// Returns the number of primitive (leaf) columns in a Parquet type, recursing into groups. -fn leaf_count(ty: &parquet::schema::types::Type) -> usize { - if ty.is_primitive() { - 1 - } else { - ty.get_fields().iter().map(|f| leaf_count(f)).sum() - } -} - -/// Builds a mapping from fallback field IDs to leaf column indices for Parquet files -/// without embedded field IDs. Returns entries only for primitive top-level fields. -/// -/// Must use top-level field positions (not leaf column positions) to stay consistent -/// with `add_fallback_field_ids_to_arrow_schema`, which assigns ordinal IDs to -/// top-level Arrow fields. Using leaf positions instead would produce wrong indices -/// when nested types (struct/list/map) expand into multiple leaf columns. -/// -/// Mirrors iceberg-java's ParquetSchemaUtil.addFallbackIds() which iterates -/// fileSchema.getFields() assigning ordinal IDs to top-level fields. -fn build_fallback_field_id_map(parquet_schema: &SchemaDescriptor) -> HashMap { - let mut column_map = HashMap::new(); - let mut leaf_idx = 0; - - for (top_pos, field) in parquet_schema.root_schema().get_fields().iter().enumerate() { - let field_id = (top_pos + 1) as i32; - if field.is_primitive() { - column_map.insert(field_id, leaf_idx); - } - leaf_idx += leaf_count(field); - } - - column_map -} - -/// Apply name mapping to Arrow schema for Parquet files lacking field IDs. -/// -/// Assigns Iceberg field IDs based on column names using the name mapping, -/// enabling correct projection on migrated files (e.g., from Hive/Spark via add_files). 
-/// -/// Per Iceberg spec Column Projection rule #2: -/// "Use schema.name-mapping.default metadata to map field id to columns without field id" -/// https://iceberg.apache.org/spec/#column-projection -/// -/// Corresponds to Java's ParquetSchemaUtil.applyNameMapping() and ApplyNameMapping visitor. -/// The key difference is Java operates on Parquet MessageType, while we operate on Arrow Schema. -/// -/// # Arguments -/// * `arrow_schema` - Arrow schema from Parquet file (without field IDs) -/// * `name_mapping` - Name mapping from table metadata (TableProperties.DEFAULT_NAME_MAPPING) -/// -/// # Returns -/// Arrow schema with field IDs assigned based on name mapping -fn apply_name_mapping_to_arrow_schema( - arrow_schema: ArrowSchemaRef, - name_mapping: &NameMapping, -) -> Result> { - debug_assert!( - arrow_schema - .fields() - .iter() - .next() - .is_none_or(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()), - "Schema already has field IDs - name mapping should not be applied" - ); - - use arrow_schema::Field; - - let fields_with_mapped_ids: Vec<_> = arrow_schema - .fields() - .iter() - .map(|field| { - // Look up this column name in name mapping to get the Iceberg field ID. - // Corresponds to Java's ApplyNameMapping visitor which calls - // nameMapping.find(currentPath()) and returns field.withId() if found. - // - // If the field isn't in the mapping, leave it WITHOUT assigning an ID - // (matching Java's behavior of returning the field unchanged). - // Later, during projection, fields without IDs are filtered out. 
- let mapped_field_opt = name_mapping - .fields() - .iter() - .find(|f| f.names().contains(&field.name().to_string())); - - let mut metadata = field.metadata().clone(); - - if let Some(mapped_field) = mapped_field_opt - && let Some(field_id) = mapped_field.field_id() - { - // Field found in mapping with a field_id → assign it - metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), field_id.to_string()); - } - // If field_id is None, leave the field without an ID (will be filtered by projection) - - Field::new(field.name(), field.data_type().clone(), field.is_nullable()) - .with_metadata(metadata) - }) - .collect(); - - Ok(Arc::new(ArrowSchema::new_with_metadata( - fields_with_mapped_ids, - arrow_schema.metadata().clone(), - ))) -} - -/// Add position-based fallback field IDs to Arrow schema for Parquet files lacking them. -/// Enables projection on migrated files (e.g., from Hive/Spark). -/// -/// Why at schema level (not per-batch): Efficiency - avoids repeated schema modification. -/// Why only top-level: Nested projection uses leaf column indices, not parent struct IDs. -/// Why 1-indexed: Compatibility with iceberg-java's ParquetSchemaUtil.addFallbackIds(). 
-fn add_fallback_field_ids_to_arrow_schema(arrow_schema: &ArrowSchemaRef) -> Arc { - debug_assert!( - arrow_schema - .fields() - .iter() - .next() - .is_none_or(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()), - "Schema already has field IDs" - ); - - use arrow_schema::Field; - - let fields_with_fallback_ids: Vec<_> = arrow_schema - .fields() - .iter() - .enumerate() - .map(|(pos, field)| { - let mut metadata = field.metadata().clone(); - let field_id = (pos + 1) as i32; // 1-indexed for Java compatibility - metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), field_id.to_string()); - - Field::new(field.name(), field.data_type().clone(), field.is_nullable()) - .with_metadata(metadata) - }) - .collect(); - - Arc::new(ArrowSchema::new_with_metadata( - fields_with_fallback_ids, - arrow_schema.metadata().clone(), - )) -} - -/// A visitor to collect field ids from bound predicates. -struct CollectFieldIdVisitor { - field_ids: HashSet, -} - -impl CollectFieldIdVisitor { - fn field_ids(self) -> HashSet { - self.field_ids - } -} - -impl BoundPredicateVisitor for CollectFieldIdVisitor { - type T = (); - - fn always_true(&mut self) -> Result<()> { - Ok(()) - } - - fn always_false(&mut self) -> Result<()> { - Ok(()) - } - - fn and(&mut self, _lhs: (), _rhs: ()) -> Result<()> { - Ok(()) - } - - fn or(&mut self, _lhs: (), _rhs: ()) -> Result<()> { - Ok(()) - } - - fn not(&mut self, _inner: ()) -> Result<()> { - Ok(()) - } - - fn is_null(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_null(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn is_nan(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_nan(&mut self, reference: &BoundReference, _predicate: 
&BoundPredicate) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn less_than( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn less_than_or_eq( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn greater_than( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn greater_than_or_eq( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn eq( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_eq( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn starts_with( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_starts_with( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn r#in( - &mut self, - reference: &BoundReference, - _literals: &FnvHashSet, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_in( - &mut self, - reference: &BoundReference, - _literals: &FnvHashSet, - _predicate: &BoundPredicate, - ) -> Result<()> { - 
self.field_ids.insert(reference.field().id); - Ok(()) - } -} - -/// A visitor to convert Iceberg bound predicates to Arrow predicates. -struct PredicateConverter<'a> { - /// The Parquet schema descriptor. - pub parquet_schema: &'a SchemaDescriptor, - /// The map between field id and leaf column index in Parquet schema. - pub column_map: &'a HashMap, - /// The required column indices in Parquet schema for the predicates. - pub column_indices: &'a Vec, -} - -impl PredicateConverter<'_> { - /// When visiting a bound reference, we return index of the leaf column in the - /// required column indices which is used to project the column in the record batch. - /// Return None if the field id is not found in the column map, which is possible - /// due to schema evolution. - fn bound_reference(&mut self, reference: &BoundReference) -> Result> { - // The leaf column's index in Parquet schema. - if let Some(column_idx) = self.column_map.get(&reference.field().id) { - if self.parquet_schema.get_column_root(*column_idx).is_group() { - return Err(Error::new( - ErrorKind::DataInvalid, - format!( - "Leaf column `{}` in predicates isn't a root column in Parquet schema.", - reference.field().name - ), - )); - } - - // The leaf column's index in the required column indices. - let index = self - .column_indices - .iter() - .position(|&idx| idx == *column_idx) - .ok_or(Error::new( - ErrorKind::DataInvalid, - format!( - "Leaf column `{}` in predicates cannot be found in the required column indices.", - reference.field().name - ), - ))?; - - Ok(Some(index)) - } else { - Ok(None) - } - } - - /// Build an Arrow predicate that always returns true. - fn build_always_true(&self) -> Result> { - Ok(Box::new(|batch| { - Ok(BooleanArray::from(vec![true; batch.num_rows()])) - })) - } - - /// Build an Arrow predicate that always returns false. 
- fn build_always_false(&self) -> Result> { - Ok(Box::new(|batch| { - Ok(BooleanArray::from(vec![false; batch.num_rows()])) - })) - } -} - -/// Gets the leaf column from the record batch for the required column index. Only -/// supports top-level columns for now. -fn project_column( - batch: &RecordBatch, - column_idx: usize, -) -> std::result::Result { - let column = batch.column(column_idx); - - match column.data_type() { - DataType::Struct(_) => Err(ArrowError::SchemaError( - "Does not support struct column yet.".to_string(), - )), - _ => Ok(column.clone()), - } -} - -fn compute_is_nan(array: &ArrayRef) -> std::result::Result { - // Compute NaN over the contiguous values slice, then fold the null bitmap - // in with a single bitwise AND so that null slots become false. - let (is_nan, nulls) = match array.data_type() { - DataType::Float32 => { - let arr = array.as_primitive::(); - ( - BooleanBuffer::from_iter(arr.values().iter().map(|v| v.is_nan())), - arr.nulls(), - ) - } - DataType::Float64 => { - let arr = array.as_primitive::(); - ( - BooleanBuffer::from_iter(arr.values().iter().map(|v| v.is_nan())), - arr.nulls(), - ) - } - _ => unreachable!("is_nan is only valid for float types"), - }; - - let values = match nulls { - Some(nulls) => &is_nan & nulls.inner(), - None => is_nan, - }; - - Ok(BooleanArray::new(values, None)) -} - -type PredicateResult = - dyn FnMut(RecordBatch) -> std::result::Result + Send + 'static; - -impl BoundPredicateVisitor for PredicateConverter<'_> { - type T = Box; - - fn always_true(&mut self) -> Result> { - self.build_always_true() - } - - fn always_false(&mut self) -> Result> { - self.build_always_false() - } - - fn and( - &mut self, - mut lhs: Box, - mut rhs: Box, - ) -> Result> { - Ok(Box::new(move |batch| { - let left = lhs(batch.clone())?; - let right = rhs(batch)?; - and_kleene(&left, &right) - })) - } - - fn or( - &mut self, - mut lhs: Box, - mut rhs: Box, - ) -> Result> { - Ok(Box::new(move |batch| { - let left = 
lhs(batch.clone())?; - let right = rhs(batch)?; - or_kleene(&left, &right) - })) - } - - fn not(&mut self, mut inner: Box) -> Result> { - Ok(Box::new(move |batch| { - let pred_ret = inner(batch)?; - not(&pred_ret) - })) - } - - fn is_null( - &mut self, - reference: &BoundReference, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - Ok(Box::new(move |batch| { - let column = project_column(&batch, idx)?; - is_null(&column) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn not_null( - &mut self, - reference: &BoundReference, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - Ok(Box::new(move |batch| { - let column = project_column(&batch, idx)?; - is_not_null(&column) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn is_nan( - &mut self, - reference: &BoundReference, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - Ok(Box::new(move |batch| { - let column = project_column(&batch, idx)?; - compute_is_nan(&column) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn not_nan( - &mut self, - reference: &BoundReference, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - Ok(Box::new(move |batch| { - let column = project_column(&batch, idx)?; - let is_nan = compute_is_nan(&column)?; - not(&is_nan) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn less_than( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? 
{ - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - lt(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn less_than_or_eq( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - lt_eq(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn greater_than( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - gt(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn greater_than_or_eq( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - gt_eq(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn eq( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? 
{ - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - eq(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn not_eq( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - neq(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn starts_with( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - starts_with(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn not_starts_with( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - // update here if arrow ever adds a native not_starts_with - not(&starts_with(&left, literal.as_ref())?) - })) - } else { - // A missing column, treating it as null. 
- self.build_always_true() - } - } - - fn r#in( - &mut self, - reference: &BoundReference, - literals: &FnvHashSet, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literals: Vec<_> = literals - .iter() - .map(|lit| get_arrow_datum(lit).unwrap()) - .collect(); - - Ok(Box::new(move |batch| { - // update this if arrow ever adds a native is_in kernel - let left = project_column(&batch, idx)?; - - let mut acc = BooleanArray::from(vec![false; batch.num_rows()]); - for literal in &literals { - let literal = try_cast_literal(literal, left.data_type())?; - acc = or(&acc, &eq(&left, literal.as_ref())?)? - } - - Ok(acc) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn not_in( - &mut self, - reference: &BoundReference, - literals: &FnvHashSet, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literals: Vec<_> = literals - .iter() - .map(|lit| get_arrow_datum(lit).unwrap()) - .collect(); - - Ok(Box::new(move |batch| { - // update this if arrow ever adds a native not_in kernel - let left = project_column(&batch, idx)?; - let mut acc = BooleanArray::from(vec![true; batch.num_rows()]); - for literal in &literals { - let literal = try_cast_literal(literal, left.data_type())?; - acc = and(&acc, &neq(&left, literal.as_ref())?)? - } - - Ok(acc) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } -} - -/// ArrowFileReader is a wrapper around a FileRead that impls parquets AsyncFileReader. -pub struct ArrowFileReader { - meta: FileMetadata, - parquet_read_options: ParquetReadOptions, - r: Box, -} - -impl ArrowFileReader { - /// Create a new ArrowFileReader - pub fn new(meta: FileMetadata, r: Box) -> Self { - Self { - meta, - parquet_read_options: ParquetReadOptions::builder().build(), - r, - } - } - - /// Configure all Parquet read options. 
- pub(crate) fn with_parquet_read_options(mut self, options: ParquetReadOptions) -> Self { - self.parquet_read_options = options; - self - } -} - -impl AsyncFileReader for ArrowFileReader { - fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, parquet::errors::Result> { - Box::pin( - self.r - .read(range.start..range.end) - .map_err(|err| parquet::errors::ParquetError::External(Box::new(err))), - ) - } - - /// Override the default `get_byte_ranges` which calls `get_bytes` sequentially. - /// The parquet reader calls this to fetch column chunks for a row group, so - /// without this override each column chunk is a serial round-trip to object storage. - /// Adapted from object_store's `coalesce_ranges` in `util.rs`. - fn get_byte_ranges( - &mut self, - ranges: Vec>, - ) -> BoxFuture<'_, parquet::errors::Result>> { - let coalesce_bytes = self.parquet_read_options.range_coalesce_bytes(); - let concurrency = self.parquet_read_options.range_fetch_concurrency().max(1); - - async move { - // Merge nearby ranges to reduce the number of object store requests. - let fetch_ranges = merge_ranges(&ranges, coalesce_bytes); - let r = &self.r; - - // Fetch merged ranges concurrently. - let fetched: Vec = futures::stream::iter(fetch_ranges.iter().cloned()) - .map(|range| async move { - r.read(range) - .await - .map_err(|e| parquet::errors::ParquetError::External(Box::new(e))) - }) - .buffered(concurrency) - .try_collect() - .await?; - - // Slice the fetched data back into the originally requested ranges. 
- Ok(ranges - .iter() - .map(|range| { - let idx = fetch_ranges.partition_point(|v| v.start <= range.start) - 1; - let fetch_range = &fetch_ranges[idx]; - let fetch_bytes = &fetched[idx]; - let start = (range.start - fetch_range.start) as usize; - let end = (range.end - fetch_range.start) as usize; - fetch_bytes.slice(start..end.min(fetch_bytes.len())) - }) - .collect()) - } - .boxed() - } - - // TODO: currently we don't respect `ArrowReaderOptions` cause it don't expose any method to access the option field - // we will fix it after `v55.1.0` is released in https://github.com/apache/arrow-rs/issues/7393 - fn get_metadata( - &mut self, - _options: Option<&'_ ArrowReaderOptions>, - ) -> BoxFuture<'_, parquet::errors::Result>> { - async move { - let reader = ParquetMetaDataReader::new() - .with_prefetch_hint(self.parquet_read_options.metadata_size_hint()) - // Set the page policy first because it updates both column and offset policies. - .with_page_index_policy(PageIndexPolicy::from( - self.parquet_read_options.preload_page_index(), - )) - .with_column_index_policy(PageIndexPolicy::from( - self.parquet_read_options.preload_column_index(), - )) - .with_offset_index_policy(PageIndexPolicy::from( - self.parquet_read_options.preload_offset_index(), - )); - let size = self.meta.size; - let meta = reader.load_and_finish(self, size).await?; - - Ok(Arc::new(meta)) - } - .boxed() - } -} - -/// Merge overlapping or nearby byte ranges, combining ranges with gaps <= `coalesce` bytes. -/// Adapted from object_store's `merge_ranges` in `util.rs`. 
-fn merge_ranges(ranges: &[Range], coalesce: u64) -> Vec> { - if ranges.is_empty() { - return vec![]; - } - - let mut ranges = ranges.to_vec(); - ranges.sort_unstable_by_key(|r| r.start); - - let mut merged = Vec::with_capacity(ranges.len()); - let mut start_idx = 0; - let mut end_idx = 1; - - while start_idx != ranges.len() { - let mut range_end = ranges[start_idx].end; - - while end_idx != ranges.len() - && ranges[end_idx] - .start - .checked_sub(range_end) - .map(|delta| delta <= coalesce) - .unwrap_or(true) - { - range_end = range_end.max(ranges[end_idx].end); - end_idx += 1; - } - - merged.push(ranges[start_idx].start..range_end); - start_idx = end_idx; - end_idx += 1; - } - - merged -} - -/// The Arrow type of an array that the Parquet reader reads may not match the exact Arrow type -/// that Iceberg uses for literals - but they are effectively the same logical type, -/// i.e. LargeUtf8 and Utf8 or Utf8View and Utf8 or Utf8View and LargeUtf8. -/// -/// The Arrow compute kernels that we use must match the type exactly, so first cast the literal -/// into the type of the batch we read from Parquet before sending it to the compute kernel. 
-fn try_cast_literal( - literal: &Arc, - column_type: &DataType, -) -> std::result::Result, ArrowError> { - let literal_array = literal.get().0; - - // No cast required - if literal_array.data_type() == column_type { - return Ok(Arc::clone(literal)); - } - - let literal_array = cast(literal_array, column_type)?; - Ok(Arc::new(Scalar::new(literal_array))) -} - -#[cfg(test)] -mod tests { - use std::collections::{HashMap, HashSet}; - use std::fs::File; - use std::ops::Range; - use std::sync::Arc; - - use arrow_array::cast::AsArray; - use arrow_array::{Array, ArrayRef, BooleanArray, LargeStringArray, RecordBatch, StringArray}; - use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; - use futures::TryStreamExt; - use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; - use parquet::arrow::{ArrowWriter, ProjectionMask}; - use parquet::basic::Compression; - use parquet::file::metadata::{ColumnChunkMetaData, RowGroupMetaData}; - use parquet::file::properties::WriterProperties; - use parquet::schema::parser::parse_message_type; - use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor}; - use roaring::RoaringTreemap; - use tempfile::TempDir; - - use crate::ErrorKind; - use crate::arrow::reader::{CollectFieldIdVisitor, PARQUET_FIELD_ID_META_KEY}; - use crate::arrow::{ArrowReader, ArrowReaderBuilder}; - use crate::delete_vector::DeleteVector; - use crate::expr::visitors::bound_predicate_visitor::visit; - use crate::expr::{Bind, Predicate, Reference}; - use crate::io::FileIO; - use crate::scan::{FileScanTask, FileScanTaskDeleteFile, FileScanTaskStream}; - use crate::spec::{ - DataContentType, DataFileFormat, Datum, NestedField, PrimitiveType, Schema, SchemaRef, Type, - }; - - fn table_schema_simple() -> SchemaRef { - Arc::new( - Schema::builder() - .with_schema_id(1) - .with_identifier_field_ids(vec![2]) - .with_fields(vec![ - NestedField::optional(1, "foo", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "bar", 
Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(3, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), - NestedField::optional(4, "qux", Type::Primitive(PrimitiveType::Float)).into(), - ]) - .build() - .unwrap(), - ) - } - - #[test] - fn test_collect_field_id() { - let schema = table_schema_simple(); - let expr = Reference::new("qux").is_null(); - let bound_expr = expr.bind(schema, true).unwrap(); - - let mut visitor = CollectFieldIdVisitor { - field_ids: HashSet::default(), - }; - visit(&mut visitor, &bound_expr).unwrap(); - - let mut expected = HashSet::default(); - expected.insert(4_i32); - - assert_eq!(visitor.field_ids, expected); - } - - #[test] - fn test_collect_field_id_with_and() { - let schema = table_schema_simple(); - let expr = Reference::new("qux") - .is_null() - .and(Reference::new("baz").is_null()); - let bound_expr = expr.bind(schema, true).unwrap(); - - let mut visitor = CollectFieldIdVisitor { - field_ids: HashSet::default(), - }; - visit(&mut visitor, &bound_expr).unwrap(); - - let mut expected = HashSet::default(); - expected.insert(4_i32); - expected.insert(3); - - assert_eq!(visitor.field_ids, expected); - } - - #[test] - fn test_collect_field_id_with_or() { - let schema = table_schema_simple(); - let expr = Reference::new("qux") - .is_null() - .or(Reference::new("baz").is_null()); - let bound_expr = expr.bind(schema, true).unwrap(); - - let mut visitor = CollectFieldIdVisitor { - field_ids: HashSet::default(), - }; - visit(&mut visitor, &bound_expr).unwrap(); - - let mut expected = HashSet::default(); - expected.insert(4_i32); - expected.insert(3); - - assert_eq!(visitor.field_ids, expected); - } - - #[test] - fn test_arrow_projection_mask() { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_identifier_field_ids(vec![1]) - .with_fields(vec![ - NestedField::required(1, "c1", Type::Primitive(PrimitiveType::String)).into(), - NestedField::optional(2, "c2", 
Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional( - 3, - "c3", - Type::Primitive(PrimitiveType::Decimal { - precision: 38, - scale: 3, - }), - ) - .into(), - ]) - .build() - .unwrap(), - ); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("c1", DataType::Utf8, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - // Type not supported - Field::new("c2", DataType::Duration(TimeUnit::Microsecond), true).with_metadata( - HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())]), - ), - // Precision is beyond the supported range - Field::new("c3", DataType::Decimal128(39, 3), true).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "3".to_string(), - )])), - ])); - - let message_type = " -message schema { - required binary c1 (STRING) = 1; - optional int32 c2 (INTEGER(8,true)) = 2; - optional fixed_len_byte_array(17) c3 (DECIMAL(39,3)) = 3; -} - "; - let parquet_type = parse_message_type(message_type).expect("should parse schema"); - let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_type)); - - // Try projecting the fields c2 and c3 with the unsupported data types - let err = ArrowReader::get_arrow_projection_mask( - &[1, 2, 3], - &schema, - &parquet_schema, - &arrow_schema, - false, - ) - .unwrap_err(); - - assert_eq!(err.kind(), ErrorKind::DataInvalid); - assert_eq!( - err.to_string(), - "DataInvalid => Unsupported Arrow data type: Duration(µs)".to_string() - ); - - // Omitting field c2, we still get an error due to c3 being selected - let err = ArrowReader::get_arrow_projection_mask( - &[1, 3], - &schema, - &parquet_schema, - &arrow_schema, - false, - ) - .unwrap_err(); - - assert_eq!(err.kind(), ErrorKind::DataInvalid); - assert_eq!( - err.to_string(), - "DataInvalid => Failed to create decimal type, source: DataInvalid => Decimals with precision larger than 38 are not supported: 39".to_string() - ); - - // Finally avoid 
selecting fields with unsupported data types - let mask = ArrowReader::get_arrow_projection_mask( - &[1], - &schema, - &parquet_schema, - &arrow_schema, - false, - ) - .expect("Some ProjectionMask"); - assert_eq!(mask, ProjectionMask::leaves(&parquet_schema, vec![0])); - } - - #[tokio::test] - async fn test_kleene_logic_or_behaviour() { - // a IS NULL OR a = 'foo' - let predicate = Reference::new("a") - .is_null() - .or(Reference::new("a").equal_to(Datum::string("foo"))); - - // Table data: [NULL, "foo", "bar"] - let data_for_col_a = vec![None, Some("foo".to_string()), Some("bar".to_string())]; - - // Expected: [NULL, "foo"]. - let expected = vec![None, Some("foo".to_string())]; - - let (file_io, schema, table_location, _temp_dir) = - setup_kleene_logic(data_for_col_a, DataType::Utf8); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let result_data = test_perform_read(predicate, schema, table_location, reader).await; - - assert_eq!(result_data, expected); - } - - #[tokio::test] - async fn test_kleene_logic_and_behaviour() { - // a IS NOT NULL AND a != 'foo' - let predicate = Reference::new("a") - .is_not_null() - .and(Reference::new("a").not_equal_to(Datum::string("foo"))); - - // Table data: [NULL, "foo", "bar"] - let data_for_col_a = vec![None, Some("foo".to_string()), Some("bar".to_string())]; - - // Expected: ["bar"]. 
- let expected = vec![Some("bar".to_string())]; - - let (file_io, schema, table_location, _temp_dir) = - setup_kleene_logic(data_for_col_a, DataType::Utf8); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let result_data = test_perform_read(predicate, schema, table_location, reader).await; - - assert_eq!(result_data, expected); - } - - #[tokio::test] - async fn test_predicate_cast_literal() { - let predicates = vec![ - // a == 'foo' - (Reference::new("a").equal_to(Datum::string("foo")), vec![ - Some("foo".to_string()), - ]), - // a != 'foo' - ( - Reference::new("a").not_equal_to(Datum::string("foo")), - vec![Some("bar".to_string())], - ), - // STARTS_WITH(a, 'foo') - (Reference::new("a").starts_with(Datum::string("f")), vec![ - Some("foo".to_string()), - ]), - // NOT STARTS_WITH(a, 'foo') - ( - Reference::new("a").not_starts_with(Datum::string("f")), - vec![Some("bar".to_string())], - ), - // a < 'foo' - (Reference::new("a").less_than(Datum::string("foo")), vec![ - Some("bar".to_string()), - ]), - // a <= 'foo' - ( - Reference::new("a").less_than_or_equal_to(Datum::string("foo")), - vec![Some("foo".to_string()), Some("bar".to_string())], - ), - // a > 'foo' - ( - Reference::new("a").greater_than(Datum::string("bar")), - vec![Some("foo".to_string())], - ), - // a >= 'foo' - ( - Reference::new("a").greater_than_or_equal_to(Datum::string("foo")), - vec![Some("foo".to_string())], - ), - // a IN ('foo', 'bar') - ( - Reference::new("a").is_in([Datum::string("foo"), Datum::string("baz")]), - vec![Some("foo".to_string())], - ), - // a NOT IN ('foo', 'bar') - ( - Reference::new("a").is_not_in([Datum::string("foo"), Datum::string("baz")]), - vec![Some("bar".to_string())], - ), - ]; - - // Table data: ["foo", "bar"] - let data_for_col_a = vec![Some("foo".to_string()), Some("bar".to_string())]; - - let (file_io, schema, table_location, _temp_dir) = - setup_kleene_logic(data_for_col_a, DataType::LargeUtf8); - let reader = ArrowReaderBuilder::new(file_io).build(); - 
- for (predicate, expected) in predicates { - println!("testing predicate {predicate}"); - let result_data = test_perform_read( - predicate.clone(), - schema.clone(), - table_location.clone(), - reader.clone(), - ) - .await; - - assert_eq!(result_data, expected, "predicate={predicate}"); - } - } - - async fn test_perform_read( - predicate: Predicate, - schema: SchemaRef, - table_location: String, - reader: ArrowReader, - ) -> Vec> { - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1], - predicate: Some(predicate.bind(schema, true).unwrap()), - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - result[0].columns()[0] - .as_string_opt::() - .unwrap() - .iter() - .map(|v| v.map(ToOwned::to_owned)) - .collect::>() - } - - fn setup_kleene_logic( - data_for_col_a: Vec>, - col_a_type: DataType, - ) -> (FileIO, SchemaRef, String, TempDir) { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional(1, "a", Type::Primitive(PrimitiveType::String)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("a", col_a_type.clone(), true).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - - let file_io = FileIO::new_with_fs(); - - let col = match col_a_type { - DataType::Utf8 => 
Arc::new(StringArray::from(data_for_col_a)) as ArrayRef, - DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data_for_col_a)) as ArrayRef, - _ => panic!("unexpected col_a_type"), - }; - - let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![col]).unwrap(); - - // Write the Parquet files - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = - ArrowWriter::try_new(file, to_write.schema(), Some(props.clone())).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - - // writer must be closed to write footer - writer.close().unwrap(); - - (file_io, schema, table_location, tmp_dir) - } - - #[test] - fn test_build_deletes_row_selection() { - let schema_descr = get_test_schema_descr(); - - let mut columns = vec![]; - for ptr in schema_descr.columns() { - let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap(); - columns.push(column); - } - - let row_groups_metadata = vec![ - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 1000, 0), - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 1), - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 2), - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 1000, 3), - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 4), - ]; - - let selected_row_groups = Some(vec![1, 3]); - - /* cases to cover: - * {skip|select} {first|intermediate|last} {one row|multiple rows} in - {first|intermediate|last} {skipped|selected} row group - * row group selection disabled - */ - - let positional_deletes = RoaringTreemap::from_iter(&[ - 1, // in skipped rg 0, should be ignored - 3, // run of three consecutive items in skipped rg0 - 4, 5, 998, // two consecutive items at end of skipped rg0 - 999, 1000, // solitary row at start of selected rg1 (1, 9) - 1010, // run of 3 rows in selected rg1 - 
1011, 1012, // (3, 485) - 1498, // run of two items at end of selected rg1 - 1499, 1500, // run of two items at start of skipped rg2 - 1501, 1600, // should ignore, in skipped rg2 - 1999, // single row at end of skipped rg2 - 2000, // run of two items at start of selected rg3 - 2001, // (4, 98) - 2100, // single row in selected row group 3 (1, 99) - 2200, // run of 3 consecutive rows in selected row group 3 - 2201, 2202, // (3, 796) - 2999, // single item at end of selected rg3 (1) - 3000, // single item at start of skipped rg4 - ]); - - let positional_deletes = DeleteVector::new(positional_deletes); - - // using selected row groups 1 and 3 - let result = ArrowReader::build_deletes_row_selection( - &row_groups_metadata, - &selected_row_groups, - &positional_deletes, - ) - .unwrap(); - - let expected = RowSelection::from(vec![ - RowSelector::skip(1), - RowSelector::select(9), - RowSelector::skip(3), - RowSelector::select(485), - RowSelector::skip(4), - RowSelector::select(98), - RowSelector::skip(1), - RowSelector::select(99), - RowSelector::skip(3), - RowSelector::select(796), - RowSelector::skip(1), - ]); - - assert_eq!(result, expected); - - // selecting all row groups - let result = ArrowReader::build_deletes_row_selection( - &row_groups_metadata, - &None, - &positional_deletes, - ) - .unwrap(); - - let expected = RowSelection::from(vec![ - RowSelector::select(1), - RowSelector::skip(1), - RowSelector::select(1), - RowSelector::skip(3), - RowSelector::select(992), - RowSelector::skip(3), - RowSelector::select(9), - RowSelector::skip(3), - RowSelector::select(485), - RowSelector::skip(4), - RowSelector::select(98), - RowSelector::skip(1), - RowSelector::select(398), - RowSelector::skip(3), - RowSelector::select(98), - RowSelector::skip(1), - RowSelector::select(99), - RowSelector::skip(3), - RowSelector::select(796), - RowSelector::skip(2), - RowSelector::select(499), - ]); - - assert_eq!(result, expected); - } - - fn build_test_row_group_meta( - schema_descr: 
SchemaDescPtr, - columns: Vec, - num_rows: i64, - ordinal: i16, - ) -> RowGroupMetaData { - RowGroupMetaData::builder(schema_descr.clone()) - .set_num_rows(num_rows) - .set_total_byte_size(2000) - .set_column_metadata(columns) - .set_ordinal(ordinal) - .build() - .unwrap() - } - - fn get_test_schema_descr() -> SchemaDescPtr { - use parquet::schema::types::Type as SchemaType; - - let schema = SchemaType::group_type_builder("schema") - .with_fields(vec![ - Arc::new( - SchemaType::primitive_type_builder("a", parquet::basic::Type::INT32) - .build() - .unwrap(), - ), - Arc::new( - SchemaType::primitive_type_builder("b", parquet::basic::Type::INT32) - .build() - .unwrap(), - ), - ]) - .build() - .unwrap(); - - Arc::new(SchemaDescriptor::new(Arc::new(schema))) - } - - /// Verifies that file splits respect byte ranges and only read specific row groups. - #[tokio::test] - async fn test_file_splits_respect_byte_ranges() { - use arrow_array::Int32Array; - use parquet::file::reader::{FileReader, SerializedFileReader}; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_path = format!("{table_location}/multi_row_group.parquet"); - - // Force each batch into its own row group for testing byte range filtering. 
- let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( - (0..100).collect::>(), - ))]) - .unwrap(); - let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( - (100..200).collect::>(), - ))]) - .unwrap(); - let batch3 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( - (200..300).collect::>(), - ))]) - .unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(100)) - .build(); - - let file = File::create(&file_path).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); - writer.write(&batch1).expect("Writing batch 1"); - writer.write(&batch2).expect("Writing batch 2"); - writer.write(&batch3).expect("Writing batch 3"); - writer.close().unwrap(); - - // Read the file metadata to get row group byte positions - let file = File::open(&file_path).unwrap(); - let reader = SerializedFileReader::new(file).unwrap(); - let metadata = reader.metadata(); - - println!("File has {} row groups", metadata.num_row_groups()); - assert_eq!(metadata.num_row_groups(), 3, "Expected 3 row groups"); - - // Get byte positions for each row group - let row_group_0 = metadata.row_group(0); - let row_group_1 = metadata.row_group(1); - let row_group_2 = metadata.row_group(2); - - let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1" - let rg1_start = rg0_start + row_group_0.compressed_size() as u64; - let rg2_start = rg1_start + row_group_1.compressed_size() as u64; - let file_end = rg2_start + row_group_2.compressed_size() as u64; - - println!( - "Row group 0: {} rows, starts at byte {}, {} bytes compressed", - row_group_0.num_rows(), - rg0_start, - row_group_0.compressed_size() - ); - println!( - "Row group 1: {} rows, starts at byte {}, {} bytes compressed", - row_group_1.num_rows(), - rg1_start, - row_group_1.compressed_size() - ); - println!( - "Row group 2: {} 
rows, starts at byte {}, {} bytes compressed", - row_group_2.num_rows(), - rg2_start, - row_group_2.compressed_size() - ); - - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - // Task 1: read only the first row group - let task1 = FileScanTask { - file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), - start: rg0_start, - length: row_group_0.compressed_size() as u64, - record_count: Some(100), - data_file_path: file_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - // Task 2: read the second and third row groups - let task2 = FileScanTask { - file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), - start: rg1_start, - length: file_end - rg1_start, - record_count: Some(200), - data_file_path: file_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks1 = Box::pin(futures::stream::iter(vec![Ok(task1)])) as FileScanTaskStream; - let result1 = reader - .clone() - .read(tasks1) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - let total_rows_task1: usize = result1.iter().map(|b| b.num_rows()).sum(); - println!( - "Task 1 (bytes {}-{}) returned {} rows", - rg0_start, - rg0_start + row_group_0.compressed_size() as u64, - total_rows_task1 - ); - - let tasks2 = Box::pin(futures::stream::iter(vec![Ok(task2)])) as FileScanTaskStream; - let result2 = reader - .read(tasks2) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - let total_rows_task2: usize = result2.iter().map(|b| b.num_rows()).sum(); - println!("Task 2 (bytes {rg1_start}-{file_end}) returned {total_rows_task2} rows"); - - 
assert_eq!( - total_rows_task1, 100, - "Task 1 should read only the first row group (100 rows), but got {total_rows_task1} rows" - ); - - assert_eq!( - total_rows_task2, 200, - "Task 2 should read only the second+third row groups (200 rows), but got {total_rows_task2} rows" - ); - - // Verify the actual data values are correct (not just the row count) - if total_rows_task1 > 0 { - let first_batch = &result1[0]; - let id_col = first_batch - .column(0) - .as_primitive::(); - let first_val = id_col.value(0); - let last_val = id_col.value(id_col.len() - 1); - println!("Task 1 data range: {first_val} to {last_val}"); - - assert_eq!(first_val, 0, "Task 1 should start with id=0"); - assert_eq!(last_val, 99, "Task 1 should end with id=99"); - } - - if total_rows_task2 > 0 { - let first_batch = &result2[0]; - let id_col = first_batch - .column(0) - .as_primitive::(); - let first_val = id_col.value(0); - println!("Task 2 first value: {first_val}"); - - assert_eq!(first_val, 100, "Task 2 should start with id=100, not id=0"); - } - } - - /// Test schema evolution: reading old Parquet file (with only column 'a') - /// using a newer table schema (with columns 'a' and 'b'). - /// This tests that: - /// 1. get_arrow_projection_mask allows missing columns - /// 2. 
RecordBatchTransformer adds missing column 'b' with NULL values - #[tokio::test] - async fn test_schema_evolution_add_column() { - use arrow_array::{Array, Int32Array}; - - // New table schema: columns 'a' and 'b' (b was added later, file only has 'a') - let new_schema = Arc::new( - Schema::builder() - .with_schema_id(2) - .with_fields(vec![ - NestedField::required(1, "a", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(2, "b", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - // Create Arrow schema for old Parquet file (only has column 'a') - let arrow_schema_old = Arc::new(ArrowSchema::new(vec![ - Field::new("a", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - // Write old Parquet file with only column 'a' - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let data_a = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef; - let to_write = RecordBatch::try_new(arrow_schema_old.clone(), vec![data_a]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - let file = File::create(format!("{table_location}/old_file.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - // Read the old Parquet file using the NEW schema (with column 'b') - let reader = ArrowReaderBuilder::new(file_io).build(); - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/old_file.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/old_file.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: 
new_schema.clone(), - project_field_ids: vec![1, 2], // Request both columns 'a' and 'b' - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Verify we got the correct data - assert_eq!(result.len(), 1); - let batch = &result[0]; - - // Should have 2 columns now - assert_eq!(batch.num_columns(), 2); - assert_eq!(batch.num_rows(), 3); - - // Column 'a' should have the original data - let col_a = batch - .column(0) - .as_primitive::(); - assert_eq!(col_a.values(), &[1, 2, 3]); - - // Column 'b' should be all NULLs (it didn't exist in the old file) - let col_b = batch - .column(1) - .as_primitive::(); - assert_eq!(col_b.null_count(), 3); - assert!(col_b.is_null(0)); - assert!(col_b.is_null(1)); - assert!(col_b.is_null(2)); - } - - /// Test for bug where position deletes in later row groups are not applied correctly. - /// - /// When a file has multiple row groups and a position delete targets a row in a later - /// row group, the `build_deletes_row_selection` function had a bug where it would - /// fail to increment `current_row_group_base_idx` when skipping row groups. - /// - /// This test creates: - /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) - /// - A position delete file that deletes row 199 (last row in second row group) - /// - /// Expected behavior: Should return 199 rows (with id=200 deleted) - /// Bug behavior: Returns 200 rows (delete is not applied) - /// - /// This bug was discovered while running Apache Spark + Apache Iceberg integration tests - /// through DataFusion Comet. 
The following Iceberg Java tests failed due to this bug: - /// - `org.apache.iceberg.spark.extensions.TestMergeOnReadDelete::testDeleteWithMultipleRowGroupsParquet` - /// - `org.apache.iceberg.spark.extensions.TestMergeOnReadUpdate::testUpdateWithMultipleRowGroupsParquet` - #[tokio::test] - async fn test_position_delete_across_multiple_row_groups() { - use arrow_array::{Int32Array, Int64Array}; - use parquet::file::reader::{FileReader, SerializedFileReader}; - - // Field IDs for positional delete schema - const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; - const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - - // Create table schema with a single 'id' column - let table_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - // Step 1: Create data file with 200 rows in 2 row groups - // Row group 0: rows 0-99 (ids 1-100) - // Row group 1: rows 100-199 (ids 101-200) - let data_file_path = format!("{table_location}/data.parquet"); - - let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(1..=100), - )]) - .unwrap(); - - let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(101..=200), - )]) - .unwrap(); - - // Force each batch into its own row group - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(100)) - .build(); - - let file = File::create(&data_file_path).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), 
Some(props)).unwrap(); - writer.write(&batch1).expect("Writing batch 1"); - writer.write(&batch2).expect("Writing batch 2"); - writer.close().unwrap(); - - // Verify we created 2 row groups - let verify_file = File::open(&data_file_path).unwrap(); - let verify_reader = SerializedFileReader::new(verify_file).unwrap(); - assert_eq!( - verify_reader.metadata().num_row_groups(), - 2, - "Should have 2 row groups" - ); - - // Step 2: Create position delete file that deletes row 199 (id=200, last row in row group 1) - let delete_file_path = format!("{table_location}/deletes.parquet"); - - let delete_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), - )])), - Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_POS.to_string(), - )])), - ])); - - // Delete row at position 199 (0-indexed, so it's the last row: id=200) - let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![ - Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])), - Arc::new(Int64Array::from_iter_values(vec![199i64])), - ]) - .unwrap(); - - let delete_props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let delete_file = File::create(&delete_file_path).unwrap(); - let mut delete_writer = - ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap(); - delete_writer.write(&delete_batch).unwrap(); - delete_writer.close().unwrap(); - - // Step 3: Read the data file with the delete applied - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let task = FileScanTask { - file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(), - start: 0, - length: 0, - record_count: Some(200), - data_file_path: data_file_path.clone(), 
- data_file_format: DataFileFormat::Parquet, - schema: table_schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![FileScanTaskDeleteFile { - file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(), - file_path: delete_file_path, - file_type: DataContentType::PositionDeletes, - partition_spec_id: 0, - equality_ids: None, - }], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Step 4: Verify we got 199 rows (not 200) - let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); - - println!("Total rows read: {total_rows}"); - println!("Expected: 199 rows (deleted row 199 which had id=200)"); - - // This assertion will FAIL before the fix and PASS after the fix - assert_eq!( - total_rows, 199, - "Expected 199 rows after deleting row 199, but got {total_rows} rows. \ - The bug causes position deletes in later row groups to be ignored." - ); - - // Verify the deleted row (id=200) is not present - let all_ids: Vec = result - .iter() - .flat_map(|batch| { - batch - .column(0) - .as_primitive::() - .values() - .iter() - .copied() - }) - .collect(); - - assert!( - !all_ids.contains(&200), - "Row with id=200 should be deleted but was found in results" - ); - - // Verify we have all other ids (1-199) - let expected_ids: Vec = (1..=199).collect(); - assert_eq!( - all_ids, expected_ids, - "Should have ids 1-199 but got different values" - ); - } - - /// Test for bug where position deletes are lost when skipping unselected row groups. - /// - /// This is a variant of `test_position_delete_across_multiple_row_groups` that exercises - /// the row group selection code path (`selected_row_groups: Some([...])`). 
- /// - /// When a file has multiple row groups and only some are selected for reading, - /// the `build_deletes_row_selection` function must correctly skip over deletes in - /// unselected row groups WITHOUT consuming deletes that belong to selected row groups. - /// - /// This test creates: - /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) - /// - A position delete file that deletes row 199 (last row in second row group) - /// - Row group selection that reads ONLY row group 1 (rows 100-199) - /// - /// Expected behavior: Should return 99 rows (with row 199 deleted) - /// Bug behavior: Returns 100 rows (delete is lost when skipping row group 0) - /// - /// The bug occurs when processing row group 0 (unselected): - /// ```rust - /// delete_vector_iter.advance_to(next_row_group_base_idx); // Position at first delete >= 100 - /// next_deleted_row_idx_opt = delete_vector_iter.next(); // BUG: Consumes delete at 199! - /// ``` - /// - /// The fix is to NOT call `next()` after `advance_to()` when skipping unselected row groups, - /// because `advance_to()` already positions the iterator correctly without consuming elements. 
- #[tokio::test] - async fn test_position_delete_with_row_group_selection() { - use arrow_array::{Int32Array, Int64Array}; - use parquet::file::reader::{FileReader, SerializedFileReader}; - - // Field IDs for positional delete schema - const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; - const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - - // Create table schema with a single 'id' column - let table_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - // Step 1: Create data file with 200 rows in 2 row groups - // Row group 0: rows 0-99 (ids 1-100) - // Row group 1: rows 100-199 (ids 101-200) - let data_file_path = format!("{table_location}/data.parquet"); - - let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(1..=100), - )]) - .unwrap(); - - let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(101..=200), - )]) - .unwrap(); - - // Force each batch into its own row group - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(100)) - .build(); - - let file = File::create(&data_file_path).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); - writer.write(&batch1).expect("Writing batch 1"); - writer.write(&batch2).expect("Writing batch 2"); - writer.close().unwrap(); - - // Verify we created 2 row groups - let verify_file = File::open(&data_file_path).unwrap(); - let verify_reader = 
SerializedFileReader::new(verify_file).unwrap(); - assert_eq!( - verify_reader.metadata().num_row_groups(), - 2, - "Should have 2 row groups" - ); - - // Step 2: Create position delete file that deletes row 199 (id=200, last row in row group 1) - let delete_file_path = format!("{table_location}/deletes.parquet"); - - let delete_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), - )])), - Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_POS.to_string(), - )])), - ])); - - // Delete row at position 199 (0-indexed, so it's the last row: id=200) - let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![ - Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])), - Arc::new(Int64Array::from_iter_values(vec![199i64])), - ]) - .unwrap(); - - let delete_props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let delete_file = File::create(&delete_file_path).unwrap(); - let mut delete_writer = - ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap(); - delete_writer.write(&delete_batch).unwrap(); - delete_writer.close().unwrap(); - - // Step 3: Get byte ranges to read ONLY row group 1 (rows 100-199) - // This exercises the row group selection code path where row group 0 is skipped - let metadata_file = File::open(&data_file_path).unwrap(); - let metadata_reader = SerializedFileReader::new(metadata_file).unwrap(); - let metadata = metadata_reader.metadata(); - - let row_group_0 = metadata.row_group(0); - let row_group_1 = metadata.row_group(1); - - let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1" - let rg1_start = rg0_start + row_group_0.compressed_size() as u64; - let rg1_length = row_group_1.compressed_size() as u64; 
- - println!( - "Row group 0: starts at byte {}, {} bytes compressed", - rg0_start, - row_group_0.compressed_size() - ); - println!( - "Row group 1: starts at byte {}, {} bytes compressed", - rg1_start, - row_group_1.compressed_size() - ); - - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - // Create FileScanTask that reads ONLY row group 1 via byte range filtering - let task = FileScanTask { - file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(), - start: rg1_start, - length: rg1_length, - record_count: Some(100), // Row group 1 has 100 rows - data_file_path: data_file_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: table_schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![FileScanTaskDeleteFile { - file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(), - file_path: delete_file_path, - file_type: DataContentType::PositionDeletes, - partition_spec_id: 0, - equality_ids: None, - }], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Step 4: Verify we got 99 rows (not 100) - // Row group 1 has 100 rows (ids 101-200), minus 1 delete (id=200) = 99 rows - let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); - - println!("Total rows read from row group 1: {total_rows}"); - println!("Expected: 99 rows (row group 1 has 100 rows, 1 delete at position 199)"); - - // This assertion will FAIL before the fix and PASS after the fix - assert_eq!( - total_rows, 99, - "Expected 99 rows from row group 1 after deleting position 199, but got {total_rows} rows. \ - The bug causes position deletes to be lost when advance_to() is followed by next() \ - when skipping unselected row groups." 
- ); - - // Verify the deleted row (id=200) is not present - let all_ids: Vec = result - .iter() - .flat_map(|batch| { - batch - .column(0) - .as_primitive::() - .values() - .iter() - .copied() - }) - .collect(); - - assert!( - !all_ids.contains(&200), - "Row with id=200 should be deleted but was found in results" - ); - - // Verify we have ids 101-199 (not 101-200) - let expected_ids: Vec = (101..=199).collect(); - assert_eq!( - all_ids, expected_ids, - "Should have ids 101-199 but got different values" - ); - } - /// Test for bug where stale cached delete causes infinite loop when skipping row groups. - /// - /// This test exposes the inverse scenario of `test_position_delete_with_row_group_selection`: - /// - Position delete targets a row in the SKIPPED row group (not the selected one) - /// - After calling advance_to(), the cached delete index is stale - /// - Without updating the cache, the code enters an infinite loop - /// - /// This test creates: - /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) - /// - A position delete file that deletes row 0 (first row in SKIPPED row group 0) - /// - Row group selection that reads ONLY row group 1 (rows 100-199) - /// - /// The bug occurs when skipping row group 0: - /// ```rust - /// let mut next_deleted_row_idx_opt = delete_vector_iter.next(); // Some(0) - /// // ... skip to row group 1 ... - /// delete_vector_iter.advance_to(100); // Iterator advances past delete at 0 - /// // BUG: next_deleted_row_idx_opt is still Some(0) - STALE! - /// // When processing row group 1: - /// // current_idx = 100, next_deleted_row_idx = 0, next_row_group_base_idx = 200 - /// // Loop condition: 0 < 200 (true) - /// // But: current_idx (100) > next_deleted_row_idx (0) - /// // And: current_idx (100) != next_deleted_row_idx (0) - /// // Neither branch executes -> INFINITE LOOP! 
- /// ``` - /// - /// Expected behavior: Should return 100 rows (delete at 0 doesn't affect row group 1) - /// Bug behavior: Infinite loop in build_deletes_row_selection - #[tokio::test] - async fn test_position_delete_in_skipped_row_group() { - use arrow_array::{Int32Array, Int64Array}; - use parquet::file::reader::{FileReader, SerializedFileReader}; - - // Field IDs for positional delete schema - const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; - const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - - // Create table schema with a single 'id' column - let table_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - // Step 1: Create data file with 200 rows in 2 row groups - // Row group 0: rows 0-99 (ids 1-100) - // Row group 1: rows 100-199 (ids 101-200) - let data_file_path = format!("{table_location}/data.parquet"); - - let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(1..=100), - )]) - .unwrap(); - - let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(101..=200), - )]) - .unwrap(); - - // Force each batch into its own row group - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(100)) - .build(); - - let file = File::create(&data_file_path).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); - writer.write(&batch1).expect("Writing batch 1"); - writer.write(&batch2).expect("Writing 
batch 2"); - writer.close().unwrap(); - - // Verify we created 2 row groups - let verify_file = File::open(&data_file_path).unwrap(); - let verify_reader = SerializedFileReader::new(verify_file).unwrap(); - assert_eq!( - verify_reader.metadata().num_row_groups(), - 2, - "Should have 2 row groups" - ); - - // Step 2: Create position delete file that deletes row 0 (id=1, first row in row group 0) - let delete_file_path = format!("{table_location}/deletes.parquet"); - - let delete_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), - )])), - Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_POS.to_string(), - )])), - ])); - - // Delete row at position 0 (0-indexed, so it's the first row: id=1) - let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![ - Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])), - Arc::new(Int64Array::from_iter_values(vec![0i64])), - ]) - .unwrap(); - - let delete_props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let delete_file = File::create(&delete_file_path).unwrap(); - let mut delete_writer = - ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap(); - delete_writer.write(&delete_batch).unwrap(); - delete_writer.close().unwrap(); - - // Step 3: Get byte ranges to read ONLY row group 1 (rows 100-199) - // This exercises the row group selection code path where row group 0 is skipped - let metadata_file = File::open(&data_file_path).unwrap(); - let metadata_reader = SerializedFileReader::new(metadata_file).unwrap(); - let metadata = metadata_reader.metadata(); - - let row_group_0 = metadata.row_group(0); - let row_group_1 = metadata.row_group(1); - - let rg0_start = 4u64; // Parquet files start 
with 4-byte magic "PAR1" - let rg1_start = rg0_start + row_group_0.compressed_size() as u64; - let rg1_length = row_group_1.compressed_size() as u64; - - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - // Create FileScanTask that reads ONLY row group 1 via byte range filtering - let task = FileScanTask { - file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(), - start: rg1_start, - length: rg1_length, - record_count: Some(100), // Row group 1 has 100 rows - data_file_path: data_file_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: table_schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![FileScanTaskDeleteFile { - file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(), - file_path: delete_file_path, - file_type: DataContentType::PositionDeletes, - partition_spec_id: 0, - equality_ids: None, - }], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Step 4: Verify we got 100 rows (all of row group 1) - // The delete at position 0 is in row group 0, which is skipped, so it doesn't affect us - let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); - - assert_eq!( - total_rows, 100, - "Expected 100 rows from row group 1 (delete at position 0 is in skipped row group 0). \ - If this hangs or fails, it indicates the cached delete index was not updated after advance_to()." 
- ); - - // Verify we have all ids from row group 1 (101-200) - let all_ids: Vec = result - .iter() - .flat_map(|batch| { - batch - .column(0) - .as_primitive::() - .values() - .iter() - .copied() - }) - .collect(); - - let expected_ids: Vec = (101..=200).collect(); - assert_eq!( - all_ids, expected_ids, - "Should have ids 101-200 (all of row group 1)" - ); - } - - /// Test reading Parquet files without field ID metadata (e.g., migrated tables). - /// This exercises the position-based fallback path. - /// - /// Corresponds to Java's ParquetSchemaUtil.addFallbackIds() + pruneColumnsFallback() - /// in /parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java - #[tokio::test] - async fn test_read_parquet_file_without_field_ids() { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "age", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - // Parquet file from a migrated table - no field ID metadata - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("age", DataType::Int32, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let name_data = vec!["Alice", "Bob", "Charlie"]; - let age_data = vec![30, 25, 35]; - - use arrow_array::Int32Array; - let name_col = Arc::new(StringArray::from(name_data.clone())) as ArrayRef; - let age_col = Arc::new(Int32Array::from(age_data.clone())) as ArrayRef; - - let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![name_col, age_col]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, 
to_write.schema(), Some(props)).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 3); - assert_eq!(batch.num_columns(), 2); - - // Verify position-based mapping: field_id 1 → position 0, field_id 2 → position 1 - let name_array = batch.column(0).as_string::(); - assert_eq!(name_array.value(0), "Alice"); - assert_eq!(name_array.value(1), "Bob"); - assert_eq!(name_array.value(2), "Charlie"); - - let age_array = batch - .column(1) - .as_primitive::(); - assert_eq!(age_array.value(0), 30); - assert_eq!(age_array.value(1), 25); - assert_eq!(age_array.value(2), 35); - } - - /// Test reading Parquet files without field IDs with partial projection. - /// Only a subset of columns are requested, verifying position-based fallback - /// handles column selection correctly. 
- #[tokio::test] - async fn test_read_parquet_without_field_ids_partial_projection() { - use arrow_array::Int32Array; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "col1", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "col2", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required(3, "col3", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(4, "col4", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("col1", DataType::Utf8, false), - Field::new("col2", DataType::Int32, false), - Field::new("col3", DataType::Utf8, false), - Field::new("col4", DataType::Int32, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let col1_data = Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef; - let col2_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; - let col3_data = Arc::new(StringArray::from(vec!["c", "d"])) as ArrayRef; - let col4_data = Arc::new(Int32Array::from(vec![30, 40])) as ArrayRef; - - let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![ - col1_data, col2_data, col3_data, col4_data, - ]) - .unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - 
record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 3], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 2); - assert_eq!(batch.num_columns(), 2); - - let col1_array = batch.column(0).as_string::(); - assert_eq!(col1_array.value(0), "a"); - assert_eq!(col1_array.value(1), "b"); - - let col3_array = batch.column(1).as_string::(); - assert_eq!(col3_array.value(0), "c"); - assert_eq!(col3_array.value(1), "d"); - } - - /// Test reading Parquet files without field IDs with schema evolution. - /// The Iceberg schema has more fields than the Parquet file, testing that - /// missing columns are filled with NULLs. 
- #[tokio::test] - async fn test_read_parquet_without_field_ids_schema_evolution() { - use arrow_array::{Array, Int32Array}; - - // Schema with field 3 added after the file was written - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "age", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(3, "city", Type::Primitive(PrimitiveType::String)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("age", DataType::Int32, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; - let age_data = Arc::new(Int32Array::from(vec![30, 25])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![name_data, age_data]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2, 3], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - 
case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 2); - assert_eq!(batch.num_columns(), 3); - - let name_array = batch.column(0).as_string::(); - assert_eq!(name_array.value(0), "Alice"); - assert_eq!(name_array.value(1), "Bob"); - - let age_array = batch - .column(1) - .as_primitive::(); - assert_eq!(age_array.value(0), 30); - assert_eq!(age_array.value(1), 25); - - // Verify missing column filled with NULLs - let city_array = batch.column(2).as_string::(); - assert_eq!(city_array.null_count(), 2); - assert!(city_array.is_null(0)); - assert!(city_array.is_null(1)); - } - - /// Test reading Parquet files without field IDs that have multiple row groups. - /// This ensures the position-based fallback works correctly across row group boundaries. - #[tokio::test] - async fn test_read_parquet_without_field_ids_multiple_row_groups() { - use arrow_array::Int32Array; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "value", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("value", DataType::Int32, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - // Small row group size to create multiple row groups - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_write_batch_size(2) - .set_max_row_group_row_count(Some(2)) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = 
ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); - - // Write 6 rows in 3 batches (will create 3 row groups) - for batch_num in 0..3 { - let name_data = Arc::new(StringArray::from(vec![ - format!("name_{}", batch_num * 2), - format!("name_{}", batch_num * 2 + 1), - ])) as ArrayRef; - let value_data = - Arc::new(Int32Array::from(vec![batch_num * 2, batch_num * 2 + 1])) as ArrayRef; - - let batch = - RecordBatch::try_new(arrow_schema.clone(), vec![name_data, value_data]).unwrap(); - writer.write(&batch).expect("Writing batch"); - } - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - assert!(!result.is_empty()); - - let mut all_names = Vec::new(); - let mut all_values = Vec::new(); - - for batch in &result { - let name_array = batch.column(0).as_string::(); - let value_array = batch - .column(1) - .as_primitive::(); - - for i in 0..batch.num_rows() { - all_names.push(name_array.value(i).to_string()); - all_values.push(value_array.value(i)); - } - } - - assert_eq!(all_names.len(), 6); - assert_eq!(all_values.len(), 6); - - for i in 0..6 { - assert_eq!(all_names[i], format!("name_{i}")); - assert_eq!(all_values[i], i as i32); - } - } - - /// Test reading Parquet files without field IDs with nested types (struct). 
- /// Java's pruneColumnsFallback() projects entire top-level columns including nested content. - /// This test verifies that a top-level struct field is projected correctly with all its nested fields. - #[tokio::test] - async fn test_read_parquet_without_field_ids_with_struct() { - use arrow_array::{Int32Array, StructArray}; - use arrow_schema::Fields; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required( - 2, - "person", - Type::Struct(crate::spec::StructType::new(vec![ - NestedField::required( - 3, - "name", - Type::Primitive(PrimitiveType::String), - ) - .into(), - NestedField::required(4, "age", Type::Primitive(PrimitiveType::Int)) - .into(), - ])), - ) - .into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new( - "person", - DataType::Struct(Fields::from(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("age", DataType::Int32, false), - ])), - false, - ), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let id_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; - let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; - let age_data = Arc::new(Int32Array::from(vec![30, 25])) as ArrayRef; - let person_data = Arc::new(StructArray::from(vec![ - ( - Arc::new(Field::new("name", DataType::Utf8, false)), - name_data, - ), - ( - Arc::new(Field::new("age", DataType::Int32, false)), - age_data, - ), - ])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![id_data, person_data]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = 
File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 2); - assert_eq!(batch.num_columns(), 2); - - let id_array = batch - .column(0) - .as_primitive::(); - assert_eq!(id_array.value(0), 1); - assert_eq!(id_array.value(1), 2); - - let person_array = batch.column(1).as_struct(); - assert_eq!(person_array.num_columns(), 2); - - let name_array = person_array.column(0).as_string::(); - assert_eq!(name_array.value(0), "Alice"); - assert_eq!(name_array.value(1), "Bob"); - - let age_array = person_array - .column(1) - .as_primitive::(); - assert_eq!(age_array.value(0), 30); - assert_eq!(age_array.value(1), 25); - } - - /// Test reading Parquet files without field IDs with schema evolution - column added in the middle. - /// When a new column is inserted between existing columns in the schema order, - /// the fallback projection must correctly map field IDs to output positions. 
- #[tokio::test] - async fn test_read_parquet_without_field_ids_schema_evolution_add_column_in_middle() { - use arrow_array::{Array, Int32Array}; - - let arrow_schema_old = Arc::new(ArrowSchema::new(vec![ - Field::new("col0", DataType::Int32, true), - Field::new("col1", DataType::Int32, true), - ])); - - // New column added between existing columns: col0 (id=1), newCol (id=5), col1 (id=2) - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional(1, "col0", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(5, "newCol", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(2, "col1", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let col0_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; - let col1_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema_old.clone(), vec![col0_data, col1_data]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 5, 2], - predicate: None, - deletes: vec![], - partition: None, - 
partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 2); - assert_eq!(batch.num_columns(), 3); - - let result_col0 = batch - .column(0) - .as_primitive::(); - assert_eq!(result_col0.value(0), 1); - assert_eq!(result_col0.value(1), 2); - - // New column should be NULL (doesn't exist in old file) - let result_newcol = batch - .column(1) - .as_primitive::(); - assert_eq!(result_newcol.null_count(), 2); - assert!(result_newcol.is_null(0)); - assert!(result_newcol.is_null(1)); - - let result_col1 = batch - .column(2) - .as_primitive::(); - assert_eq!(result_col1.value(0), 10); - assert_eq!(result_col1.value(1), 20); - } - - /// Test reading Parquet files without field IDs with a filter that eliminates all row groups. - /// During development of field ID mapping, we saw a panic when row_selection_enabled=true and - /// all row groups are filtered out. 
- #[tokio::test] - async fn test_read_parquet_without_field_ids_filter_eliminates_all_rows() { - use arrow_array::{Float64Array, Int32Array}; - - // Schema with fields that will use fallback IDs 1, 2, 3 - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(3, "value", Type::Primitive(PrimitiveType::Double)) - .into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, false), - Field::new("value", DataType::Float64, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - // Write data where all ids are >= 10 - let id_data = Arc::new(Int32Array::from(vec![10, 11, 12])) as ArrayRef; - let name_data = Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef; - let value_data = Arc::new(Float64Array::from(vec![100.0, 200.0, 300.0])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![id_data, name_data, value_data]) - .unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - // Filter that eliminates all row groups: id < 5 - let predicate = Reference::new("id").less_than(Datum::int(5)); - - // Enable both row_group_filtering and row_selection - triggered the panic - let reader = ArrowReaderBuilder::new(file_io) - .with_row_group_filtering_enabled(true) - .with_row_selection_enabled(true) - .build(); - - 
let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2, 3], - predicate: Some(predicate.bind(schema, true).unwrap()), - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - // Should no longer panic - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Should return empty results - assert!(result.is_empty() || result.iter().all(|batch| batch.num_rows() == 0)); - } - - /// Test that concurrency=1 reads all files correctly and in deterministic order. - /// This verifies the fast-path optimization for single concurrency. - #[tokio::test] - async fn test_read_with_concurrency_one() { - use arrow_array::Int32Array; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required(2, "file_num", Type::Primitive(PrimitiveType::Int)) - .into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - Field::new("file_num", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "2".to_string(), - )])), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - // Create 3 parquet files with different data - let props = WriterProperties::builder() - 
.set_compression(Compression::SNAPPY) - .build(); - - for file_num in 0..3 { - let id_data = Arc::new(Int32Array::from_iter_values( - file_num * 10..(file_num + 1) * 10, - )) as ArrayRef; - let file_num_data = Arc::new(Int32Array::from(vec![file_num; 10])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![id_data, file_num_data]).unwrap(); - - let file = File::create(format!("{table_location}/file_{file_num}.parquet")).unwrap(); - let mut writer = - ArrowWriter::try_new(file, to_write.schema(), Some(props.clone())).unwrap(); - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - } - - // Read with concurrency=1 (fast-path) - let reader = ArrowReaderBuilder::new(file_io) - .with_data_file_concurrency_limit(1) - .build(); - - // Create tasks in a specific order: file_0, file_1, file_2 - let tasks = vec![ - Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_0.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/file_0.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }), - Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/file_1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }), - Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_2.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - 
data_file_path: format!("{table_location}/file_2.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }), - ]; - - let tasks_stream = Box::pin(futures::stream::iter(tasks)) as FileScanTaskStream; - - let result = reader - .read(tasks_stream) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Verify we got all 30 rows (10 from each file) - let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); - assert_eq!(total_rows, 30, "Should have 30 total rows"); - - // Collect all ids and file_nums to verify data - let mut all_ids = Vec::new(); - let mut all_file_nums = Vec::new(); - - for batch in &result { - let id_col = batch - .column(0) - .as_primitive::(); - let file_num_col = batch - .column(1) - .as_primitive::(); - - for i in 0..batch.num_rows() { - all_ids.push(id_col.value(i)); - all_file_nums.push(file_num_col.value(i)); - } - } - - assert_eq!(all_ids.len(), 30); - assert_eq!(all_file_nums.len(), 30); - - // With concurrency=1 and sequential processing, files should be processed in order - // file_0: ids 0-9, file_num=0 - // file_1: ids 10-19, file_num=1 - // file_2: ids 20-29, file_num=2 - for i in 0..10 { - assert_eq!(all_file_nums[i], 0, "First 10 rows should be from file_0"); - assert_eq!(all_ids[i], i as i32, "IDs should be 0-9"); - } - for i in 10..20 { - assert_eq!(all_file_nums[i], 1, "Next 10 rows should be from file_1"); - assert_eq!(all_ids[i], i as i32, "IDs should be 10-19"); - } - for i in 20..30 { - assert_eq!(all_file_nums[i], 2, "Last 10 rows should be from file_2"); - assert_eq!(all_ids[i], i as i32, "IDs should be 20-29"); - } - } - - /// Test bucket partitioning reads source column from data file (not partition metadata). - /// - /// This is an integration test verifying the complete ArrowReader pipeline with bucket partitioning. 
- /// It corresponds to TestRuntimeFiltering tests in Iceberg Java (e.g., testRenamedSourceColumnTable). - /// - /// # Iceberg Spec Requirements - /// - /// Per the Iceberg spec "Column Projection" section: - /// > "Return the value from partition metadata if an **Identity Transform** exists for the field" - /// - /// This means: - /// - Identity transforms (e.g., `identity(dept)`) use constants from partition metadata - /// - Non-identity transforms (e.g., `bucket(4, id)`) must read source columns from data files - /// - Partition metadata for bucket transforms stores bucket numbers (0-3), NOT source values - /// - /// Java's PartitionUtil.constantsMap() implements this via: - /// ```java - /// if (field.transform().isIdentity()) { - /// idToConstant.put(field.sourceId(), converted); - /// } - /// ``` - /// - /// # What This Test Verifies - /// - /// This test ensures the full ArrowReader → RecordBatchTransformer pipeline correctly handles - /// bucket partitioning when FileScanTask provides partition_spec and partition_data: - /// - /// - Parquet file has field_id=1 named "id" with actual data [1, 5, 9, 13] - /// - FileScanTask specifies partition_spec with bucket(4, id) and partition_data with bucket=1 - /// - RecordBatchTransformer.constants_map() excludes bucket-partitioned field from constants - /// - ArrowReader correctly reads [1, 5, 9, 13] from the data file - /// - Values are NOT replaced with constant 1 from partition metadata - /// - /// # Why This Matters - /// - /// Without correct handling: - /// - Runtime filtering would break (e.g., `WHERE id = 5` would fail) - /// - Query results would be incorrect (all rows would have id=1) - /// - Bucket partitioning would be unusable for query optimization - /// - /// # References - /// - Iceberg spec: format/spec.md "Column Projection" + "Partition Transforms" - /// - Java test: spark/src/test/java/.../TestRuntimeFiltering.java - /// - Java impl: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java - 
#[tokio::test] - async fn test_bucket_partitioning_reads_source_column_from_file() { - use arrow_array::Int32Array; - - use crate::spec::{Literal, PartitionSpec, Struct, Transform}; - - // Iceberg schema with id and name columns - let schema = Arc::new( - Schema::builder() - .with_schema_id(0) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(2, "name", Type::Primitive(PrimitiveType::String)).into(), - ]) - .build() - .unwrap(), - ); - - // Partition spec: bucket(4, id) - let partition_spec = Arc::new( - PartitionSpec::builder(schema.clone()) - .with_spec_id(0) - .add_partition_field("id", "id_bucket", Transform::Bucket(4)) - .unwrap() - .build() - .unwrap(), - ); - - // Partition data: bucket value is 1 - let partition_data = Struct::from_iter(vec![Some(Literal::int(1))]); - - // Create Arrow schema with field IDs for Parquet file - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - Field::new("name", DataType::Utf8, true).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "2".to_string(), - )])), - ])); - - // Write Parquet file with data - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let id_data = Arc::new(Int32Array::from(vec![1, 5, 9, 13])) as ArrayRef; - let name_data = - Arc::new(StringArray::from(vec!["Alice", "Bob", "Charlie", "Dave"])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![id_data, name_data]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - let file = File::create(format!("{}/data.parquet", &table_location)).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - 
writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - // Read the Parquet file with partition spec and data - let reader = ArrowReaderBuilder::new(file_io).build(); - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/data.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/data.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: Some(partition_data), - partition_spec: Some(partition_spec), - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Verify we got the correct data - assert_eq!(result.len(), 1); - let batch = &result[0]; - - assert_eq!(batch.num_columns(), 2); - assert_eq!(batch.num_rows(), 4); - - // The id column MUST contain actual values from the Parquet file [1, 5, 9, 13], - // NOT the constant partition value 1 - let id_col = batch - .column(0) - .as_primitive::(); - assert_eq!(id_col.value(0), 1); - assert_eq!(id_col.value(1), 5); - assert_eq!(id_col.value(2), 9); - assert_eq!(id_col.value(3), 13); - - let name_col = batch.column(1).as_string::(); - assert_eq!(name_col.value(0), "Alice"); - assert_eq!(name_col.value(1), "Bob"); - assert_eq!(name_col.value(2), "Charlie"); - assert_eq!(name_col.value(3), "Dave"); - } - - #[test] - fn test_merge_ranges_empty() { - assert_eq!(super::merge_ranges(&[], 1024), Vec::>::new()); - } - - #[test] - fn test_merge_ranges_no_coalesce() { - // Ranges far apart should not be merged - let ranges = vec![0..100, 1_000_000..1_000_100]; - let merged = super::merge_ranges(&ranges, 1024); - assert_eq!(merged, vec![0..100, 1_000_000..1_000_100]); - } - - #[test] - fn 
test_merge_ranges_coalesce() { - // Ranges within the gap threshold should be merged - let ranges = vec![0..100, 200..300, 500..600]; - let merged = super::merge_ranges(&ranges, 1024); - assert_eq!(merged, vec![0..600]); - } - - #[test] - fn test_merge_ranges_overlapping() { - let ranges = vec![0..200, 100..300]; - let merged = super::merge_ranges(&ranges, 0); - assert_eq!(merged, vec![0..300]); - } - - #[test] - fn test_merge_ranges_unsorted() { - let ranges = vec![500..600, 0..100, 200..300]; - let merged = super::merge_ranges(&ranges, 1024); - assert_eq!(merged, vec![0..600]); - } - - /// Mock FileRead backed by a flat byte buffer. - struct MockFileRead { - data: bytes::Bytes, - } - - impl MockFileRead { - fn new(size: usize) -> Self { - // Fill with sequential byte values so slices are verifiable. - let data: Vec = (0..size).map(|i| (i % 256) as u8).collect(); - Self { - data: bytes::Bytes::from(data), - } - } - } - - #[async_trait::async_trait] - impl crate::io::FileRead for MockFileRead { - async fn read(&self, range: Range) -> crate::Result { - Ok(self.data.slice(range.start as usize..range.end as usize)) - } - } - - #[tokio::test] - async fn test_get_byte_ranges_no_coalesce() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(2048); - let expected_0 = mock.data.slice(0..100); - let expected_1 = mock.data.slice(1500..1600); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 2048 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_coalesce_bytes(0) - .build(), - ); - - let result = reader - .get_byte_ranges(vec![0..100, 1500..1600]) - .await - .unwrap(); - - assert_eq!(result.len(), 2); - assert_eq!(result[0], expected_0); - assert_eq!(result[1], expected_1); - } - - #[tokio::test] - async fn test_get_byte_ranges_with_coalesce() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(1024); - let expected_0 = 
mock.data.slice(0..100); - let expected_1 = mock.data.slice(200..300); - let expected_2 = mock.data.slice(500..600); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 1024 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_coalesce_bytes(1024) - .build(), - ); - - // All ranges within coalesce threshold — should merge into one fetch. - let result = reader - .get_byte_ranges(vec![0..100, 200..300, 500..600]) - .await - .unwrap(); - - assert_eq!(result.len(), 3); - assert_eq!(result[0], expected_0); - assert_eq!(result[1], expected_1); - assert_eq!(result[2], expected_2); - } - - #[tokio::test] - async fn test_get_byte_ranges_empty() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(1024); - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 1024 }, Box::new(mock)); - - let result = reader.get_byte_ranges(vec![]).await.unwrap(); - assert!(result.is_empty()); - } - - #[tokio::test] - async fn test_get_byte_ranges_coalesce_max() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(2048); - let expected_0 = mock.data.slice(0..100); - let expected_1 = mock.data.slice(1500..1600); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 2048 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_coalesce_bytes(u64::MAX) - .build(), - ); - - // u64::MAX coalesce — all ranges merge into a single fetch. - let result = reader - .get_byte_ranges(vec![0..100, 1500..1600]) - .await - .unwrap(); - - assert_eq!(result.len(), 2); - assert_eq!(result[0], expected_0); - assert_eq!(result[1], expected_1); - } - - #[tokio::test] - async fn test_get_byte_ranges_concurrency_zero() { - use parquet::arrow::async_reader::AsyncFileReader; - - // concurrency=0 is clamped to 1, so this should not hang. 
- let mock = MockFileRead::new(1024); - let expected = mock.data.slice(0..100); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 1024 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_fetch_concurrency(0) - .build(), - ); - - let result = reader - .get_byte_ranges(vec![0..100, 200..300]) - .await - .unwrap(); - assert_eq!(result.len(), 2); - assert_eq!(result[0], expected); - } - - #[tokio::test] - async fn test_get_byte_ranges_concurrency_one() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(2048); - let expected_0 = mock.data.slice(0..100); - let expected_1 = mock.data.slice(500..600); - let expected_2 = mock.data.slice(1500..1600); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 2048 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_coalesce_bytes(0) - .with_range_fetch_concurrency(1) - .build(), - ); - - // concurrency=1 with no coalescing — sequential fetches. - let result = reader - .get_byte_ranges(vec![0..100, 500..600, 1500..1600]) - .await - .unwrap(); - - assert_eq!(result.len(), 3); - assert_eq!(result[0], expected_0); - assert_eq!(result[1], expected_1); - assert_eq!(result[2], expected_2); - } - - /// Regression for : - /// predicate on a column after nested types in a migrated file (no field IDs). - /// Schema has struct, list, and map columns before the predicate target (`id`), - /// exercising the fallback field ID mapping across all nested type variants. 
- #[tokio::test] - async fn test_predicate_on_migrated_file_with_nested_types() { - use serde::{Deserialize, Serialize}; - use serde_arrow::schema::{SchemaLike, TracingOptions}; - - #[derive(Serialize, Deserialize)] - struct Person { - name: String, - age: i32, - } - - #[derive(Serialize, Deserialize)] - struct Row { - person: Person, - people: Vec, - props: std::collections::BTreeMap, - id: i32, - } - - let rows = vec![ - Row { - person: Person { - name: "Alice".into(), - age: 30, - }, - people: vec![Person { - name: "Alice".into(), - age: 30, - }], - props: [("k1".into(), "v1".into())].into(), - id: 1, - }, - Row { - person: Person { - name: "Bob".into(), - age: 25, - }, - people: vec![Person { - name: "Bob".into(), - age: 25, - }], - props: [("k2".into(), "v2".into())].into(), - id: 2, - }, - Row { - person: Person { - name: "Carol".into(), - age: 40, - }, - people: vec![Person { - name: "Carol".into(), - age: 40, - }], - props: [("k3".into(), "v3".into())].into(), - id: 3, - }, - ]; - - let tracing_options = TracingOptions::default() - .map_as_struct(false) - .strings_as_large_utf8(false) - .sequence_as_large_list(false); - let fields = Vec::::from_type::(tracing_options).unwrap(); - let arrow_schema = Arc::new(ArrowSchema::new(fields.clone())); - let batch = serde_arrow::to_record_batch(&fields, &rows).unwrap(); - - // Fallback field IDs: person=1, people=2, props=3, id=4 - let iceberg_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required( - 1, - "person", - Type::Struct(crate::spec::StructType::new(vec![ - NestedField::required( - 5, - "name", - Type::Primitive(PrimitiveType::String), - ) - .into(), - NestedField::required(6, "age", Type::Primitive(PrimitiveType::Int)) - .into(), - ])), - ) - .into(), - NestedField::required( - 2, - "people", - Type::List(crate::spec::ListType { - element_field: NestedField::required( - 7, - "element", - Type::Struct(crate::spec::StructType::new(vec![ - NestedField::required( 
- 8, - "name", - Type::Primitive(PrimitiveType::String), - ) - .into(), - NestedField::required( - 9, - "age", - Type::Primitive(PrimitiveType::Int), - ) - .into(), - ])), - ) - .into(), - }), - ) - .into(), - NestedField::required( - 3, - "props", - Type::Map(crate::spec::MapType { - key_field: NestedField::required( - 10, - "key", - Type::Primitive(PrimitiveType::String), - ) - .into(), - value_field: NestedField::required( - 11, - "value", - Type::Primitive(PrimitiveType::String), - ) - .into(), - }), - ) - .into(), - NestedField::required(4, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_path = format!("{table_location}/1.parquet"); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - let file = File::create(&file_path).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema, Some(props)).unwrap(); - writer.write(&batch).expect("Writing batch"); - writer.close().unwrap(); - - let predicate = Reference::new("id").greater_than(Datum::int(1)); - - let reader = ArrowReaderBuilder::new(FileIO::new_with_fs()) - .with_row_group_filtering_enabled(true) - .with_row_selection_enabled(true) - .build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), - start: 0, - length: 0, - record_count: None, - data_file_path: file_path, - data_file_format: DataFileFormat::Parquet, - schema: iceberg_schema.clone(), - project_field_ids: vec![4], - predicate: Some(predicate.bind(iceberg_schema, true).unwrap()), - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - let ids: Vec = 
result - .iter() - .flat_map(|b| { - b.column(0) - .as_primitive::() - .values() - .iter() - .copied() - }) - .collect(); - assert_eq!(ids, vec![2, 3]); - } - - // INT96 encoding: [nanos_low_u32, nanos_high_u32, julian_day_u32] - // Julian day 2_440_588 = Unix epoch (1970-01-01) - const UNIX_EPOCH_JULIAN: i64 = 2_440_588; - const MICROS_PER_DAY: i64 = 86_400_000_000; - // Noon on 3333-01-01 (Julian day 2_953_529) — outside the i64 nanosecond range (~1677-2262). - const INT96_TEST_NANOS_WITHIN_DAY: u64 = 43_200_000_000_000; - const INT96_TEST_JULIAN_DAY: u32 = 2_953_529; - - fn make_int96_test_value() -> (parquet::data_type::Int96, i64) { - let mut val = parquet::data_type::Int96::new(); - val.set_data( - (INT96_TEST_NANOS_WITHIN_DAY & 0xFFFFFFFF) as u32, - (INT96_TEST_NANOS_WITHIN_DAY >> 32) as u32, - INT96_TEST_JULIAN_DAY, - ); - let expected_micros = (INT96_TEST_JULIAN_DAY as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY - + (INT96_TEST_NANOS_WITHIN_DAY / 1_000) as i64; - (val, expected_micros) - } - - async fn read_int96_batches( - file_path: &str, - schema: SchemaRef, - project_field_ids: Vec, - ) -> Vec { - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let file_size = std::fs::metadata(file_path).unwrap().len(); - let task = FileScanTask { - file_size_in_bytes: file_size, - start: 0, - length: file_size, - record_count: None, - data_file_path: file_path.to_string(), - data_file_format: DataFileFormat::Parquet, - schema, - project_field_ids, - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - reader.read(tasks).unwrap().try_collect().await.unwrap() - } - - // ArrowWriter cannot write INT96, so we use SerializedFileWriter directly. 
- fn write_int96_parquet_file( - table_location: &str, - filename: &str, - with_field_ids: bool, - ) -> (String, Vec) { - use parquet::basic::{Repetition, Type as PhysicalType}; - use parquet::data_type::{Int32Type, Int96, Int96Type}; - use parquet::file::writer::SerializedFileWriter; - use parquet::schema::types::Type as SchemaType; - - let file_path = format!("{table_location}/{filename}"); - - let mut ts_builder = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) - .with_repetition(Repetition::OPTIONAL); - let mut id_builder = SchemaType::primitive_type_builder("id", PhysicalType::INT32) - .with_repetition(Repetition::REQUIRED); - - if with_field_ids { - ts_builder = ts_builder.with_id(Some(1)); - id_builder = id_builder.with_id(Some(2)); - } - - let schema = SchemaType::group_type_builder("schema") - .with_fields(vec![ - Arc::new(ts_builder.build().unwrap()), - Arc::new(id_builder.build().unwrap()), - ]) - .build() - .unwrap(); - - // Dates outside the i64 nanosecond range (~1677-2262) overflow without coercion. 
- const NOON_NANOS: u64 = INT96_TEST_NANOS_WITHIN_DAY; - const JULIAN_3333: u32 = INT96_TEST_JULIAN_DAY; - const JULIAN_2100: u32 = 2_488_070; - - let test_data: Vec<(u32, u32, u32, i64)> = vec![ - // 3333-01-01 00:00:00 - ( - 0, - 0, - JULIAN_3333, - (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, - ), - // 3333-01-01 12:00:00 - ( - (NOON_NANOS & 0xFFFFFFFF) as u32, - (NOON_NANOS >> 32) as u32, - JULIAN_3333, - (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY - + (NOON_NANOS / 1_000) as i64, - ), - // 2100-01-01 00:00:00 - ( - 0, - 0, - JULIAN_2100, - (JULIAN_2100 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, - ), - ]; - - let int96_values: Vec = test_data - .iter() - .map(|(lo, hi, day, _)| { - let mut v = Int96::new(); - v.set_data(*lo, *hi, *day); - v - }) - .collect(); - - let id_values: Vec = (0..test_data.len() as i32).collect(); - let expected_micros: Vec = test_data.iter().map(|(_, _, _, m)| *m).collect(); - - let file = File::create(&file_path).unwrap(); - let mut writer = - SerializedFileWriter::new(file, Arc::new(schema), Default::default()).unwrap(); - - let mut row_group = writer.next_row_group().unwrap(); - { - // def=1: ts is OPTIONAL and present. No repetition levels (top-level columns). 
- let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&int96_values, Some(&vec![1; test_data.len()]), None) - .unwrap(); - col.close().unwrap(); - } - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&id_values, None, None) - .unwrap(); - col.close().unwrap(); - } - row_group.close().unwrap(); - writer.close().unwrap(); - - (file_path, expected_micros) - } - - async fn assert_int96_read_matches( - file_path: &str, - schema: SchemaRef, - project_field_ids: Vec, - expected_micros: &[i64], - ) { - use arrow_array::TimestampMicrosecondArray; - - let batches = read_int96_batches(file_path, schema, project_field_ids).await; - - assert_eq!(batches.len(), 1); - let ts_array = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected TimestampMicrosecondArray"); - - for (i, expected) in expected_micros.iter().enumerate() { - assert_eq!( - ts_array.value(i), - *expected, - "Row {i}: got {}, expected {expected}", - ts_array.value(i) - ); - } - } - - #[tokio::test] - async fn test_read_int96_timestamps_with_field_ids() { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) - .into(), - NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let (file_path, expected_micros) = - write_int96_parquet_file(&table_location, "with_ids.parquet", true); - - assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; - } - - #[tokio::test] - async fn test_read_int96_timestamps_without_field_ids() { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) - .into(), - NestedField::required(2, 
"id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let (file_path, expected_micros) = - write_int96_parquet_file(&table_location, "no_ids.parquet", false); - - assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; - } - - #[tokio::test] - async fn test_read_int96_timestamps_in_struct() { - use arrow_array::{StructArray, TimestampMicrosecondArray}; - use parquet::basic::{Repetition, Type as PhysicalType}; - use parquet::data_type::Int96Type; - use parquet::file::writer::SerializedFileWriter; - use parquet::schema::types::Type as SchemaType; - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_path = format!("{table_location}/struct_int96.parquet"); - - let ts_type = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(2)) - .build() - .unwrap(); - - let struct_type = SchemaType::group_type_builder("data") - .with_repetition(Repetition::REQUIRED) - .with_id(Some(1)) - .with_fields(vec![Arc::new(ts_type)]) - .build() - .unwrap(); - - let parquet_schema = SchemaType::group_type_builder("schema") - .with_fields(vec![Arc::new(struct_type)]) - .build() - .unwrap(); - - let (int96_val, expected_micros) = make_int96_test_value(); - - let file = File::create(&file_path).unwrap(); - let mut writer = - SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); - - // def=1: struct is REQUIRED so no level, ts is OPTIONAL and present (1). - // No repetition levels needed (no repeated groups). 
- let mut row_group = writer.next_row_group().unwrap(); - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&[int96_val], Some(&[1]), None) - .unwrap(); - col.close().unwrap(); - } - row_group.close().unwrap(); - writer.close().unwrap(); - - let iceberg_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required( - 1, - "data", - Type::Struct(crate::spec::StructType::new(vec![ - NestedField::optional( - 2, - "ts", - Type::Primitive(PrimitiveType::Timestamp), - ) - .into(), - ])), - ) - .into(), - ]) - .build() - .unwrap(), - ); - - let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; - - assert_eq!(batches.len(), 1); - let struct_array = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected StructArray"); - let ts_array = struct_array - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected TimestampMicrosecondArray inside struct"); - - assert_eq!( - ts_array.value(0), - expected_micros, - "INT96 in struct: got {}, expected {expected_micros}", - ts_array.value(0) - ); - } - - #[tokio::test] - async fn test_read_int96_timestamps_in_list() { - use arrow_array::{ListArray, TimestampMicrosecondArray}; - use parquet::basic::{Repetition, Type as PhysicalType}; - use parquet::data_type::Int96Type; - use parquet::file::writer::SerializedFileWriter; - use parquet::schema::types::Type as SchemaType; - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_path = format!("{table_location}/list_int96.parquet"); - - // 3-level LIST encoding: - // optional group timestamps (LIST) { - // repeated group list { - // optional int96 element; - // } - // } - let element_type = SchemaType::primitive_type_builder("element", PhysicalType::INT96) - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(2)) - .build() - .unwrap(); - - let list_group = 
SchemaType::group_type_builder("list") - .with_repetition(Repetition::REPEATED) - .with_fields(vec![Arc::new(element_type)]) - .build() - .unwrap(); - - let list_type = SchemaType::group_type_builder("timestamps") - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(1)) - .with_logical_type(Some(parquet::basic::LogicalType::List)) - .with_fields(vec![Arc::new(list_group)]) - .build() - .unwrap(); - - let parquet_schema = SchemaType::group_type_builder("schema") - .with_fields(vec![Arc::new(list_type)]) - .build() - .unwrap(); - - let (int96_val, expected_micros) = make_int96_test_value(); - - let file = File::create(&file_path).unwrap(); - let mut writer = - SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); - - // Write a single row with a list containing one INT96 element. - // def=3: list present (1) + repeated group (2) + element present (3) - // rep=0: start of a new list - let mut row_group = writer.next_row_group().unwrap(); - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&[int96_val], Some(&[3]), Some(&[0])) - .unwrap(); - col.close().unwrap(); - } - row_group.close().unwrap(); - writer.close().unwrap(); - - let iceberg_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional( - 1, - "timestamps", - Type::List(crate::spec::ListType { - element_field: NestedField::optional( - 2, - "element", - Type::Primitive(PrimitiveType::Timestamp), - ) - .into(), - }), - ) - .into(), - ]) - .build() - .unwrap(), - ); - - let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; - - assert_eq!(batches.len(), 1); - let list_array = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected ListArray"); - let ts_array = list_array - .values() - .as_any() - .downcast_ref::() - .expect("Expected TimestampMicrosecondArray inside list"); - - assert_eq!( - ts_array.value(0), - expected_micros, - "INT96 in list: 
got {}, expected {expected_micros}", - ts_array.value(0) - ); - } - - #[tokio::test] - async fn test_read_int96_timestamps_in_map() { - use arrow_array::{MapArray, TimestampMicrosecondArray}; - use parquet::basic::{Repetition, Type as PhysicalType}; - use parquet::data_type::{ByteArrayType, Int96Type}; - use parquet::file::writer::SerializedFileWriter; - use parquet::schema::types::Type as SchemaType; - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_path = format!("{table_location}/map_int96.parquet"); - - // MAP encoding: - // optional group ts_map (MAP) { - // repeated group key_value { - // required binary key (UTF8); - // optional int96 value; - // } - // } - let key_type = SchemaType::primitive_type_builder("key", PhysicalType::BYTE_ARRAY) - .with_repetition(Repetition::REQUIRED) - .with_logical_type(Some(parquet::basic::LogicalType::String)) - .with_id(Some(2)) - .build() - .unwrap(); - - let value_type = SchemaType::primitive_type_builder("value", PhysicalType::INT96) - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(3)) - .build() - .unwrap(); - - let key_value_group = SchemaType::group_type_builder("key_value") - .with_repetition(Repetition::REPEATED) - .with_fields(vec![Arc::new(key_type), Arc::new(value_type)]) - .build() - .unwrap(); - - let map_type = SchemaType::group_type_builder("ts_map") - .with_repetition(Repetition::OPTIONAL) - .with_id(Some(1)) - .with_logical_type(Some(parquet::basic::LogicalType::Map)) - .with_fields(vec![Arc::new(key_value_group)]) - .build() - .unwrap(); - - let parquet_schema = SchemaType::group_type_builder("schema") - .with_fields(vec![Arc::new(map_type)]) - .build() - .unwrap(); - - let (int96_val, expected_micros) = make_int96_test_value(); - - let file = File::create(&file_path).unwrap(); - let mut writer = - SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); - - // Write a single row with a map 
containing one key-value pair. - // rep=0 for both columns: start of a new map. - // key def=2: map present (1) + key_value entry present (2), key is REQUIRED. - // value def=3: map present (1) + key_value entry present (2) + value present (3). - let mut row_group = writer.next_row_group().unwrap(); - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch( - &[parquet::data_type::ByteArray::from("event_time")], - Some(&[2]), - Some(&[0]), - ) - .unwrap(); - col.close().unwrap(); - } - { - let mut col = row_group.next_column().unwrap().unwrap(); - col.typed::() - .write_batch(&[int96_val], Some(&[3]), Some(&[0])) - .unwrap(); - col.close().unwrap(); - } - row_group.close().unwrap(); - writer.close().unwrap(); - - let iceberg_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional( - 1, - "ts_map", - Type::Map(crate::spec::MapType { - key_field: NestedField::required( - 2, - "key", - Type::Primitive(PrimitiveType::String), - ) - .into(), - value_field: NestedField::optional( - 3, - "value", - Type::Primitive(PrimitiveType::Timestamp), - ) - .into(), - }), - ) - .into(), - ]) - .build() - .unwrap(), - ); - - let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; - - assert_eq!(batches.len(), 1); - let map_array = batches[0] - .column(0) - .as_any() - .downcast_ref::() - .expect("Expected MapArray"); - let ts_array = map_array - .values() - .as_any() - .downcast_ref::() - .expect("Expected TimestampMicrosecondArray as map values"); - - assert_eq!( - ts_array.value(0), - expected_micros, - "INT96 in map: got {}, expected {expected_micros}", - ts_array.value(0) - ); - } - - fn apply_predicate_to_batch( - predicate: Predicate, - schema: SchemaRef, - batch: RecordBatch, - ) -> BooleanArray { - use super::PredicateConverter; - - let bound = predicate.bind(schema, true).unwrap(); - - // Build a trivial Parquet schema with one float column at field id 4 - let 
message_type = " - message schema { - optional float qux = 4; - } - "; - let parquet_type = parse_message_type(message_type).expect("parse schema"); - let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_type)); - - let column_map = HashMap::from([(4i32, 0usize)]); - let column_indices = vec![0usize]; - - let mut converter = PredicateConverter { - parquet_schema: &parquet_schema, - column_map: &column_map, - column_indices: &column_indices, - }; - - let mut predicate_fn = visit(&mut converter, &bound).unwrap(); - predicate_fn(batch).unwrap() - } - - #[test] - fn test_predicate_converter_nan() { - use arrow_array::Float32Array; - - let schema = table_schema_simple(); - let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( - "qux", - DataType::Float32, - true, - )])); - let values = vec![Some(1.0f32), Some(f32::NAN), None, Some(0.0f32)]; - - // is_nan: non-null-propagating per Java's implementation - NULL → false - let batch = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Float32Array::from( - values.clone(), - ))]) - .unwrap(); - let result = - apply_predicate_to_batch(Reference::new("qux").is_nan(), schema.clone(), batch); - assert_eq!( - [ - result.value(0), - result.value(1), - result.value(2), - result.value(3) - ], - [false, true, false, false] - ); - assert!(!result.is_null(2)); - - // not_nan: non-null-propagating per Java's implementation - NULL → true - let batch = - RecordBatch::try_new(arrow_schema, vec![Arc::new(Float32Array::from(values))]).unwrap(); - let result = apply_predicate_to_batch(Reference::new("qux").is_not_nan(), schema, batch); - assert_eq!( - [ - result.value(0), - result.value(1), - result.value(2), - result.value(3) - ], - [true, false, true, true] - ); - assert!(!result.is_null(2)); - } -} diff --git a/crates/iceberg/src/arrow/reader/file_reader.rs b/crates/iceberg/src/arrow/reader/file_reader.rs new file mode 100644 index 0000000000..79fbcc7960 --- /dev/null +++ b/crates/iceberg/src/arrow/reader/file_reader.rs 
@@ -0,0 +1,368 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Async Parquet file reader that adapts an Iceberg `FileRead` to parquet's `AsyncFileReader`. + +use std::ops::Range; +use std::sync::Arc; + +use bytes::Bytes; +use futures::future::BoxFuture; +use futures::{FutureExt, StreamExt, TryFutureExt, TryStreamExt}; +use parquet::arrow::arrow_reader::ArrowReaderOptions; +use parquet::arrow::async_reader::AsyncFileReader; +use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader}; + +use super::ParquetReadOptions; +use crate::io::{FileMetadata, FileRead}; + +/// ArrowFileReader is a wrapper around a FileRead that impls parquets AsyncFileReader. +pub struct ArrowFileReader { + meta: FileMetadata, + parquet_read_options: ParquetReadOptions, + r: Box, +} + +impl ArrowFileReader { + /// Create a new ArrowFileReader + pub fn new(meta: FileMetadata, r: Box) -> Self { + Self { + meta, + parquet_read_options: ParquetReadOptions::builder().build(), + r, + } + } + + /// Configure all Parquet read options. 
+ pub(crate) fn with_parquet_read_options(mut self, options: ParquetReadOptions) -> Self { + self.parquet_read_options = options; + self + } +} + +impl AsyncFileReader for ArrowFileReader { + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, parquet::errors::Result> { + Box::pin( + self.r + .read(range.start..range.end) + .map_err(|err| parquet::errors::ParquetError::External(Box::new(err))), + ) + } + + /// Override the default `get_byte_ranges` which calls `get_bytes` sequentially. + /// The parquet reader calls this to fetch column chunks for a row group, so + /// without this override each column chunk is a serial round-trip to object storage. + /// Adapted from object_store's `coalesce_ranges` in `util.rs`. + fn get_byte_ranges( + &mut self, + ranges: Vec>, + ) -> BoxFuture<'_, parquet::errors::Result>> { + let coalesce_bytes = self.parquet_read_options.range_coalesce_bytes(); + let concurrency = self.parquet_read_options.range_fetch_concurrency().max(1); + + async move { + // Merge nearby ranges to reduce the number of object store requests. + let fetch_ranges = merge_ranges(&ranges, coalesce_bytes); + let r = &self.r; + + // Fetch merged ranges concurrently. + let fetched: Vec = futures::stream::iter(fetch_ranges.iter().cloned()) + .map(|range| async move { + r.read(range) + .await + .map_err(|e| parquet::errors::ParquetError::External(Box::new(e))) + }) + .buffered(concurrency) + .try_collect() + .await?; + + // Slice the fetched data back into the originally requested ranges. 
+ Ok(ranges + .iter() + .map(|range| { + let idx = fetch_ranges.partition_point(|v| v.start <= range.start) - 1; + let fetch_range = &fetch_ranges[idx]; + let fetch_bytes = &fetched[idx]; + let start = (range.start - fetch_range.start) as usize; + let end = (range.end - fetch_range.start) as usize; + fetch_bytes.slice(start..end.min(fetch_bytes.len())) + }) + .collect()) + } + .boxed() + } + + // TODO: currently we don't respect `ArrowReaderOptions` cause it don't expose any method to access the option field + // we will fix it after `v55.1.0` is released in https://github.com/apache/arrow-rs/issues/7393 + fn get_metadata( + &mut self, + _options: Option<&'_ ArrowReaderOptions>, + ) -> BoxFuture<'_, parquet::errors::Result>> { + async move { + let reader = ParquetMetaDataReader::new() + .with_prefetch_hint(self.parquet_read_options.metadata_size_hint()) + // Set the page policy first because it updates both column and offset policies. + .with_page_index_policy(PageIndexPolicy::from( + self.parquet_read_options.preload_page_index(), + )) + .with_column_index_policy(PageIndexPolicy::from( + self.parquet_read_options.preload_column_index(), + )) + .with_offset_index_policy(PageIndexPolicy::from( + self.parquet_read_options.preload_offset_index(), + )); + let size = self.meta.size; + let meta = reader.load_and_finish(self, size).await?; + + Ok(Arc::new(meta)) + } + .boxed() + } +} + +/// Merge overlapping or nearby byte ranges, combining ranges with gaps <= `coalesce` bytes. +/// Adapted from object_store's `merge_ranges` in `util.rs`. 
+fn merge_ranges(ranges: &[Range], coalesce: u64) -> Vec> { + if ranges.is_empty() { + return vec![]; + } + + let mut ranges = ranges.to_vec(); + ranges.sort_unstable_by_key(|r| r.start); + + let mut merged = Vec::with_capacity(ranges.len()); + let mut start_idx = 0; + let mut end_idx = 1; + + while start_idx != ranges.len() { + let mut range_end = ranges[start_idx].end; + + while end_idx != ranges.len() + && ranges[end_idx] + .start + .checked_sub(range_end) + .map(|delta| delta <= coalesce) + .unwrap_or(true) + { + range_end = range_end.max(ranges[end_idx].end); + end_idx += 1; + } + + merged.push(ranges[start_idx].start..range_end); + start_idx = end_idx; + end_idx += 1; + } + + merged +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use parquet::arrow::async_reader::AsyncFileReader; + + use super::{ArrowFileReader, ParquetReadOptions, merge_ranges}; + use crate::io::{FileMetadata, FileRead}; + + #[test] + fn test_merge_ranges_empty() { + assert_eq!(merge_ranges(&[], 1024), Vec::>::new()); + } + + #[test] + fn test_merge_ranges_no_coalesce() { + // Ranges far apart should not be merged + let ranges = vec![0..100, 1_000_000..1_000_100]; + let merged = merge_ranges(&ranges, 1024); + assert_eq!(merged, vec![0..100, 1_000_000..1_000_100]); + } + + #[test] + fn test_merge_ranges_coalesce() { + // Ranges within the gap threshold should be merged + let ranges = vec![0..100, 200..300, 500..600]; + let merged = merge_ranges(&ranges, 1024); + assert_eq!(merged, vec![0..600]); + } + + #[test] + fn test_merge_ranges_overlapping() { + let ranges = vec![0..200, 100..300]; + let merged = merge_ranges(&ranges, 0); + assert_eq!(merged, vec![0..300]); + } + + #[test] + fn test_merge_ranges_unsorted() { + let ranges = vec![500..600, 0..100, 200..300]; + let merged = merge_ranges(&ranges, 1024); + assert_eq!(merged, vec![0..600]); + } + + /// Mock FileRead backed by a flat byte buffer. 
+ struct MockFileRead { + data: bytes::Bytes, + } + + impl MockFileRead { + fn new(size: usize) -> Self { + // Fill with sequential byte values so slices are verifiable. + let data: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + Self { + data: bytes::Bytes::from(data), + } + } + } + + #[async_trait::async_trait] + impl FileRead for MockFileRead { + async fn read(&self, range: Range) -> crate::Result { + Ok(self.data.slice(range.start as usize..range.end as usize)) + } + } + + #[tokio::test] + async fn test_get_byte_ranges_no_coalesce() { + let mock = MockFileRead::new(2048); + let expected_0 = mock.data.slice(0..100); + let expected_1 = mock.data.slice(1500..1600); + + let mut reader = ArrowFileReader::new(FileMetadata { size: 2048 }, Box::new(mock)) + .with_parquet_read_options( + ParquetReadOptions::builder() + .with_range_coalesce_bytes(0) + .build(), + ); + + let result = reader + .get_byte_ranges(vec![0..100, 1500..1600]) + .await + .unwrap(); + + assert_eq!(result.len(), 2); + assert_eq!(result[0], expected_0); + assert_eq!(result[1], expected_1); + } + + #[tokio::test] + async fn test_get_byte_ranges_with_coalesce() { + let mock = MockFileRead::new(1024); + let expected_0 = mock.data.slice(0..100); + let expected_1 = mock.data.slice(200..300); + let expected_2 = mock.data.slice(500..600); + + let mut reader = ArrowFileReader::new(FileMetadata { size: 1024 }, Box::new(mock)) + .with_parquet_read_options( + ParquetReadOptions::builder() + .with_range_coalesce_bytes(1024) + .build(), + ); + + // All ranges within coalesce threshold — should merge into one fetch. 
+ let result = reader + .get_byte_ranges(vec![0..100, 200..300, 500..600]) + .await + .unwrap(); + + assert_eq!(result.len(), 3); + assert_eq!(result[0], expected_0); + assert_eq!(result[1], expected_1); + assert_eq!(result[2], expected_2); + } + + #[tokio::test] + async fn test_get_byte_ranges_empty() { + let mock = MockFileRead::new(1024); + let mut reader = ArrowFileReader::new(FileMetadata { size: 1024 }, Box::new(mock)); + + let result = reader.get_byte_ranges(vec![]).await.unwrap(); + assert!(result.is_empty()); + } + + #[tokio::test] + async fn test_get_byte_ranges_coalesce_max() { + let mock = MockFileRead::new(2048); + let expected_0 = mock.data.slice(0..100); + let expected_1 = mock.data.slice(1500..1600); + + let mut reader = ArrowFileReader::new(FileMetadata { size: 2048 }, Box::new(mock)) + .with_parquet_read_options( + ParquetReadOptions::builder() + .with_range_coalesce_bytes(u64::MAX) + .build(), + ); + + // u64::MAX coalesce — all ranges merge into a single fetch. + let result = reader + .get_byte_ranges(vec![0..100, 1500..1600]) + .await + .unwrap(); + + assert_eq!(result.len(), 2); + assert_eq!(result[0], expected_0); + assert_eq!(result[1], expected_1); + } + + #[tokio::test] + async fn test_get_byte_ranges_concurrency_zero() { + // concurrency=0 is clamped to 1, so this should not hang. 
+ let mock = MockFileRead::new(1024); + let expected = mock.data.slice(0..100); + + let mut reader = ArrowFileReader::new(FileMetadata { size: 1024 }, Box::new(mock)) + .with_parquet_read_options( + ParquetReadOptions::builder() + .with_range_fetch_concurrency(0) + .build(), + ); + + let result = reader + .get_byte_ranges(vec![0..100, 200..300]) + .await + .unwrap(); + assert_eq!(result.len(), 2); + assert_eq!(result[0], expected); + } + + #[tokio::test] + async fn test_get_byte_ranges_concurrency_one() { + let mock = MockFileRead::new(2048); + let expected_0 = mock.data.slice(0..100); + let expected_1 = mock.data.slice(500..600); + let expected_2 = mock.data.slice(1500..1600); + + let mut reader = ArrowFileReader::new(FileMetadata { size: 2048 }, Box::new(mock)) + .with_parquet_read_options( + ParquetReadOptions::builder() + .with_range_coalesce_bytes(0) + .with_range_fetch_concurrency(1) + .build(), + ); + + // concurrency=1 with no coalescing — sequential fetches. + let result = reader + .get_byte_ranges(vec![0..100, 500..600, 1500..1600]) + .await + .unwrap(); + + assert_eq!(result.len(), 3); + assert_eq!(result[0], expected_0); + assert_eq!(result[1], expected_1); + assert_eq!(result[2], expected_2); + } +} diff --git a/crates/iceberg/src/arrow/reader/mod.rs b/crates/iceberg/src/arrow/reader/mod.rs new file mode 100644 index 0000000000..c6c41accb7 --- /dev/null +++ b/crates/iceberg/src/arrow/reader/mod.rs @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Parquet file data reader + +use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; +use crate::io::FileIO; +use crate::util::available_parallelism; + +/// Default gap between byte ranges below which they are coalesced into a +/// single request. Matches object_store's `OBJECT_STORE_COALESCE_DEFAULT`. +const DEFAULT_RANGE_COALESCE_BYTES: u64 = 1024 * 1024; + +/// Default maximum number of coalesced byte ranges fetched concurrently. +/// Matches object_store's `OBJECT_STORE_COALESCE_PARALLEL`. +const DEFAULT_RANGE_FETCH_CONCURRENCY: usize = 10; + +/// Default number of bytes to prefetch when parsing Parquet footer metadata. +/// Matches DataFusion's default `ParquetOptions::metadata_size_hint`. 
+const DEFAULT_METADATA_SIZE_HINT: usize = 512 * 1024; + +mod file_reader; +mod options; +mod pipeline; +mod positional_deletes; +mod predicate_visitor; +mod projection; +mod row_filter; +pub use file_reader::ArrowFileReader; +pub(crate) use options::ParquetReadOptions; +use predicate_visitor::{CollectFieldIdVisitor, PredicateConverter}; +use projection::{add_fallback_field_ids_to_arrow_schema, apply_name_mapping_to_arrow_schema}; + +/// Builder to create ArrowReader +pub struct ArrowReaderBuilder { + batch_size: Option, + file_io: FileIO, + concurrency_limit_data_files: usize, + row_group_filtering_enabled: bool, + row_selection_enabled: bool, + parquet_read_options: ParquetReadOptions, +} + +impl ArrowReaderBuilder { + /// Create a new ArrowReaderBuilder + pub fn new(file_io: FileIO) -> Self { + let num_cpus = available_parallelism().get(); + + ArrowReaderBuilder { + batch_size: None, + file_io, + concurrency_limit_data_files: num_cpus, + row_group_filtering_enabled: true, + row_selection_enabled: false, + parquet_read_options: ParquetReadOptions::builder().build(), + } + } + + /// Sets the max number of in flight data files that are being fetched + pub fn with_data_file_concurrency_limit(mut self, val: usize) -> Self { + self.concurrency_limit_data_files = val; + self + } + + /// Sets the desired size of batches in the response + /// to something other than the default + pub fn with_batch_size(mut self, batch_size: usize) -> Self { + self.batch_size = Some(batch_size); + self + } + + /// Determines whether to enable row group filtering. + pub fn with_row_group_filtering_enabled(mut self, row_group_filtering_enabled: bool) -> Self { + self.row_group_filtering_enabled = row_group_filtering_enabled; + self + } + + /// Determines whether to enable row selection. 
+ pub fn with_row_selection_enabled(mut self, row_selection_enabled: bool) -> Self { + self.row_selection_enabled = row_selection_enabled; + self + } + + /// Provide a hint as to the number of bytes to prefetch for parsing the Parquet metadata + /// + /// This hint can help reduce the number of fetch requests. For more details see the + /// [ParquetMetaDataReader documentation](https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html#method.with_prefetch_hint). + pub fn with_metadata_size_hint(mut self, metadata_size_hint: usize) -> Self { + self.parquet_read_options.metadata_size_hint = Some(metadata_size_hint); + self + } + + /// Sets the gap threshold for merging nearby byte ranges into a single request. + /// Ranges with gaps smaller than this value will be coalesced. + /// + /// Defaults to 1 MiB, matching object_store's OBJECT_STORE_COALESCE_DEFAULT. + pub fn with_range_coalesce_bytes(mut self, range_coalesce_bytes: u64) -> Self { + self.parquet_read_options.range_coalesce_bytes = range_coalesce_bytes; + self + } + + /// Sets the maximum number of merged byte ranges to fetch concurrently. + /// + /// Defaults to 10, matching object_store's OBJECT_STORE_COALESCE_PARALLEL. + pub fn with_range_fetch_concurrency(mut self, range_fetch_concurrency: usize) -> Self { + self.parquet_read_options.range_fetch_concurrency = range_fetch_concurrency; + self + } + + /// Build the ArrowReader. 
+ pub fn build(self) -> ArrowReader { + ArrowReader { + batch_size: self.batch_size, + file_io: self.file_io.clone(), + delete_file_loader: CachingDeleteFileLoader::new( + self.file_io.clone(), + self.concurrency_limit_data_files, + ), + concurrency_limit_data_files: self.concurrency_limit_data_files, + row_group_filtering_enabled: self.row_group_filtering_enabled, + row_selection_enabled: self.row_selection_enabled, + parquet_read_options: self.parquet_read_options, + } + } +} + +/// Reads data from Parquet files +#[derive(Clone)] +pub struct ArrowReader { + batch_size: Option, + file_io: FileIO, + delete_file_loader: CachingDeleteFileLoader, + + /// the maximum number of data files that can be fetched at the same time + concurrency_limit_data_files: usize, + + row_group_filtering_enabled: bool, + row_selection_enabled: bool, + parquet_read_options: ParquetReadOptions, +} diff --git a/crates/iceberg/src/arrow/reader/options.rs b/crates/iceberg/src/arrow/reader/options.rs new file mode 100644 index 0000000000..ae6a3ed18e --- /dev/null +++ b/crates/iceberg/src/arrow/reader/options.rs @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tunables for Parquet file I/O used by `ArrowReader`. 
+ +use typed_builder::TypedBuilder; + +use super::{ + DEFAULT_METADATA_SIZE_HINT, DEFAULT_RANGE_COALESCE_BYTES, DEFAULT_RANGE_FETCH_CONCURRENCY, +}; + +/// Options for tuning Parquet file I/O. +#[derive(Clone, Copy, Debug, TypedBuilder)] +#[builder(field_defaults(setter(prefix = "with_")))] +pub(crate) struct ParquetReadOptions { + /// Number of bytes to prefetch for parsing the Parquet metadata. + /// + /// This hint can help reduce the number of fetch requests. For more details see the + /// [ParquetMetaDataReader documentation](https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html#method.with_prefetch_hint). + /// + /// Defaults to 512 KiB, matching DataFusion's default `ParquetOptions::metadata_size_hint`. + #[builder(default = Some(DEFAULT_METADATA_SIZE_HINT))] + pub(crate) metadata_size_hint: Option, + /// Gap threshold for merging nearby byte ranges into a single request. + /// Ranges with gaps smaller than this value will be coalesced. + /// + /// Defaults to 1 MiB, matching object_store's `OBJECT_STORE_COALESCE_DEFAULT`. + #[builder(default = DEFAULT_RANGE_COALESCE_BYTES)] + pub(crate) range_coalesce_bytes: u64, + /// Maximum number of merged byte ranges to fetch concurrently. + /// + /// Defaults to 10, matching object_store's `OBJECT_STORE_COALESCE_PARALLEL`. + #[builder(default = DEFAULT_RANGE_FETCH_CONCURRENCY)] + pub(crate) range_fetch_concurrency: usize, + /// Whether to preload the column index when reading Parquet metadata. + #[builder(default = true)] + pub(crate) preload_column_index: bool, + /// Whether to preload the offset index when reading Parquet metadata. + #[builder(default = true)] + pub(crate) preload_offset_index: bool, + /// Whether to preload the page index when reading Parquet metadata. 
+ #[builder(default = false)] + pub(crate) preload_page_index: bool, +} + +impl ParquetReadOptions { + pub(crate) fn metadata_size_hint(&self) -> Option { + self.metadata_size_hint + } + + pub(crate) fn range_coalesce_bytes(&self) -> u64 { + self.range_coalesce_bytes + } + + pub(crate) fn range_fetch_concurrency(&self) -> usize { + self.range_fetch_concurrency + } + + pub(crate) fn preload_column_index(&self) -> bool { + self.preload_column_index + } + + pub(crate) fn preload_offset_index(&self) -> bool { + self.preload_offset_index + } + + pub(crate) fn preload_page_index(&self) -> bool { + self.preload_page_index + } +} diff --git a/crates/iceberg/src/arrow/reader/pipeline.rs b/crates/iceberg/src/arrow/reader/pipeline.rs new file mode 100644 index 0000000000..94059fc62b --- /dev/null +++ b/crates/iceberg/src/arrow/reader/pipeline.rs @@ -0,0 +1,1174 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! The main `ArrowReader` pipeline: reading a stream of `FileScanTask`s, +//! opening Parquet files and resolving schemas, then wiring projection, +//! predicates, row-group / row selection, and delete handling into a stream +//! of transformed Arrow `RecordBatch`es. 
+ +use std::sync::Arc; + +use futures::{StreamExt, TryStreamExt}; +use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; +use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, ParquetRecordBatchStreamBuilder}; + +use super::{ + ArrowFileReader, ArrowReader, ParquetReadOptions, add_fallback_field_ids_to_arrow_schema, + apply_name_mapping_to_arrow_schema, +}; +use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; +use crate::arrow::int96::coerce_int96_timestamps; +use crate::arrow::record_batch_transformer::RecordBatchTransformerBuilder; +use crate::error::Result; +use crate::io::{FileIO, FileMetadata}; +use crate::metadata_columns::{RESERVED_FIELD_ID_FILE, is_metadata_field}; +use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream}; +use crate::spec::Datum; +use crate::{Error, ErrorKind}; + +impl ArrowReader { + /// Take a stream of FileScanTasks and reads all the files. + /// Returns a stream of Arrow RecordBatches containing the data from the files + pub fn read(self, tasks: FileScanTaskStream) -> Result { + let file_io = self.file_io.clone(); + let batch_size = self.batch_size; + let concurrency_limit_data_files = self.concurrency_limit_data_files; + let row_group_filtering_enabled = self.row_group_filtering_enabled; + let row_selection_enabled = self.row_selection_enabled; + let parquet_read_options = self.parquet_read_options; + + // Fast-path for single concurrency to avoid overhead of try_flatten_unordered + let stream: ArrowRecordBatchStream = if concurrency_limit_data_files == 1 { + Box::pin( + tasks + .and_then(move |task| { + let file_io = file_io.clone(); + + Self::process_file_scan_task( + task, + batch_size, + file_io, + self.delete_file_loader.clone(), + row_group_filtering_enabled, + row_selection_enabled, + parquet_read_options, + ) + }) + .map_err(|err| { + Error::new(ErrorKind::Unexpected, "file scan task generate failed") + .with_source(err) + }) + .try_flatten(), + ) + } else { + Box::pin( + 
tasks + .map_ok(move |task| { + let file_io = file_io.clone(); + + Self::process_file_scan_task( + task, + batch_size, + file_io, + self.delete_file_loader.clone(), + row_group_filtering_enabled, + row_selection_enabled, + parquet_read_options, + ) + }) + .map_err(|err| { + Error::new(ErrorKind::Unexpected, "file scan task generate failed") + .with_source(err) + }) + .try_buffer_unordered(concurrency_limit_data_files) + .try_flatten_unordered(concurrency_limit_data_files), + ) + }; + + Ok(stream) + } + + async fn process_file_scan_task( + task: FileScanTask, + batch_size: Option, + file_io: FileIO, + delete_file_loader: CachingDeleteFileLoader, + row_group_filtering_enabled: bool, + row_selection_enabled: bool, + parquet_read_options: ParquetReadOptions, + ) -> Result { + let should_load_page_index = + (row_selection_enabled && task.predicate.is_some()) || !task.deletes.is_empty(); + let mut parquet_read_options = parquet_read_options; + parquet_read_options.preload_page_index = should_load_page_index; + + let delete_filter_rx = + delete_file_loader.load_deletes(&task.deletes, Arc::clone(&task.schema)); + + // Open the Parquet file once, loading its metadata + let (parquet_file_reader, arrow_metadata) = Self::open_parquet_file( + &task.data_file_path, + &file_io, + task.file_size_in_bytes, + parquet_read_options, + ) + .await?; + + // Check if Parquet file has embedded field IDs + // Corresponds to Java's ParquetSchemaUtil.hasIds() + // Reference: parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java:118 + let missing_field_ids = arrow_metadata + .schema() + .fields() + .iter() + .next() + .is_some_and(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()); + + // Three-branch schema resolution strategy matching Java's ReadConf constructor + // + // Per Iceberg spec Column Projection rules: + // "Columns in Iceberg data files are selected by field id. 
The table schema's column + // names and order may change after a data file is written, and projection must be done + // using field ids." + // https://iceberg.apache.org/spec/#column-projection + // + // When Parquet files lack field IDs (e.g., Hive/Spark migrations via add_files), + // we must assign field IDs BEFORE reading data to enable correct projection. + // + // Java's ReadConf determines field ID strategy: + // - Branch 1: hasIds(fileSchema) → trust embedded field IDs, use pruneColumns() + // - Branch 2: nameMapping present → applyNameMapping(), then pruneColumns() + // - Branch 3: fallback → addFallbackIds(), then pruneColumnsFallback() + let arrow_metadata = if missing_field_ids { + // Parquet file lacks field IDs - must assign them before reading + let arrow_schema = if let Some(name_mapping) = &task.name_mapping { + // Branch 2: Apply name mapping to assign correct Iceberg field IDs + // Per spec rule #2: "Use schema.name-mapping.default metadata to map field id + // to columns without field id" + // Corresponds to Java's ParquetSchemaUtil.applyNameMapping() + apply_name_mapping_to_arrow_schema( + Arc::clone(arrow_metadata.schema()), + name_mapping, + )? + } else { + // Branch 3: No name mapping - use position-based fallback IDs + // Corresponds to Java's ParquetSchemaUtil.addFallbackIds() + add_fallback_field_ids_to_arrow_schema(arrow_metadata.schema()) + }; + + let options = ArrowReaderOptions::new().with_schema(arrow_schema); + ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options).map_err( + |e| { + Error::new( + ErrorKind::Unexpected, + "Failed to create ArrowReaderMetadata with field ID schema", + ) + .with_source(e) + }, + )? + } else { + // Branch 1: File has embedded field IDs - trust them + arrow_metadata + }; + + // Coerce INT96 timestamp columns to the resolution specified by the Iceberg schema. + // This must happen before building the stream reader to avoid i64 overflow in arrow-rs. 
+ let arrow_metadata = if let Some(coerced_schema) = + coerce_int96_timestamps(arrow_metadata.schema(), &task.schema) + { + let options = ArrowReaderOptions::new().with_schema(Arc::clone(&coerced_schema)); + ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options).map_err( + |e| { + Error::new( + ErrorKind::Unexpected, + format!( + "Failed to create ArrowReaderMetadata with INT96-coerced schema: {coerced_schema}" + ), + ) + .with_source(e) + }, + )? + } else { + arrow_metadata + }; + + // Build the stream reader, reusing the already-opened file reader + let mut record_batch_stream_builder = + ParquetRecordBatchStreamBuilder::new_with_metadata(parquet_file_reader, arrow_metadata); + + // Filter out metadata fields for Parquet projection (they don't exist in files) + let project_field_ids_without_metadata: Vec = task + .project_field_ids + .iter() + .filter(|&&id| !is_metadata_field(id)) + .copied() + .collect(); + + // Create projection mask based on field IDs + // - If file has embedded IDs: field-ID-based projection (missing_field_ids=false) + // - If name mapping applied: field-ID-based projection (missing_field_ids=true but IDs now match) + // - If fallback IDs: position-based projection (missing_field_ids=true) + let projection_mask = Self::get_arrow_projection_mask( + &project_field_ids_without_metadata, + &task.schema, + record_batch_stream_builder.parquet_schema(), + record_batch_stream_builder.schema(), + missing_field_ids, // Whether to use position-based (true) or field-ID-based (false) projection + )?; + + record_batch_stream_builder = + record_batch_stream_builder.with_projection(projection_mask.clone()); + + // RecordBatchTransformer performs any transformations required on the RecordBatches + // that come back from the file, such as type promotion, default column insertion, + // column re-ordering, partition constants, and virtual field addition (like _file) + let mut record_batch_transformer_builder = + 
RecordBatchTransformerBuilder::new(task.schema_ref(), task.project_field_ids()); + + // Add the _file metadata column if it's in the projected fields + if task.project_field_ids().contains(&RESERVED_FIELD_ID_FILE) { + let file_datum = Datum::string(task.data_file_path.clone()); + record_batch_transformer_builder = + record_batch_transformer_builder.with_constant(RESERVED_FIELD_ID_FILE, file_datum); + } + + if let (Some(partition_spec), Some(partition_data)) = + (task.partition_spec.clone(), task.partition.clone()) + { + record_batch_transformer_builder = + record_batch_transformer_builder.with_partition(partition_spec, partition_data)?; + } + + let mut record_batch_transformer = record_batch_transformer_builder.build(); + + if let Some(batch_size) = batch_size { + record_batch_stream_builder = record_batch_stream_builder.with_batch_size(batch_size); + } + + let delete_filter = delete_filter_rx.await.unwrap()?; + let delete_predicate = delete_filter.build_equality_delete_predicate(&task).await?; + + // In addition to the optional predicate supplied in the `FileScanTask`, + // we also have an optional predicate resulting from equality delete files. + // If both are present, we logical-AND them together to form a single filter + // predicate that we can pass to the `RecordBatchStreamBuilder`. + let final_predicate = match (&task.predicate, delete_predicate) { + (None, None) => None, + (Some(predicate), None) => Some(predicate.clone()), + (None, Some(ref predicate)) => Some(predicate.clone()), + (Some(filter_predicate), Some(delete_predicate)) => { + Some(filter_predicate.clone().and(delete_predicate)) + } + }; + + // There are three possible sources for potential lists of selected RowGroup indices, + // and two for `RowSelection`s. 
+ // Selected RowGroup index lists can come from three sources: + // * When task.start and task.length specify a byte range (file splitting); + // * When there are equality delete files that are applicable; + // * When there is a scan predicate and row_group_filtering_enabled = true. + // `RowSelection`s can be created in either or both of the following cases: + // * When there are positional delete files that are applicable; + // * When there is a scan predicate and row_selection_enabled = true + // Note that row group filtering from predicates only happens when + // there is a scan predicate AND row_group_filtering_enabled = true, + // but we perform row selection filtering if there are applicable + // equality delete files OR (there is a scan predicate AND row_selection_enabled), + // since the only implemented method of applying positional deletes is + // by using a `RowSelection`. + let mut selected_row_group_indices = None; + let mut row_selection = None; + + // Filter row groups based on byte range from task.start and task.length. + // If both start and length are 0, read the entire file (backwards compatibility). 
+ if task.start != 0 || task.length != 0 { + let byte_range_filtered_row_groups = Self::filter_row_groups_by_byte_range( + record_batch_stream_builder.metadata(), + task.start, + task.length, + )?; + selected_row_group_indices = Some(byte_range_filtered_row_groups); + } + + if let Some(predicate) = final_predicate { + let (iceberg_field_ids, field_id_map) = Self::build_field_id_set_and_map( + record_batch_stream_builder.parquet_schema(), + &predicate, + )?; + + let row_filter = Self::get_row_filter( + &predicate, + record_batch_stream_builder.parquet_schema(), + &iceberg_field_ids, + &field_id_map, + )?; + record_batch_stream_builder = record_batch_stream_builder.with_row_filter(row_filter); + + if row_group_filtering_enabled { + let predicate_filtered_row_groups = Self::get_selected_row_group_indices( + &predicate, + record_batch_stream_builder.metadata(), + &field_id_map, + &task.schema, + )?; + + // Merge predicate-based filtering with byte range filtering (if present) + // by taking the intersection of both filters + selected_row_group_indices = match selected_row_group_indices { + Some(byte_range_filtered) => { + // Keep only row groups that are in both filters + let intersection: Vec = byte_range_filtered + .into_iter() + .filter(|idx| predicate_filtered_row_groups.contains(idx)) + .collect(); + Some(intersection) + } + None => Some(predicate_filtered_row_groups), + }; + } + + if row_selection_enabled { + row_selection = Some(Self::get_row_selection_for_filter_predicate( + &predicate, + record_batch_stream_builder.metadata(), + &selected_row_group_indices, + &field_id_map, + &task.schema, + )?); + } + } + + let positional_delete_indexes = delete_filter.get_delete_vector(&task); + + if let Some(positional_delete_indexes) = positional_delete_indexes { + let delete_row_selection = { + let positional_delete_indexes = positional_delete_indexes.lock().unwrap(); + + Self::build_deletes_row_selection( + record_batch_stream_builder.metadata().row_groups(), + 
&selected_row_group_indices, + &positional_delete_indexes, + ) + }?; + + // merge the row selection from the delete files with the row selection + // from the filter predicate, if there is one from the filter predicate + row_selection = match row_selection { + None => Some(delete_row_selection), + Some(filter_row_selection) => { + Some(filter_row_selection.intersection(&delete_row_selection)) + } + }; + } + + if let Some(row_selection) = row_selection { + record_batch_stream_builder = + record_batch_stream_builder.with_row_selection(row_selection); + } + + if let Some(selected_row_group_indices) = selected_row_group_indices { + record_batch_stream_builder = + record_batch_stream_builder.with_row_groups(selected_row_group_indices); + } + + // Build the batch stream and send all the RecordBatches that it generates + // to the requester. + let record_batch_stream = + record_batch_stream_builder + .build()? + .map(move |batch| match batch { + Ok(batch) => { + // Process the record batch (type promotion, column reordering, virtual fields, etc.) + record_batch_transformer.process_record_batch(batch) + } + Err(err) => Err(err.into()), + }); + + Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream) + } + + /// Opens a Parquet file and loads its metadata, returning both the reader and metadata. + /// The reader can be reused to build a `ParquetRecordBatchStreamBuilder` without + /// reopening the file. 
+ pub(crate) async fn open_parquet_file( + data_file_path: &str, + file_io: &FileIO, + file_size_in_bytes: u64, + parquet_read_options: ParquetReadOptions, + ) -> Result<(ArrowFileReader, ArrowReaderMetadata)> { + let parquet_file = file_io.new_input(data_file_path)?; + let parquet_reader = parquet_file.reader().await?; + let mut reader = ArrowFileReader::new( + FileMetadata { + size: file_size_in_bytes, + }, + parquet_reader, + ) + .with_parquet_read_options(parquet_read_options); + + let arrow_metadata = ArrowReaderMetadata::load_async(&mut reader, Default::default()) + .await + .map_err(|e| { + Error::new(ErrorKind::Unexpected, "Failed to load Parquet metadata").with_source(e) + })?; + + Ok((reader, arrow_metadata)) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::fs::File; + use std::sync::Arc; + + use arrow_array::cast::AsArray; + use arrow_array::{Array, ArrayRef, RecordBatch}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use futures::TryStreamExt; + use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; + use parquet::basic::Compression; + use parquet::file::properties::WriterProperties; + use tempfile::TempDir; + + use crate::arrow::ArrowReaderBuilder; + use crate::io::FileIO; + use crate::scan::{FileScanTask, FileScanTaskStream}; + use crate::spec::{DataFileFormat, NestedField, PrimitiveType, Schema, SchemaRef, Type}; + + // INT96 encoding: [nanos_low_u32, nanos_high_u32, julian_day_u32] + // Julian day 2_440_588 = Unix epoch (1970-01-01) + const UNIX_EPOCH_JULIAN: i64 = 2_440_588; + const MICROS_PER_DAY: i64 = 86_400_000_000; + // Noon on 3333-01-01 (Julian day 2_953_529) — outside the i64 nanosecond range (~1677-2262). 
+ const INT96_TEST_NANOS_WITHIN_DAY: u64 = 43_200_000_000_000; + const INT96_TEST_JULIAN_DAY: u32 = 2_953_529; + + fn make_int96_test_value() -> (parquet::data_type::Int96, i64) { + let mut val = parquet::data_type::Int96::new(); + val.set_data( + (INT96_TEST_NANOS_WITHIN_DAY & 0xFFFFFFFF) as u32, + (INT96_TEST_NANOS_WITHIN_DAY >> 32) as u32, + INT96_TEST_JULIAN_DAY, + ); + let expected_micros = (INT96_TEST_JULIAN_DAY as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (INT96_TEST_NANOS_WITHIN_DAY / 1_000) as i64; + (val, expected_micros) + } + + async fn read_int96_batches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + ) -> Vec { + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let file_size = std::fs::metadata(file_path).unwrap().len(); + let task = FileScanTask { + file_size_in_bytes: file_size, + start: 0, + length: file_size, + record_count: None, + data_file_path: file_path.to_string(), + data_file_format: DataFileFormat::Parquet, + schema, + project_field_ids, + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; + reader.read(tasks).unwrap().try_collect().await.unwrap() + } + + // ArrowWriter cannot write INT96, so we use SerializedFileWriter directly. 
+ fn write_int96_parquet_file( + table_location: &str, + filename: &str, + with_field_ids: bool, + ) -> (String, Vec) { + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::{Int32Type, Int96, Int96Type}; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let file_path = format!("{table_location}/{filename}"); + + let mut ts_builder = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL); + let mut id_builder = SchemaType::primitive_type_builder("id", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED); + + if with_field_ids { + ts_builder = ts_builder.with_id(Some(1)); + id_builder = id_builder.with_id(Some(2)); + } + + let schema = SchemaType::group_type_builder("schema") + .with_fields(vec![ + Arc::new(ts_builder.build().unwrap()), + Arc::new(id_builder.build().unwrap()), + ]) + .build() + .unwrap(); + + // Dates outside the i64 nanosecond range (~1677-2262) overflow without coercion. 
+ const NOON_NANOS: u64 = INT96_TEST_NANOS_WITHIN_DAY; + const JULIAN_3333: u32 = INT96_TEST_JULIAN_DAY; + const JULIAN_2100: u32 = 2_488_070; + + let test_data: Vec<(u32, u32, u32, i64)> = vec![ + // 3333-01-01 00:00:00 + ( + 0, + 0, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), + // 3333-01-01 12:00:00 + ( + (NOON_NANOS & 0xFFFFFFFF) as u32, + (NOON_NANOS >> 32) as u32, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (NOON_NANOS / 1_000) as i64, + ), + // 2100-01-01 00:00:00 + ( + 0, + 0, + JULIAN_2100, + (JULIAN_2100 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), + ]; + + let int96_values: Vec = test_data + .iter() + .map(|(lo, hi, day, _)| { + let mut v = Int96::new(); + v.set_data(*lo, *hi, *day); + v + }) + .collect(); + + let id_values: Vec = (0..test_data.len() as i32).collect(); + let expected_micros: Vec = test_data.iter().map(|(_, _, _, m)| *m).collect(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(schema), Default::default()).unwrap(); + + let mut row_group = writer.next_row_group().unwrap(); + { + // def=1: ts is OPTIONAL and present. No repetition levels (top-level columns). 
+ let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&int96_values, Some(&vec![1; test_data.len()]), None) + .unwrap(); + col.close().unwrap(); + } + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&id_values, None, None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + (file_path, expected_micros) + } + + async fn assert_int96_read_matches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + expected_micros: &[i64], + ) { + use arrow_array::TimestampMicrosecondArray; + + let batches = read_int96_batches(file_path, schema, project_field_ids).await; + + assert_eq!(batches.len(), 1); + let ts_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray"); + + for (i, expected) in expected_micros.iter().enumerate() { + assert_eq!( + ts_array.value(i), + *expected, + "Row {i}: got {}, expected {expected}", + ts_array.value(i) + ); + } + } + + /// Test that concurrency=1 reads all files correctly and in deterministic order. + /// This verifies the fast-path optimization for single concurrency. 
+ #[tokio::test] + async fn test_read_with_concurrency_one() { + use arrow_array::Int32Array; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "file_num", Type::Primitive(PrimitiveType::Int)) + .into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + Field::new("file_num", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "2".to_string(), + )])), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + // Create 3 parquet files with different data + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + for file_num in 0..3 { + let id_data = Arc::new(Int32Array::from_iter_values( + file_num * 10..(file_num + 1) * 10, + )) as ArrayRef; + let file_num_data = Arc::new(Int32Array::from(vec![file_num; 10])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![id_data, file_num_data]).unwrap(); + + let file = File::create(format!("{table_location}/file_{file_num}.parquet")).unwrap(); + let mut writer = + ArrowWriter::try_new(file, to_write.schema(), Some(props.clone())).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + } + + // Read with concurrency=1 (fast-path) + let reader = ArrowReaderBuilder::new(file_io) + .with_data_file_concurrency_limit(1) + .build(); + + // Create tasks in a specific order: file_0, file_1, file_2 + let tasks = vec![ + Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_0.parquet")) + .unwrap() + .len(), + start: 0, + length: 
0, + record_count: None, + data_file_path: format!("{table_location}/file_0.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }), + Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/file_1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }), + Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_2.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/file_2.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }), + ]; + + let tasks_stream = Box::pin(futures::stream::iter(tasks)) as FileScanTaskStream; + + let result = reader + .read(tasks_stream) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + // Verify we got all 30 rows (10 from each file) + let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 30, "Should have 30 total rows"); + + // Collect all ids and file_nums to verify data + let mut all_ids = Vec::new(); + let mut all_file_nums = Vec::new(); + + for batch in &result { + let id_col = batch + .column(0) + .as_primitive::(); + let file_num_col = batch + .column(1) + .as_primitive::(); + + for i in 0..batch.num_rows() { + all_ids.push(id_col.value(i)); + 
all_file_nums.push(file_num_col.value(i)); + } + } + + assert_eq!(all_ids.len(), 30); + assert_eq!(all_file_nums.len(), 30); + + // With concurrency=1 and sequential processing, files should be processed in order + // file_0: ids 0-9, file_num=0 + // file_1: ids 10-19, file_num=1 + // file_2: ids 20-29, file_num=2 + for i in 0..10 { + assert_eq!(all_file_nums[i], 0, "First 10 rows should be from file_0"); + assert_eq!(all_ids[i], i as i32, "IDs should be 0-9"); + } + for i in 10..20 { + assert_eq!(all_file_nums[i], 1, "Next 10 rows should be from file_1"); + assert_eq!(all_ids[i], i as i32, "IDs should be 10-19"); + } + for i in 20..30 { + assert_eq!(all_file_nums[i], 2, "Last 10 rows should be from file_2"); + assert_eq!(all_ids[i], i as i32, "IDs should be 20-29"); + } + } + + #[tokio::test] + async fn test_read_int96_timestamps_with_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let (file_path, expected_micros) = + write_int96_parquet_file(&table_location, "with_ids.parquet", true); + + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + } + + #[tokio::test] + async fn test_read_int96_timestamps_without_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let (file_path, expected_micros) = + 
write_int96_parquet_file(&table_location, "no_ids.parquet", false); + + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_struct() { + use arrow_array::{StructArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::Int96Type; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/struct_int96.parquet"); + + let ts_type = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let struct_type = SchemaType::group_type_builder("data") + .with_repetition(Repetition::REQUIRED) + .with_id(Some(1)) + .with_fields(vec![Arc::new(ts_type)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(struct_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // def=1: struct is REQUIRED so no level, ts is OPTIONAL and present (1). + // No repetition levels needed (no repeated groups). 
+ let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[1]), None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required( + 1, + "data", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::optional( + 2, + "ts", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let struct_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected StructArray"); + let ts_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside struct"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in struct: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_list() { + use arrow_array::{ListArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::Int96Type; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/list_int96.parquet"); + + // 3-level LIST encoding: + // optional group timestamps (LIST) { + // repeated group list { + // optional int96 element; + // } + // } + let element_type = SchemaType::primitive_type_builder("element", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let list_group = 
SchemaType::group_type_builder("list") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(element_type)]) + .build() + .unwrap(); + + let list_type = SchemaType::group_type_builder("timestamps") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::List)) + .with_fields(vec![Arc::new(list_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(list_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a list containing one INT96 element. + // def=3: list present (1) + repeated group (2) + element present (3) + // rep=0: start of a new list + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "timestamps", + Type::List(crate::spec::ListType { + element_field: NestedField::optional( + 2, + "element", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let list_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected ListArray"); + let ts_array = list_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside list"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in list: 
got {}, expected {expected_micros}", + ts_array.value(0) + ); + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_map() { + use arrow_array::{MapArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::{ByteArrayType, Int96Type}; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/map_int96.parquet"); + + // MAP encoding: + // optional group ts_map (MAP) { + // repeated group key_value { + // required binary key (UTF8); + // optional int96 value; + // } + // } + let key_type = SchemaType::primitive_type_builder("key", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(Some(parquet::basic::LogicalType::String)) + .with_id(Some(2)) + .build() + .unwrap(); + + let value_type = SchemaType::primitive_type_builder("value", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(3)) + .build() + .unwrap(); + + let key_value_group = SchemaType::group_type_builder("key_value") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(key_type), Arc::new(value_type)]) + .build() + .unwrap(); + + let map_type = SchemaType::group_type_builder("ts_map") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::Map)) + .with_fields(vec![Arc::new(key_value_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(map_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a map 
containing one key-value pair. + // rep=0 for both columns: start of a new map. + // key def=2: map present (1) + key_value entry present (2), key is REQUIRED. + // value def=3: map present (1) + key_value entry present (2) + value present (3). + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch( + &[parquet::data_type::ByteArray::from("event_time")], + Some(&[2]), + Some(&[0]), + ) + .unwrap(); + col.close().unwrap(); + } + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "ts_map", + Type::Map(crate::spec::MapType { + key_field: NestedField::required( + 2, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::optional( + 3, + "value", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let map_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected MapArray"); + let ts_array = map_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray as map values"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in map: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } +} diff --git a/crates/iceberg/src/arrow/reader/positional_deletes.rs b/crates/iceberg/src/arrow/reader/positional_deletes.rs new file mode 100644 index 0000000000..eea031852b --- /dev/null +++ b/crates/iceberg/src/arrow/reader/positional_deletes.rs @@ -0,0 +1,931 @@ +// Licensed to the Apache 
Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Positional delete handling for `ArrowReader`: converting a `DeleteVector` +//! into a Parquet `RowSelection` that skips the deleted rows, while respecting +//! any row-group selection made by the predicate evaluator. + +use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; +use parquet::file::metadata::RowGroupMetaData; + +use super::ArrowReader; +use crate::delete_vector::DeleteVector; +use crate::error::Result; + +impl ArrowReader { + /// computes a `RowSelection` from positional delete indices. 
+ /// + /// Using the per-row-group row counts from the Parquet metadata, we build a `RowSelection` that rejects rows that are indicated + /// as having been deleted by a positional delete, taking into account any row groups that have + /// been skipped entirely by the filter predicate + pub(super) fn build_deletes_row_selection( + row_group_metadata_list: &[RowGroupMetaData], + selected_row_groups: &Option>, + positional_deletes: &DeleteVector, + ) -> Result { + let mut results: Vec = Vec::new(); + let mut selected_row_groups_idx = 0; + let mut current_row_group_base_idx: u64 = 0; + let mut delete_vector_iter = positional_deletes.iter(); + let mut next_deleted_row_idx_opt = delete_vector_iter.next(); + + for (idx, row_group_metadata) in row_group_metadata_list.iter().enumerate() { + let row_group_num_rows = row_group_metadata.num_rows() as u64; + let next_row_group_base_idx = current_row_group_base_idx + row_group_num_rows; + + // if row group selection is enabled, + if let Some(selected_row_groups) = selected_row_groups { + // if we've consumed all the selected row groups, we're done + if selected_row_groups_idx == selected_row_groups.len() { + break; + } + + if idx == selected_row_groups[selected_row_groups_idx] { + // we're in a selected row group. Increment selected_row_groups_idx + // so that next time around the for loop we're looking for the next + // selected row group + selected_row_groups_idx += 1; + } else { + // Advance iterator past all deletes in the skipped row group. + // advance_to() positions the iterator to the first delete >= next_row_group_base_idx. + // However, if our cached next_deleted_row_idx_opt is in the skipped range, + // we need to call next() to update the cache with the newly positioned value.
+ delete_vector_iter.advance_to(next_row_group_base_idx); + // Only update the cache if the cached value is stale (in the skipped range) + if let Some(cached_idx) = next_deleted_row_idx_opt + && cached_idx < next_row_group_base_idx + { + next_deleted_row_idx_opt = delete_vector_iter.next(); + } + + // still increment the current row group base index but then skip to the next row group + // in the file + current_row_group_base_idx += row_group_num_rows; + continue; + } + } + + let mut next_deleted_row_idx = match next_deleted_row_idx_opt { + Some(next_deleted_row_idx) => { + // if the index of the next deleted row is beyond this row group, add a selection for + // the entirety of this row group and skip to the next row group + if next_deleted_row_idx >= next_row_group_base_idx { + results.push(RowSelector::select(row_group_num_rows as usize)); + current_row_group_base_idx += row_group_num_rows; + continue; + } + + next_deleted_row_idx + } + + // If there are no more pos deletes, add a selector for the entirety of this row group.
+ _ => { + results.push(RowSelector::select(row_group_num_rows as usize)); + current_row_group_base_idx += row_group_num_rows; + continue; + } + }; + + let mut current_idx = current_row_group_base_idx; + 'chunks: while next_deleted_row_idx < next_row_group_base_idx { + // `select` all rows that precede the next delete index + if current_idx < next_deleted_row_idx { + let run_length = next_deleted_row_idx - current_idx; + results.push(RowSelector::select(run_length as usize)); + current_idx += run_length; + } + + // `skip` all consecutive deleted rows in the current row group + let mut run_length = 0; + while next_deleted_row_idx == current_idx + && next_deleted_row_idx < next_row_group_base_idx + { + run_length += 1; + current_idx += 1; + + next_deleted_row_idx_opt = delete_vector_iter.next(); + next_deleted_row_idx = match next_deleted_row_idx_opt { + Some(next_deleted_row_idx) => next_deleted_row_idx, + _ => { + // We've processed the final positional delete. + // Conclude the skip and then break so that we select the remaining + // rows in the row group and move on to the next row group + results.push(RowSelector::skip(run_length)); + break 'chunks; + } + }; + } + if run_length > 0 { + results.push(RowSelector::skip(run_length)); + } + } + + if current_idx < next_row_group_base_idx { + results.push(RowSelector::select( + (next_row_group_base_idx - current_idx) as usize, + )); + } + + current_row_group_base_idx += row_group_num_rows; + } + + Ok(results.into()) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::fs::File; + use std::sync::Arc; + + use arrow_array::cast::AsArray; + use arrow_array::{RecordBatch, StringArray}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use futures::TryStreamExt; + use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; + use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; + use parquet::basic::Compression; + use parquet::file::metadata::{ColumnChunkMetaData, 
RowGroupMetaData};
+    use parquet::file::properties::WriterProperties;
+    use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor};
+    use roaring::RoaringTreemap;
+    use tempfile::TempDir;
+
+    use crate::arrow::{ArrowReader, ArrowReaderBuilder};
+    use crate::delete_vector::DeleteVector;
+    use crate::io::FileIO;
+    use crate::scan::{FileScanTask, FileScanTaskDeleteFile, FileScanTaskStream};
+    use crate::spec::{DataContentType, DataFileFormat, NestedField, PrimitiveType, Schema, Type};
+
+    /// Builds row-group metadata for `schema_descr` holding `columns`, with the given
+    /// `num_rows` and `ordinal`. The total byte size is fixed at 2000.
+    fn build_test_row_group_meta(
+        schema_descr: SchemaDescPtr,
+        columns: Vec<ColumnChunkMetaData>,
+        num_rows: i64,
+        ordinal: i16,
+    ) -> RowGroupMetaData {
+        // `schema_descr` is owned here, so it can be handed to the builder without cloning.
+        RowGroupMetaData::builder(schema_descr)
+            .set_num_rows(num_rows)
+            .set_total_byte_size(2000)
+            .set_column_metadata(columns)
+            .set_ordinal(ordinal)
+            .build()
+            .unwrap()
+    }
+
+    /// Returns a schema descriptor for a group named `schema` with two INT32 fields, `a` and `b`.
+    fn get_test_schema_descr() -> SchemaDescPtr {
+        use parquet::schema::types::Type as SchemaType;
+
+        let schema = SchemaType::group_type_builder("schema")
+            .with_fields(vec![
+                Arc::new(
+                    SchemaType::primitive_type_builder("a", parquet::basic::Type::INT32)
+                        .build()
+                        .unwrap(),
+                ),
+                Arc::new(
+                    SchemaType::primitive_type_builder("b", parquet::basic::Type::INT32)
+                        .build()
+                        .unwrap(),
+                ),
+            ])
+            .build()
+            .unwrap();
+
+        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
+    }
+
+    /// Checks the `RowSelection` produced by `build_deletes_row_selection` for a fixed set of
+    /// positional deletes, first with row groups 1 and 3 selected and then with no row group
+    /// selection at all.
+    #[test]
+    fn test_build_deletes_row_selection() {
+        let schema_descr = get_test_schema_descr();
+
+        // One default column chunk per leaf column of the schema.
+        let columns: Vec<ColumnChunkMetaData> = schema_descr
+            .columns()
+            .iter()
+            .map(|ptr| ColumnChunkMetaData::builder(ptr.clone()).build().unwrap())
+            .collect();
+
+        // Five row groups covering rows 0..=999, 1000..=1499, 1500..=1999, 2000..=2999
+        // and 3000..=3499.
+        let row_groups_metadata = vec![
+            build_test_row_group_meta(schema_descr.clone(), columns.clone(), 1000, 0),
+            build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 1),
+            build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 2),
+            build_test_row_group_meta(schema_descr.clone(), columns.clone(), 1000, 3),
+            build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 4),
+        ];
+
+        let selected_row_groups = Some(vec![1, 3]);
+
+        /* cases to cover:
+           * {skip|select} {first|intermediate|last} {one row|multiple rows} in
+             {first|intermediate|last} {skipped|selected} row group
+           * row group selection disabled
+        */
+
+        // The `(skip, select)` annotations refer to the expected selectors when only
+        // row groups 1 and 3 are selected.
+        let positional_deletes = RoaringTreemap::from_iter(&[
+            // rg0 (rows 0..=999): skipped, so these should be ignored
+            1, // solitary row
+            3, 4, 5, // run of three consecutive rows
+            998, 999, // run of two rows at the end of rg0
+            // rg1 (rows 1000..=1499): selected
+            1000, // solitary row at the start of rg1 (1, 9)
+            1010, 1011, 1012, // run of three rows (3, 485)
+            1498, 1499, // run of two rows at the end of rg1
+            // rg2 (rows 1500..=1999): skipped, so these should be ignored
+            1500, 1501, // run of two rows at the start of rg2
+            1600, // solitary row
+            1999, // solitary row at the end of rg2
+            // rg3 (rows 2000..=2999): selected
+            2000, 2001, // run of two rows at the start of rg3; merges with 1498-1499 (4, 98)
+            2100, // solitary row (1, 99)
+            2200, 2201, 2202, // run of three rows (3, 796)
+            2999, // solitary row at the end of rg3 (1)
+            // rg4 (rows 3000..=3499): skipped
+            3000, // solitary row at the start of rg4
+        ]);
+
+        let positional_deletes = DeleteVector::new(positional_deletes);
+
+        // using selected row groups 1 and 3
+        let result = ArrowReader::build_deletes_row_selection(
+            &row_groups_metadata,
+            &selected_row_groups,
+            &positional_deletes,
+        )
+        .unwrap();
+
+        let expected = RowSelection::from(vec![
+            RowSelector::skip(1),
+            RowSelector::select(9),
+            RowSelector::skip(3),
+            RowSelector::select(485),
+            RowSelector::skip(4),
+            RowSelector::select(98),
+            RowSelector::skip(1),
+            RowSelector::select(99),
+            RowSelector::skip(3),
+            RowSelector::select(796),
+            RowSelector::skip(1),
+        ]);
+
+        assert_eq!(result, expected);
+
+        // selecting all row groups
+        let result = ArrowReader::build_deletes_row_selection(
+            &row_groups_metadata,
+            &None,
+            &positional_deletes,
+        )
+        .unwrap();
+
+        let expected = RowSelection::from(vec![
+            RowSelector::select(1),
+            RowSelector::skip(1),
+            RowSelector::select(1),
+            RowSelector::skip(3),
+            RowSelector::select(992),
+            RowSelector::skip(3),
+            RowSelector::select(9),
+            RowSelector::skip(3),
+            RowSelector::select(485),
+            RowSelector::skip(4),
+            RowSelector::select(98),
+            RowSelector::skip(1),
+            RowSelector::select(398),
+            RowSelector::skip(3),
+            RowSelector::select(98),
+            RowSelector::skip(1),
+            RowSelector::select(99),
+            RowSelector::skip(3),
+            RowSelector::select(796),
+            RowSelector::skip(2),
+            RowSelector::select(499),
+        ]);
+
+        assert_eq!(result, expected);
+    }
+
+    /// Test for bug where position deletes in later row groups are not applied correctly.
+    ///
+    /// When a file has multiple row groups and a position delete targets a row in a later
+    /// row group, the `build_deletes_row_selection` function had a bug where it would
+    /// fail to increment `current_row_group_base_idx` when skipping row groups.
+    ///
+    /// This test creates:
+    /// - A data file with 200 rows split into 2 row groups (0-99, 100-199)
+    /// - A position delete file that deletes row 199 (last row in second row group)
+    ///
+    /// Expected behavior: Should return 199 rows (with id=200 deleted)
+    /// Bug behavior: Returns 200 rows (delete is not applied)
+    ///
+    /// This bug was discovered while running Apache Spark + Apache Iceberg integration tests
+    /// through DataFusion Comet.
The following Iceberg Java tests failed due to this bug:
+    /// - `org.apache.iceberg.spark.extensions.TestMergeOnReadDelete::testDeleteWithMultipleRowGroupsParquet`
+    /// - `org.apache.iceberg.spark.extensions.TestMergeOnReadUpdate::testUpdateWithMultipleRowGroupsParquet`
+    #[tokio::test]
+    async fn test_position_delete_across_multiple_row_groups() {
+        use arrow_array::{Int32Array, Int64Array};
+        use parquet::file::reader::{FileReader, SerializedFileReader};
+
+        // Field IDs for positional delete schema
+        const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546;
+        const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545;
+
+        let tmp_dir = TempDir::new().unwrap();
+        let table_location = tmp_dir.path().to_str().unwrap().to_string();
+
+        // Create table schema with a single 'id' column
+        let table_schema = Arc::new(
+            Schema::builder()
+                .with_schema_id(1)
+                .with_fields(vec![
+                    NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
+                ])
+                .build()
+                .unwrap(),
+        );
+
+        let arrow_schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([(
+                PARQUET_FIELD_ID_META_KEY.to_string(),
+                "1".to_string(),
+            )])),
+        ]));
+
+        // Step 1: Create data file with 200 rows in 2 row groups
+        // Row group 0: rows 0-99 (ids 1-100)
+        // Row group 1: rows 100-199 (ids 101-200)
+        let data_file_path = format!("{table_location}/data.parquet");
+
+        let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(
+            Int32Array::from_iter_values(1..=100),
+        )])
+        .unwrap();
+
+        let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(
+            Int32Array::from_iter_values(101..=200),
+        )])
+        .unwrap();
+
+        // Force each batch into its own row group
+        let props = WriterProperties::builder()
+            .set_compression(Compression::SNAPPY)
+            .set_max_row_group_row_count(Some(100))
+            .build();
+
+        let file = File::create(&data_file_path).unwrap();
+        let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap();
+        writer.write(&batch1).expect("Writing batch 1");
+        writer.write(&batch2).expect("Writing batch 2");
+        writer.close().unwrap();
+
+        // Verify we created 2 row groups
+        let verify_file = File::open(&data_file_path).unwrap();
+        let verify_reader = SerializedFileReader::new(verify_file).unwrap();
+        assert_eq!(
+            verify_reader.metadata().num_row_groups(),
+            2,
+            "Should have 2 row groups"
+        );
+
+        // Step 2: Create position delete file that deletes row 199 (id=200, last row in row group 1)
+        let delete_file_path = format!("{table_location}/deletes.parquet");
+
+        let delete_schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([(
+                PARQUET_FIELD_ID_META_KEY.to_string(),
+                FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(),
+            )])),
+            Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([(
+                PARQUET_FIELD_ID_META_KEY.to_string(),
+                FIELD_ID_POSITIONAL_DELETE_POS.to_string(),
+            )])),
+        ]));
+
+        // Delete row at position 199 (0-indexed, so it's the last row: id=200)
+        let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![
+            Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])),
+            Arc::new(Int64Array::from_iter_values(vec![199i64])),
+        ])
+        .unwrap();
+
+        let delete_props = WriterProperties::builder()
+            .set_compression(Compression::SNAPPY)
+            .build();
+
+        let delete_file = File::create(&delete_file_path).unwrap();
+        let mut delete_writer =
+            ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap();
+        delete_writer.write(&delete_batch).unwrap();
+        delete_writer.close().unwrap();
+
+        // Step 3: Read the data file with the delete applied
+        let file_io = FileIO::new_with_fs();
+        let reader = ArrowReaderBuilder::new(file_io).build();
+
+        // NOTE(review): `start: 0, length: 0` presumably disables byte-range row group
+        // filtering so both row groups are read; confirm against `FileScanTask`.
+        let task = FileScanTask {
+            file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(),
+            start: 0,
+            length: 0,
+            record_count: Some(200),
+            data_file_path: data_file_path.clone(),
+            data_file_format: DataFileFormat::Parquet,
+            schema: table_schema.clone(),
+            project_field_ids: vec![1],
+            predicate: None,
+            deletes: vec![FileScanTaskDeleteFile {
+                file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(),
+                file_path: delete_file_path,
+                file_type: DataContentType::PositionDeletes,
+                partition_spec_id: 0,
+                equality_ids: None,
+            }],
+            partition: None,
+            partition_spec: None,
+            name_mapping: None,
+            case_sensitive: false,
+        };
+
+        let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream;
+        let result = reader
+            .read(tasks)
+            .unwrap()
+            .try_collect::<Vec<RecordBatch>>()
+            .await
+            .unwrap();
+
+        // Step 4: Verify we got 199 rows (not 200)
+        let total_rows: usize = result.iter().map(|b| b.num_rows()).sum();
+
+        println!("Total rows read: {total_rows}");
+        println!("Expected: 199 rows (deleted row 199 which had id=200)");
+
+        // This assertion will FAIL before the fix and PASS after the fix
+        assert_eq!(
+            total_rows, 199,
+            "Expected 199 rows after deleting row 199, but got {total_rows} rows. \
+             The bug causes position deletes in later row groups to be ignored."
+        );
+
+        // Verify the deleted row (id=200) is not present
+        let all_ids: Vec<i32> = result
+            .iter()
+            .flat_map(|batch| {
+                batch
+                    .column(0)
+                    .as_primitive::<arrow_array::types::Int32Type>()
+                    .values()
+                    .iter()
+                    .copied()
+            })
+            .collect();
+
+        assert!(
+            !all_ids.contains(&200),
+            "Row with id=200 should be deleted but was found in results"
+        );
+
+        // Verify we have all other ids (1-199)
+        let expected_ids: Vec<i32> = (1..=199).collect();
+        assert_eq!(
+            all_ids, expected_ids,
+            "Should have ids 1-199 but got different values"
+        );
+    }
+
+    /// Test for bug where position deletes are lost when skipping unselected row groups.
+    ///
+    /// This is a variant of `test_position_delete_across_multiple_row_groups` that exercises
+    /// the row group selection code path (`selected_row_groups: Some([...])`).
+    ///
+    /// When a file has multiple row groups and only some are selected for reading,
+    /// the `build_deletes_row_selection` function must correctly skip over deletes in
+    /// unselected row groups WITHOUT consuming deletes that belong to selected row groups.
+    ///
+    /// This test creates:
+    /// - A data file with 200 rows split into 2 row groups (0-99, 100-199)
+    /// - A position delete file that deletes row 199 (last row in second row group)
+    /// - Row group selection that reads ONLY row group 1 (rows 100-199)
+    ///
+    /// Expected behavior: Should return 99 rows (with row 199 deleted)
+    /// Bug behavior: Returns 100 rows (delete is lost when skipping row group 0)
+    ///
+    /// The bug occurs when processing row group 0 (unselected):
+    /// ```rust
+    /// delete_vector_iter.advance_to(next_row_group_base_idx); // Position at first delete >= 100
+    /// next_deleted_row_idx_opt = delete_vector_iter.next(); // BUG: Consumes delete at 199!
+    /// ```
+    ///
+    /// The fix is to NOT call `next()` after `advance_to()` when skipping unselected row groups,
+    /// because `advance_to()` already positions the iterator correctly without consuming elements.
+    #[tokio::test]
+    async fn test_position_delete_with_row_group_selection() {
+        use arrow_array::{Int32Array, Int64Array};
+        use parquet::file::reader::{FileReader, SerializedFileReader};
+
+        // Field IDs for positional delete schema
+        const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546;
+        const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545;
+
+        let tmp_dir = TempDir::new().unwrap();
+        let table_location = tmp_dir.path().to_str().unwrap().to_string();
+
+        // Create table schema with a single 'id' column
+        let table_schema = Arc::new(
+            Schema::builder()
+                .with_schema_id(1)
+                .with_fields(vec![
+                    NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
+                ])
+                .build()
+                .unwrap(),
+        );
+
+        let arrow_schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([(
+                PARQUET_FIELD_ID_META_KEY.to_string(),
+                "1".to_string(),
+            )])),
+        ]));
+
+        // Step 1: Create data file with 200 rows in 2 row groups
+        // Row group 0: rows 0-99 (ids 1-100)
+        // Row group 1: rows 100-199 (ids 101-200)
+        let data_file_path = format!("{table_location}/data.parquet");
+
+        let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(
+            Int32Array::from_iter_values(1..=100),
+        )])
+        .unwrap();
+
+        let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(
+            Int32Array::from_iter_values(101..=200),
+        )])
+        .unwrap();
+
+        // Force each batch into its own row group
+        let props = WriterProperties::builder()
+            .set_compression(Compression::SNAPPY)
+            .set_max_row_group_row_count(Some(100))
+            .build();
+
+        let file = File::create(&data_file_path).unwrap();
+        let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap();
+        writer.write(&batch1).expect("Writing batch 1");
+        writer.write(&batch2).expect("Writing batch 2");
+        writer.close().unwrap();
+
+        // Verify we created 2 row groups
+        let verify_file = File::open(&data_file_path).unwrap();
+        let verify_reader = SerializedFileReader::new(verify_file).unwrap();
+        assert_eq!(
+            verify_reader.metadata().num_row_groups(),
+            2,
+            "Should have 2 row groups"
+        );
+
+        // Step 2: Create position delete file that deletes row 199 (id=200, last row in row group 1)
+        let delete_file_path = format!("{table_location}/deletes.parquet");
+
+        let delete_schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([(
+                PARQUET_FIELD_ID_META_KEY.to_string(),
+                FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(),
+            )])),
+            Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([(
+                PARQUET_FIELD_ID_META_KEY.to_string(),
+                FIELD_ID_POSITIONAL_DELETE_POS.to_string(),
+            )])),
+        ]));
+
+        // Delete row at position 199 (0-indexed, so it's the last row: id=200)
+        let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![
+            Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])),
+            Arc::new(Int64Array::from_iter_values(vec![199i64])),
+        ])
+        .unwrap();
+
+        let delete_props = WriterProperties::builder()
+            .set_compression(Compression::SNAPPY)
+            .build();
+
+        let delete_file = File::create(&delete_file_path).unwrap();
+        let mut delete_writer =
+            ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap();
+        delete_writer.write(&delete_batch).unwrap();
+        delete_writer.close().unwrap();
+
+        // Step 3: Get byte ranges to read ONLY row group 1 (rows 100-199)
+        // This exercises the row group selection code path where row group 0 is skipped
+        let metadata_file = File::open(&data_file_path).unwrap();
+        let metadata_reader = SerializedFileReader::new(metadata_file).unwrap();
+        let metadata = metadata_reader.metadata();
+
+        let row_group_0 = metadata.row_group(0);
+        let row_group_1 = metadata.row_group(1);
+
+        // NOTE(review): assumes row group 0's data begins directly after the magic bytes and
+        // that row group 1 follows it with nothing in between; confirm for the writer in use.
+        let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1"
+        let rg1_start = rg0_start + row_group_0.compressed_size() as u64;
+        let rg1_length = row_group_1.compressed_size() as u64;
+
+        println!(
+            "Row group 0: starts at byte {}, {} bytes compressed",
+            rg0_start,
+            row_group_0.compressed_size()
+        );
+        println!(
+            "Row group 1: starts at byte {}, {} bytes compressed",
+            rg1_start,
+            row_group_1.compressed_size()
+        );
+
+        let file_io = FileIO::new_with_fs();
+        let reader = ArrowReaderBuilder::new(file_io).build();
+
+        // Create FileScanTask that reads ONLY row group 1 via byte range filtering
+        let task = FileScanTask {
+            file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(),
+            start: rg1_start,
+            length: rg1_length,
+            record_count: Some(100), // Row group 1 has 100 rows
+            data_file_path: data_file_path.clone(),
+            data_file_format: DataFileFormat::Parquet,
+            schema: table_schema.clone(),
+            project_field_ids: vec![1],
+            predicate: None,
+            deletes: vec![FileScanTaskDeleteFile {
+                file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(),
+                file_path: delete_file_path,
+                file_type: DataContentType::PositionDeletes,
+                partition_spec_id: 0,
+                equality_ids: None,
+            }],
+            partition: None,
+            partition_spec: None,
+            name_mapping: None,
+            case_sensitive: false,
+        };
+
+        let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream;
+        let result = reader
+            .read(tasks)
+            .unwrap()
+            .try_collect::<Vec<RecordBatch>>()
+            .await
+            .unwrap();
+
+        // Step 4: Verify we got 99 rows (not 100)
+        // Row group 1 has 100 rows (ids 101-200), minus 1 delete (id=200) = 99 rows
+        let total_rows: usize = result.iter().map(|b| b.num_rows()).sum();
+
+        println!("Total rows read from row group 1: {total_rows}");
+        println!("Expected: 99 rows (row group 1 has 100 rows, 1 delete at position 199)");
+
+        // This assertion will FAIL before the fix and PASS after the fix
+        assert_eq!(
+            total_rows, 99,
+            "Expected 99 rows from row group 1 after deleting position 199, but got {total_rows} rows. \
+             The bug causes position deletes to be lost when advance_to() is followed by next() \
+             when skipping unselected row groups."
+        );
+
+        // Verify the deleted row (id=200) is not present
+        let all_ids: Vec<i32> = result
+            .iter()
+            .flat_map(|batch| {
+                batch
+                    .column(0)
+                    .as_primitive::<arrow_array::types::Int32Type>()
+                    .values()
+                    .iter()
+                    .copied()
+            })
+            .collect();
+
+        assert!(
+            !all_ids.contains(&200),
+            "Row with id=200 should be deleted but was found in results"
+        );
+
+        // Verify we have ids 101-199 (not 101-200)
+        let expected_ids: Vec<i32> = (101..=199).collect();
+        assert_eq!(
+            all_ids, expected_ids,
+            "Should have ids 101-199 but got different values"
+        );
+    }
+
+    /// Test for bug where stale cached delete causes infinite loop when skipping row groups.
+    ///
+    /// This test exposes the inverse scenario of `test_position_delete_with_row_group_selection`:
+    /// - Position delete targets a row in the SKIPPED row group (not the selected one)
+    /// - After calling advance_to(), the cached delete index is stale
+    /// - Without updating the cache, the code enters an infinite loop
+    ///
+    /// This test creates:
+    /// - A data file with 200 rows split into 2 row groups (0-99, 100-199)
+    /// - A position delete file that deletes row 0 (first row in SKIPPED row group 0)
+    /// - Row group selection that reads ONLY row group 1 (rows 100-199)
+    ///
+    /// The bug occurs when skipping row group 0:
+    /// ```rust
+    /// let mut next_deleted_row_idx_opt = delete_vector_iter.next(); // Some(0)
+    /// // ... skip to row group 1 ...
+    /// delete_vector_iter.advance_to(100); // Iterator advances past delete at 0
+    /// // BUG: next_deleted_row_idx_opt is still Some(0) - STALE!
+    /// // When processing row group 1:
+    /// //   current_idx = 100, next_deleted_row_idx = 0, next_row_group_base_idx = 200
+    /// //   Loop condition: 0 < 200 (true)
+    /// //   But: current_idx (100) > next_deleted_row_idx (0)
+    /// //   And: current_idx (100) != next_deleted_row_idx (0)
+    /// //   Neither branch executes -> INFINITE LOOP!
+    /// ```
+    ///
+    /// Expected behavior: Should return 100 rows (delete at 0 doesn't affect row group 1)
+    /// Bug behavior: Infinite loop in build_deletes_row_selection
+    #[tokio::test]
+    async fn test_position_delete_in_skipped_row_group() {
+        use arrow_array::{Int32Array, Int64Array};
+        use parquet::file::reader::{FileReader, SerializedFileReader};
+
+        // Field IDs for positional delete schema
+        const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546;
+        const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545;
+
+        let tmp_dir = TempDir::new().unwrap();
+        let table_location = tmp_dir.path().to_str().unwrap().to_string();
+
+        // Create table schema with a single 'id' column
+        let table_schema = Arc::new(
+            Schema::builder()
+                .with_schema_id(1)
+                .with_fields(vec![
+                    NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(),
+                ])
+                .build()
+                .unwrap(),
+        );
+
+        let arrow_schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([(
+                PARQUET_FIELD_ID_META_KEY.to_string(),
+                "1".to_string(),
+            )])),
+        ]));
+
+        // Step 1: Create data file with 200 rows in 2 row groups
+        // Row group 0: rows 0-99 (ids 1-100)
+        // Row group 1: rows 100-199 (ids 101-200)
+        let data_file_path = format!("{table_location}/data.parquet");
+
+        let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(
+            Int32Array::from_iter_values(1..=100),
+        )])
+        .unwrap();
+
+        let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(
+            Int32Array::from_iter_values(101..=200),
+        )])
+        .unwrap();
+
+        // Force each batch into its own row group
+        let props = WriterProperties::builder()
+            .set_compression(Compression::SNAPPY)
+            .set_max_row_group_row_count(Some(100))
+            .build();
+
+        let file = File::create(&data_file_path).unwrap();
+        let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap();
+        writer.write(&batch1).expect("Writing batch 1");
+        writer.write(&batch2).expect("Writing batch 2");
+        writer.close().unwrap();
+
+        // Verify we created 2 row groups
+        let verify_file = File::open(&data_file_path).unwrap();
+        let verify_reader = SerializedFileReader::new(verify_file).unwrap();
+        assert_eq!(
+            verify_reader.metadata().num_row_groups(),
+            2,
+            "Should have 2 row groups"
+        );
+
+        // Step 2: Create position delete file that deletes row 0 (id=1, first row in row group 0)
+        let delete_file_path = format!("{table_location}/deletes.parquet");
+
+        let delete_schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([(
+                PARQUET_FIELD_ID_META_KEY.to_string(),
+                FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(),
+            )])),
+            Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([(
+                PARQUET_FIELD_ID_META_KEY.to_string(),
+                FIELD_ID_POSITIONAL_DELETE_POS.to_string(),
+            )])),
+        ]));
+
+        // Delete row at position 0 (0-indexed, so it's the first row: id=1)
+        let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![
+            Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])),
+            Arc::new(Int64Array::from_iter_values(vec![0i64])),
+        ])
+        .unwrap();
+
+        let delete_props = WriterProperties::builder()
+            .set_compression(Compression::SNAPPY)
+            .build();
+
+        let delete_file = File::create(&delete_file_path).unwrap();
+        let mut delete_writer =
+            ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap();
+        delete_writer.write(&delete_batch).unwrap();
+        delete_writer.close().unwrap();
+
+        // Step 3: Get byte ranges to read ONLY row group 1 (rows 100-199)
+        // This exercises the row group selection code path where row group 0 is skipped
+        let metadata_file = File::open(&data_file_path).unwrap();
+        let metadata_reader = SerializedFileReader::new(metadata_file).unwrap();
+        let metadata = metadata_reader.metadata();
+
+        let row_group_0 = metadata.row_group(0);
+        let row_group_1 = metadata.row_group(1);
+
+        // NOTE(review): assumes row group 0's data begins directly after the magic bytes and
+        // that row group 1 follows it with nothing in between; confirm for the writer in use.
+        let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1"
+        let rg1_start = rg0_start + row_group_0.compressed_size() as u64;
+        let rg1_length = row_group_1.compressed_size() as u64;
+
+        let file_io = FileIO::new_with_fs();
+        let reader = ArrowReaderBuilder::new(file_io).build();
+
+        // Create FileScanTask that reads ONLY row group 1 via byte range filtering
+        let task = FileScanTask {
+            file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(),
+            start: rg1_start,
+            length: rg1_length,
+            record_count: Some(100), // Row group 1 has 100 rows
+            data_file_path: data_file_path.clone(),
+            data_file_format: DataFileFormat::Parquet,
+            schema: table_schema.clone(),
+            project_field_ids: vec![1],
+            predicate: None,
+            deletes: vec![FileScanTaskDeleteFile {
+                file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(),
+                file_path: delete_file_path,
+                file_type: DataContentType::PositionDeletes,
+                partition_spec_id: 0,
+                equality_ids: None,
+            }],
+            partition: None,
+            partition_spec: None,
+            name_mapping: None,
+            case_sensitive: false,
+        };
+
+        let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream;
+        let result = reader
+            .read(tasks)
+            .unwrap()
+            .try_collect::<Vec<RecordBatch>>()
+            .await
+            .unwrap();
+
+        // Step 4: Verify we got 100 rows (all of row group 1)
+        // The delete at position 0 is in row group 0, which is skipped, so it doesn't affect us
+        let total_rows: usize = result.iter().map(|b| b.num_rows()).sum();
+
+        assert_eq!(
+            total_rows, 100,
+            "Expected 100 rows from row group 1 (delete at position 0 is in skipped row group 0). \
+             If this hangs or fails, it indicates the cached delete index was not updated after advance_to()."
+        );
+
+        // Verify we have all ids from row group 1 (101-200)
+        let all_ids: Vec<i32> = result
+            .iter()
+            .flat_map(|batch| {
+                batch
+                    .column(0)
+                    .as_primitive::<arrow_array::types::Int32Type>()
+                    .values()
+                    .iter()
+                    .copied()
+            })
+            .collect();
+
+        let expected_ids: Vec<i32> = (101..=200).collect();
+        assert_eq!(
+            all_ids, expected_ids,
+            "Should have ids 101-200 (all of row group 1)"
+        );
+    }
+}
diff --git a/crates/iceberg/src/arrow/reader/predicate_visitor.rs b/crates/iceberg/src/arrow/reader/predicate_visitor.rs
new file mode 100644
index 0000000000..272de49390
--- /dev/null
+++ b/crates/iceberg/src/arrow/reader/predicate_visitor.rs
@@ -0,0 +1,820 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+//! Visitors that translate Iceberg bound predicates into the pieces needed for
+//! Arrow-level evaluation: collecting referenced field IDs and producing
+//! per-record-batch predicate closures.
+
+use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
+
+use arrow_arith::boolean::{and, and_kleene, is_not_null, is_null, not, or, or_kleene};
+use arrow_array::cast::AsArray;
+use arrow_array::types::{Float32Type, Float64Type};
+use arrow_array::{Array, ArrayRef, BooleanArray, Datum as ArrowDatum, RecordBatch, Scalar};
+use arrow_buffer::BooleanBuffer;
+use arrow_cast::cast::cast;
+use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq};
+use arrow_schema::{ArrowError, DataType};
+use arrow_string::like::starts_with;
+use fnv::FnvHashSet;
+use parquet::schema::types::SchemaDescriptor;
+
+use crate::arrow::get_arrow_datum;
+use crate::error::Result;
+use crate::expr::visitors::bound_predicate_visitor::BoundPredicateVisitor;
+use crate::expr::{BoundPredicate, BoundReference};
+use crate::spec::Datum;
+use crate::{Error, ErrorKind};
+
+/// A visitor to collect field ids from bound predicates.
+///
+/// Every predicate that carries a [`BoundReference`] inserts that reference's field id
+/// into `field_ids`; the logical combinators and constants contribute nothing.
+pub(super) struct CollectFieldIdVisitor {
+    /// Field ids seen so far.
+    pub(super) field_ids: HashSet<i32>,
+}
+
+impl CollectFieldIdVisitor {
+    /// Consumes the visitor and returns the collected field ids.
+    pub(super) fn field_ids(self) -> HashSet<i32> {
+        self.field_ids
+    }
+}
+
+impl BoundPredicateVisitor for CollectFieldIdVisitor {
+    type T = ();
+
+    fn always_true(&mut self) -> Result<()> {
+        Ok(())
+    }
+
+    fn always_false(&mut self) -> Result<()> {
+        Ok(())
+    }
+
+    fn and(&mut self, _lhs: (), _rhs: ()) -> Result<()> {
+        Ok(())
+    }
+
+    fn or(&mut self, _lhs: (), _rhs: ()) -> Result<()> {
+        Ok(())
+    }
+
+    fn not(&mut self, _inner: ()) -> Result<()> {
+        Ok(())
+    }
+
+    fn is_null(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn not_null(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn is_nan(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn not_nan(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn less_than(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn less_than_or_eq(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn greater_than(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn greater_than_or_eq(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn eq(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn not_eq(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn starts_with(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn not_starts_with(
+        &mut self,
+        reference: &BoundReference,
+        _literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn r#in(
+        &mut self,
+        reference: &BoundReference,
+        _literals: &FnvHashSet<Datum>,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+
+    fn not_in(
+        &mut self,
+        reference: &BoundReference,
+        _literals: &FnvHashSet<Datum>,
+        _predicate: &BoundPredicate,
+    ) -> Result<()> {
+        self.field_ids.insert(reference.field().id);
+        Ok(())
+    }
+}
+
+/// A visitor to convert Iceberg bound predicates to Arrow predicates.
+pub(super) struct PredicateConverter<'a> {
+    /// The Parquet schema descriptor.
+    pub(super) parquet_schema: &'a SchemaDescriptor,
+    /// The map between field id and leaf column index in Parquet schema.
+    pub(super) column_map: &'a HashMap<i32, usize>,
+    /// The required column indices in Parquet schema for the predicates.
+    pub(super) column_indices: &'a Vec<usize>,
+}
+
+impl PredicateConverter<'_> {
+    /// When visiting a bound reference, we return index of the leaf column in the
+    /// required column indices which is used to project the column in the record batch.
+    /// Return None if the field id is not found in the column map, which is possible
+    /// due to schema evolution.
+    ///
+    /// # Errors
+    ///
+    /// Returns `ErrorKind::DataInvalid` when the column's root in the Parquet schema is a
+    /// group, or when the leaf column is not listed in `column_indices`.
+    fn bound_reference(&mut self, reference: &BoundReference) -> Result<Option<usize>> {
+        // The leaf column's index in Parquet schema.
+        let Some(column_idx) = self.column_map.get(&reference.field().id) else {
+            return Ok(None);
+        };
+
+        if self.parquet_schema.get_column_root(*column_idx).is_group() {
+            return Err(Error::new(
+                ErrorKind::DataInvalid,
+                format!(
+                    "Leaf column `{}` in predicates isn't a root column in Parquet schema.",
+                    reference.field().name
+                ),
+            ));
+        }
+
+        // The leaf column's index in the required column indices. The error is built
+        // lazily so the message is only formatted when the lookup actually fails.
+        let index = self
+            .column_indices
+            .iter()
+            .position(|&idx| idx == *column_idx)
+            .ok_or_else(|| {
+                Error::new(
+                    ErrorKind::DataInvalid,
+                    format!(
+                        "Leaf column `{}` in predicates cannot be found in the required column indices.",
+                        reference.field().name
+                    ),
+                )
+            })?;
+
+        Ok(Some(index))
+    }
+
+    /// Build an Arrow predicate that always returns true.
+    fn build_always_true(&self) -> Result<Box<PredicateResult>> {
+        Ok(Box::new(|batch| {
+            Ok(BooleanArray::from(vec![true; batch.num_rows()]))
+        }))
+    }
+
+    /// Build an Arrow predicate that always returns false.
+    fn build_always_false(&self) -> Result<Box<PredicateResult>> {
+        Ok(Box::new(|batch| {
+            Ok(BooleanArray::from(vec![false; batch.num_rows()]))
+        }))
+    }
+}
+
+/// Gets the leaf column from the record batch for the required column index. Only
+/// supports top-level columns for now.
+///
+/// # Errors
+///
+/// Returns `ArrowError::SchemaError` when the column is a struct.
+fn project_column(
+    batch: &RecordBatch,
+    column_idx: usize,
+) -> std::result::Result<ArrayRef, ArrowError> {
+    let column = batch.column(column_idx);
+
+    match column.data_type() {
+        DataType::Struct(_) => Err(ArrowError::SchemaError(
+            "Does not support struct column yet.".to_string(),
+        )),
+        _ => Ok(column.clone()),
+    }
+}
+
+/// Returns a non-nullable boolean array that is `true` exactly where `array` holds a
+/// non-null NaN value.
+///
+/// # Errors
+///
+/// Returns `ArrowError::InvalidArgumentError` when `array` is neither `Float32` nor
+/// `Float64`, so an unexpected column type surfaces as an error rather than a panic.
+fn compute_is_nan(array: &ArrayRef) -> std::result::Result<BooleanArray, ArrowError> {
+    // Compute NaN over the contiguous values slice, then fold the null bitmap
+    // in with a single bitwise AND so that null slots become false.
+    let (is_nan, nulls) = match array.data_type() {
+        DataType::Float32 => {
+            let arr = array.as_primitive::<Float32Type>();
+            (
+                BooleanBuffer::from_iter(arr.values().iter().map(|v| v.is_nan())),
+                arr.nulls(),
+            )
+        }
+        DataType::Float64 => {
+            let arr = array.as_primitive::<Float64Type>();
+            (
+                BooleanBuffer::from_iter(arr.values().iter().map(|v| v.is_nan())),
+                arr.nulls(),
+            )
+        }
+        other => {
+            return Err(ArrowError::InvalidArgumentError(format!(
+                "is_nan is only valid for float types, got {other}"
+            )));
+        }
+    };
+
+    let values = match nulls {
+        Some(nulls) => &is_nan & nulls.inner(),
+        None => is_nan,
+    };
+
+    Ok(BooleanArray::new(values, None))
+}
+
+/// A boxed-closure predicate: takes a record batch and yields one boolean per row,
+/// or an Arrow error.
+pub(super) type PredicateResult =
+    dyn FnMut(RecordBatch) -> std::result::Result<BooleanArray, ArrowError> + Send + 'static;
+
+// When the referenced field is absent from the Parquet file (`bound_reference` returns
+// `None`), each method below falls back to a constant predicate instead of reading a column.
+// NOTE(review): `less_than`/`less_than_or_eq` fall back to always-true while
+// `greater_than`/`greater_than_or_eq` fall back to always-false; presumably this follows a
+// nulls-order-first convention. Confirm it is intended.
+impl BoundPredicateVisitor for PredicateConverter<'_> {
+    type T = Box<PredicateResult>;
+
+    fn always_true(&mut self) -> Result<Box<PredicateResult>> {
+        self.build_always_true()
+    }
+
+    fn always_false(&mut self) -> Result<Box<PredicateResult>> {
+        self.build_always_false()
+    }
+
+    fn and(
+        &mut self,
+        mut lhs: Box<PredicateResult>,
+        mut rhs: Box<PredicateResult>,
+    ) -> Result<Box<PredicateResult>> {
+        // Both sides are evaluated on the same batch and combined with `and_kleene`.
+        Ok(Box::new(move |batch| {
+            let left = lhs(batch.clone())?;
+            let right = rhs(batch)?;
+            and_kleene(&left, &right)
+        }))
+    }
+
+    fn or(
+        &mut self,
+        mut lhs: Box<PredicateResult>,
+        mut rhs: Box<PredicateResult>,
+    ) -> Result<Box<PredicateResult>> {
+        // Both sides are evaluated on the same batch and combined with `or_kleene`.
+        Ok(Box::new(move |batch| {
+            let left = lhs(batch.clone())?;
+            let right = rhs(batch)?;
+            or_kleene(&left, &right)
+        }))
+    }
+
+    fn not(&mut self, mut inner: Box<PredicateResult>) -> Result<Box<PredicateResult>> {
+        Ok(Box::new(move |batch| {
+            let pred_ret = inner(batch)?;
+            not(&pred_ret)
+        }))
+    }
+
+    fn is_null(
+        &mut self,
+        reference: &BoundReference,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            Ok(Box::new(move |batch| {
+                let column = project_column(&batch, idx)?;
+                is_null(&column)
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_true()
+        }
+    }
+
+    fn not_null(
+        &mut self,
+        reference: &BoundReference,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            Ok(Box::new(move |batch| {
+                let column = project_column(&batch, idx)?;
+                is_not_null(&column)
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
+    }
+
+    fn is_nan(
+        &mut self,
+        reference: &BoundReference,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            Ok(Box::new(move |batch| {
+                let column = project_column(&batch, idx)?;
+                compute_is_nan(&column)
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
+    }
+
+    fn not_nan(
+        &mut self,
+        reference: &BoundReference,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            Ok(Box::new(move |batch| {
+                let column = project_column(&batch, idx)?;
+                let is_nan = compute_is_nan(&column)?;
+                not(&is_nan)
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_true()
+        }
+    }
+
+    fn less_than(
+        &mut self,
+        reference: &BoundReference,
+        literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+                let literal = try_cast_literal(&literal, left.data_type())?;
+                lt(&left, literal.as_ref())
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_true()
+        }
+    }
+
+    fn less_than_or_eq(
+        &mut self,
+        reference: &BoundReference,
+        literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+                let literal = try_cast_literal(&literal, left.data_type())?;
+                lt_eq(&left, literal.as_ref())
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_true()
+        }
+    }
+
+    fn greater_than(
+        &mut self,
+        reference: &BoundReference,
+        literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+                let literal = try_cast_literal(&literal, left.data_type())?;
+                gt(&left, literal.as_ref())
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
+    }
+
+    fn greater_than_or_eq(
+        &mut self,
+        reference: &BoundReference,
+        literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            let literal = get_arrow_datum(literal)?;
+
+            Ok(Box::new(move |batch| {
+                let left = project_column(&batch, idx)?;
+                let literal = try_cast_literal(&literal, left.data_type())?;
+                gt_eq(&left, literal.as_ref())
+            }))
+        } else {
+            // A missing column, treating it as null.
+            self.build_always_false()
+        }
+    }
+
+    fn eq(
+        &mut self,
+        reference: &BoundReference,
+        literal: &Datum,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)?
{ + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + let literal = try_cast_literal(&literal, left.data_type())?; + eq(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } + } + + fn not_eq( + &mut self, + reference: &BoundReference, + literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + let literal = try_cast_literal(&literal, left.data_type())?; + neq(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } + } + + fn starts_with( + &mut self, + reference: &BoundReference, + literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + let literal = try_cast_literal(&literal, left.data_type())?; + starts_with(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } + } + + fn not_starts_with( + &mut self, + reference: &BoundReference, + literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + let literal = try_cast_literal(&literal, left.data_type())?; + // update here if arrow ever adds a native not_starts_with + not(&starts_with(&left, literal.as_ref())?) + })) + } else { + // A missing column, treating it as null. 
+ self.build_always_true() + } + } + + fn r#in( + &mut self, + reference: &BoundReference, + literals: &FnvHashSet, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + // Convert the literals up front and propagate any conversion error instead of panicking. + let literals: Vec<_> = literals + .iter() + .map(get_arrow_datum) + .collect::<Result<_>>()?; + + Ok(Box::new(move |batch| { + // update this if arrow ever adds a native is_in kernel + let left = project_column(&batch, idx)?; + + let mut acc = BooleanArray::from(vec![false; batch.num_rows()]); + for literal in &literals { + let literal = try_cast_literal(literal, left.data_type())?; + acc = or(&acc, &eq(&left, literal.as_ref())?)? + } + + Ok(acc) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } + } + + fn not_in( + &mut self, + reference: &BoundReference, + literals: &FnvHashSet, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + // Convert the literals up front and propagate any conversion error instead of panicking. + let literals: Vec<_> = literals + .iter() + .map(get_arrow_datum) + .collect::<Result<_>>()?; + + Ok(Box::new(move |batch| { + // update this if arrow ever adds a native not_in kernel + let left = project_column(&batch, idx)?; + let mut acc = BooleanArray::from(vec![true; batch.num_rows()]); + for literal in &literals { + let literal = try_cast_literal(literal, left.data_type())?; + acc = and(&acc, &neq(&left, literal.as_ref())?)? + } + + Ok(acc) + })) + } else { + // A missing column, treating it as null. + self.build_always_true() + } + } +} + +/// The Arrow type of an array that the Parquet reader reads may not match the exact Arrow type +/// that Iceberg uses for literals - but they are effectively the same logical type, +/// i.e. LargeUtf8 and Utf8 or Utf8View and Utf8 or Utf8View and LargeUtf8. +/// +/// The Arrow compute kernels that we use must match the type exactly, so first cast the literal +/// into the type of the batch we read from Parquet before sending it to the compute kernel.
+fn try_cast_literal( + literal: &Arc, + column_type: &DataType, +) -> std::result::Result, ArrowError> { + let literal_array = literal.get().0; + + // No cast required + if literal_array.data_type() == column_type { + return Ok(Arc::clone(literal)); + } + + let literal_array = cast(literal_array, column_type)?; + Ok(Arc::new(Scalar::new(literal_array))) +} + +#[cfg(test)] +mod tests { + use std::collections::{HashMap, HashSet}; + use std::sync::Arc; + + use arrow_array::{Array, BooleanArray, RecordBatch}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use parquet::schema::parser::parse_message_type; + use parquet::schema::types::SchemaDescriptor; + + use super::{CollectFieldIdVisitor, PredicateConverter}; + use crate::expr::visitors::bound_predicate_visitor::visit; + use crate::expr::{Bind, Predicate, Reference}; + use crate::spec::{NestedField, PrimitiveType, Schema, SchemaRef, Type}; + + fn table_schema_simple() -> SchemaRef { + Arc::new( + Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![2]) + .with_fields(vec![ + NestedField::optional(1, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(3, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), + NestedField::optional(4, "qux", Type::Primitive(PrimitiveType::Float)).into(), + ]) + .build() + .unwrap(), + ) + } + + #[test] + fn test_collect_field_id() { + let schema = table_schema_simple(); + let expr = Reference::new("qux").is_null(); + let bound_expr = expr.bind(schema, true).unwrap(); + + let mut visitor = CollectFieldIdVisitor { + field_ids: HashSet::default(), + }; + visit(&mut visitor, &bound_expr).unwrap(); + + let mut expected = HashSet::default(); + expected.insert(4_i32); + + assert_eq!(visitor.field_ids, expected); + } + + #[test] + fn test_collect_field_id_with_and() { + let schema = table_schema_simple(); + let expr = Reference::new("qux") + .is_null() 
+ .and(Reference::new("baz").is_null()); + let bound_expr = expr.bind(schema, true).unwrap(); + + let mut visitor = CollectFieldIdVisitor { + field_ids: HashSet::default(), + }; + visit(&mut visitor, &bound_expr).unwrap(); + + let mut expected = HashSet::default(); + expected.insert(4_i32); + expected.insert(3); + + assert_eq!(visitor.field_ids, expected); + } + + #[test] + fn test_collect_field_id_with_or() { + let schema = table_schema_simple(); + let expr = Reference::new("qux") + .is_null() + .or(Reference::new("baz").is_null()); + let bound_expr = expr.bind(schema, true).unwrap(); + + let mut visitor = CollectFieldIdVisitor { + field_ids: HashSet::default(), + }; + visit(&mut visitor, &bound_expr).unwrap(); + + let mut expected = HashSet::default(); + expected.insert(4_i32); + expected.insert(3); + + assert_eq!(visitor.field_ids, expected); + } + + fn apply_predicate_to_batch( + predicate: Predicate, + schema: SchemaRef, + batch: RecordBatch, + ) -> BooleanArray { + let bound = predicate.bind(schema, true).unwrap(); + + // Build a trivial Parquet schema with one float column at field id 4 + let message_type = " + message schema { + optional float qux = 4; + } + "; + let parquet_type = parse_message_type(message_type).expect("parse schema"); + let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_type)); + + let column_map = HashMap::from([(4i32, 0usize)]); + let column_indices = vec![0usize]; + + let mut converter = PredicateConverter { + parquet_schema: &parquet_schema, + column_map: &column_map, + column_indices: &column_indices, + }; + + let mut predicate_fn = visit(&mut converter, &bound).unwrap(); + predicate_fn(batch).unwrap() + } + + #[test] + fn test_predicate_converter_nan() { + use arrow_array::Float32Array; + + let schema = table_schema_simple(); + let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "qux", + DataType::Float32, + true, + )])); + let values = vec![Some(1.0f32), Some(f32::NAN), None, Some(0.0f32)]; + + // is_nan: 
non-null-propagating per Java's implementation - NULL → false + let batch = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Float32Array::from( + values.clone(), + ))]) + .unwrap(); + let result = + apply_predicate_to_batch(Reference::new("qux").is_nan(), schema.clone(), batch); + assert_eq!( + [ + result.value(0), + result.value(1), + result.value(2), + result.value(3) + ], + [false, true, false, false] + ); + assert!(!result.is_null(2)); + + // not_nan: non-null-propagating per Java's implementation - NULL → true + let batch = + RecordBatch::try_new(arrow_schema, vec![Arc::new(Float32Array::from(values))]).unwrap(); + let result = apply_predicate_to_batch(Reference::new("qux").is_not_nan(), schema, batch); + assert_eq!( + [ + result.value(0), + result.value(1), + result.value(2), + result.value(3) + ], + [true, false, true, true] + ); + assert!(!result.is_null(2)); + } +} diff --git a/crates/iceberg/src/arrow/reader/projection.rs b/crates/iceberg/src/arrow/reader/projection.rs new file mode 100644 index 0000000000..d3fa00b84b --- /dev/null +++ b/crates/iceberg/src/arrow/reader/projection.rs @@ -0,0 +1,1718 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Column projection for `ArrowReader`: building the Parquet projection mask +//! from Iceberg field IDs, and mapping field IDs between Iceberg and Parquet +//! (including fallback handling for files without embedded IDs). + +use std::collections::{HashMap, HashSet}; +use std::str::FromStr; +use std::sync::Arc; + +use arrow_schema::{Field, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; +use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, ProjectionMask}; +use parquet::schema::types::{SchemaDescriptor, Type as ParquetType}; + +use super::{ArrowReader, CollectFieldIdVisitor}; +use crate::arrow::arrow_schema_to_schema; +use crate::error::Result; +use crate::expr::BoundPredicate; +use crate::expr::visitors::bound_predicate_visitor::visit; +use crate::spec::{NameMapping, NestedField, PrimitiveType, Schema, Type}; +use crate::{Error, ErrorKind}; + +impl ArrowReader { + pub(super) fn build_field_id_set_and_map( + parquet_schema: &SchemaDescriptor, + predicate: &BoundPredicate, + ) -> Result<(HashSet, HashMap)> { + // Collects all Iceberg field IDs referenced in the filter predicate + let mut collector = CollectFieldIdVisitor { + field_ids: HashSet::default(), + }; + visit(&mut collector, predicate)?; + + let iceberg_field_ids = collector.field_ids(); + + // Without embedded field IDs, we fall back to position-based mapping for compatibility + let field_id_map = match build_field_id_map(parquet_schema)? { + Some(map) => map, + None => build_fallback_field_id_map(parquet_schema), + }; + + Ok((iceberg_field_ids, field_id_map)) + } + + /// Recursively extract leaf field IDs because Parquet projection works at the leaf column level. + /// Nested types (struct/list/map) are flattened in Parquet's columnar format. 
+ fn include_leaf_field_id(field: &NestedField, field_ids: &mut Vec) { + match field.field_type.as_ref() { + Type::Primitive(_) => { + field_ids.push(field.id); + } + Type::Struct(struct_type) => { + for nested_field in struct_type.fields() { + Self::include_leaf_field_id(nested_field, field_ids); + } + } + Type::List(list_type) => { + Self::include_leaf_field_id(&list_type.element_field, field_ids); + } + Type::Map(map_type) => { + Self::include_leaf_field_id(&map_type.key_field, field_ids); + Self::include_leaf_field_id(&map_type.value_field, field_ids); + } + } + } + + pub(super) fn get_arrow_projection_mask( + field_ids: &[i32], + iceberg_schema_of_task: &Schema, + parquet_schema: &SchemaDescriptor, + arrow_schema: &ArrowSchemaRef, + use_fallback: bool, // Whether file lacks embedded field IDs (e.g., migrated from Hive/Spark) + ) -> Result { + fn type_promotion_is_valid( + file_type: Option<&PrimitiveType>, + projected_type: Option<&PrimitiveType>, + ) -> bool { + match (file_type, projected_type) { + (Some(lhs), Some(rhs)) if lhs == rhs => true, + (Some(PrimitiveType::Int), Some(PrimitiveType::Long)) => true, + (Some(PrimitiveType::Float), Some(PrimitiveType::Double)) => true, + ( + Some(PrimitiveType::Decimal { + precision: file_precision, + scale: file_scale, + }), + Some(PrimitiveType::Decimal { + precision: requested_precision, + scale: requested_scale, + }), + ) if requested_precision >= file_precision && file_scale == requested_scale => true, + // Uuid is stored as Fixed(16) in the Parquet file, so the type read back is Fixed(16).
+ (Some(PrimitiveType::Fixed(16)), Some(PrimitiveType::Uuid)) => true, + _ => false, + } + } + + if field_ids.is_empty() { + return Ok(ProjectionMask::all()); + } + + if use_fallback { + // Position-based projection necessary because file lacks embedded field IDs + Self::get_arrow_projection_mask_fallback(field_ids, parquet_schema) + } else { + // Field-ID-based projection using embedded field IDs from Parquet metadata + + // Parquet's columnar format requires leaf-level (not top-level struct/list/map) projection + let mut leaf_field_ids = vec![]; + for field_id in field_ids { + let field = iceberg_schema_of_task.field_by_id(*field_id); + if let Some(field) = field { + Self::include_leaf_field_id(field, &mut leaf_field_ids); + } + } + + Self::get_arrow_projection_mask_with_field_ids( + &leaf_field_ids, + iceberg_schema_of_task, + parquet_schema, + arrow_schema, + type_promotion_is_valid, + ) + } + } + + /// Standard projection using embedded field IDs from Parquet metadata. + /// For iceberg-java compatibility with ParquetSchemaUtil.pruneColumns(). + fn get_arrow_projection_mask_with_field_ids( + leaf_field_ids: &[i32], + iceberg_schema_of_task: &Schema, + parquet_schema: &SchemaDescriptor, + arrow_schema: &ArrowSchemaRef, + type_promotion_is_valid: fn(Option<&PrimitiveType>, Option<&PrimitiveType>) -> bool, + ) -> Result { + let mut column_map = HashMap::new(); + let fields = arrow_schema.fields(); + + // Pre-project only the fields that have been selected, possibly avoiding converting + // some Arrow types that are not yet supported. 
+ let mut projected_fields: HashMap = HashMap::new(); + let projected_arrow_schema = ArrowSchema::new_with_metadata( + fields.filter_leaves(|_, f| { + f.metadata() + .get(PARQUET_FIELD_ID_META_KEY) + .and_then(|field_id| i32::from_str(field_id).ok()) + .is_some_and(|field_id| { + projected_fields.insert((*f).clone(), field_id); + leaf_field_ids.contains(&field_id) + }) + }), + arrow_schema.metadata().clone(), + ); + let iceberg_schema = arrow_schema_to_schema(&projected_arrow_schema)?; + + fields.filter_leaves(|idx, field| { + let Some(field_id) = projected_fields.get(field).cloned() else { + return false; + }; + + let iceberg_field = iceberg_schema_of_task.field_by_id(field_id); + let parquet_iceberg_field = iceberg_schema.field_by_id(field_id); + + if iceberg_field.is_none() || parquet_iceberg_field.is_none() { + return false; + } + + if !type_promotion_is_valid( + parquet_iceberg_field + .unwrap() + .field_type + .as_primitive_type(), + iceberg_field.unwrap().field_type.as_primitive_type(), + ) { + return false; + } + + column_map.insert(field_id, idx); + true + }); + + // Schema evolution: New columns may not exist in old Parquet files. + // We only project existing columns; RecordBatchTransformer adds default/NULL values. + let mut indices = vec![]; + for field_id in leaf_field_ids { + if let Some(col_idx) = column_map.get(field_id) { + indices.push(*col_idx); + } + } + + if indices.is_empty() { + // Edge case: All requested columns are new (don't exist in file). + // Project all columns so RecordBatchTransformer has a batch to transform. + Ok(ProjectionMask::all()) + } else { + Ok(ProjectionMask::leaves(parquet_schema, indices)) + } + } + + /// Fallback projection for Parquet files without field IDs. + /// Uses position-based matching: field ID N → column position N-1. + /// Projects entire top-level columns (including nested content) for iceberg-java compatibility. 
+ fn get_arrow_projection_mask_fallback( + field_ids: &[i32], + parquet_schema: &SchemaDescriptor, + ) -> Result { + // Position-based: field_id N → column N-1 (field IDs are 1-indexed) + let parquet_root_fields = parquet_schema.root_schema().get_fields(); + let mut root_indices = vec![]; + + for field_id in field_ids.iter() { + let parquet_pos = (*field_id - 1) as usize; + + if parquet_pos < parquet_root_fields.len() { + root_indices.push(parquet_pos); + } + // RecordBatchTransformer adds missing columns with NULL values + } + + if root_indices.is_empty() { + Ok(ProjectionMask::all()) + } else { + Ok(ProjectionMask::roots(parquet_schema, root_indices)) + } + } +} + +/// Build the map of parquet field id to Parquet column index in the schema. +/// Returns None if the Parquet file doesn't have field IDs embedded (e.g., migrated tables). +pub(super) fn build_field_id_map( + parquet_schema: &SchemaDescriptor, +) -> Result>> { + let mut column_map = HashMap::new(); + + for (idx, field) in parquet_schema.columns().iter().enumerate() { + let field_type = field.self_type(); + match field_type { + ParquetType::PrimitiveType { basic_info, .. } => { + if !basic_info.has_id() { + return Ok(None); + } + column_map.insert(basic_info.id(), idx); + } + ParquetType::GroupType { .. } => { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Leaf column in schema should be primitive type but got {field_type:?}" + ), + )); + } + }; + } + + Ok(Some(column_map)) +} + +/// Helper for `build_fallback_field_id_map`: counts the leaf columns under one Parquet field. +/// +/// Returns the number of primitive (leaf) columns in a Parquet type, recursing into groups. +fn leaf_count(ty: &parquet::schema::types::Type) -> usize { + if ty.is_primitive() { + 1 + } else { + ty.get_fields().iter().map(|f| leaf_count(f)).sum() + } +} + +/// Builds a mapping from fallback field IDs to leaf column indices for Parquet files +/// without embedded field IDs. Returns entries only for primitive top-level fields.
+/// +/// Must use top-level field positions (not leaf column positions) to stay consistent +/// with `add_fallback_field_ids_to_arrow_schema`, which assigns ordinal IDs to +/// top-level Arrow fields. Using leaf positions instead would produce wrong indices +/// when nested types (struct/list/map) expand into multiple leaf columns. +/// +/// Mirrors iceberg-java's ParquetSchemaUtil.addFallbackIds() which iterates +/// fileSchema.getFields() assigning ordinal IDs to top-level fields. +pub(super) fn build_fallback_field_id_map( + parquet_schema: &SchemaDescriptor, +) -> HashMap { + let mut column_map = HashMap::new(); + let mut leaf_idx = 0; + + for (top_pos, field) in parquet_schema.root_schema().get_fields().iter().enumerate() { + let field_id = (top_pos + 1) as i32; + if field.is_primitive() { + column_map.insert(field_id, leaf_idx); + } + leaf_idx += leaf_count(field); + } + + column_map +} + +/// Apply name mapping to Arrow schema for Parquet files lacking field IDs. +/// +/// Assigns Iceberg field IDs based on column names using the name mapping, +/// enabling correct projection on migrated files (e.g., from Hive/Spark via add_files). +/// +/// Per Iceberg spec Column Projection rule #2: +/// "Use schema.name-mapping.default metadata to map field id to columns without field id" +/// https://iceberg.apache.org/spec/#column-projection +/// +/// Corresponds to Java's ParquetSchemaUtil.applyNameMapping() and ApplyNameMapping visitor. +/// The key difference is Java operates on Parquet MessageType, while we operate on Arrow Schema. 
+/// +/// # Arguments +/// * `arrow_schema` - Arrow schema from Parquet file (without field IDs) +/// * `name_mapping` - Name mapping from table metadata (TableProperties.DEFAULT_NAME_MAPPING) +/// +/// # Returns +/// Arrow schema with field IDs assigned based on name mapping +pub(super) fn apply_name_mapping_to_arrow_schema( + arrow_schema: ArrowSchemaRef, + name_mapping: &NameMapping, +) -> Result> { + debug_assert!( + arrow_schema + .fields() + .iter() + .next() + .is_none_or(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()), + "Schema already has field IDs - name mapping should not be applied" + ); + + let fields_with_mapped_ids: Vec<_> = arrow_schema + .fields() + .iter() + .map(|field| { + // Look up this column name in name mapping to get the Iceberg field ID. + // Corresponds to Java's ApplyNameMapping visitor which calls + // nameMapping.find(currentPath()) and returns field.withId() if found. + // + // If the field isn't in the mapping, leave it WITHOUT assigning an ID + // (matching Java's behavior of returning the field unchanged). + // Later, during projection, fields without IDs are filtered out. + let mapped_field_opt = name_mapping + .fields() + .iter() + .find(|f| f.names().contains(&field.name().to_string())); + + let mut metadata = field.metadata().clone(); + + if let Some(mapped_field) = mapped_field_opt + && let Some(field_id) = mapped_field.field_id() + { + // Field found in mapping with a field_id → assign it + metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), field_id.to_string()); + } + // If field_id is None, leave the field without an ID (will be filtered by projection) + + Field::new(field.name(), field.data_type().clone(), field.is_nullable()) + .with_metadata(metadata) + }) + .collect(); + + Ok(Arc::new(ArrowSchema::new_with_metadata( + fields_with_mapped_ids, + arrow_schema.metadata().clone(), + ))) +} + +/// Add position-based fallback field IDs to Arrow schema for Parquet files lacking them. 
+/// Enables projection on migrated files (e.g., from Hive/Spark). +/// +/// Why at schema level (not per-batch): Efficiency - avoids repeated schema modification. +/// Why only top-level: Nested projection uses leaf column indices, not parent struct IDs. +/// Why 1-indexed: Compatibility with iceberg-java's ParquetSchemaUtil.addFallbackIds(). +pub(super) fn add_fallback_field_ids_to_arrow_schema( + arrow_schema: &ArrowSchemaRef, +) -> Arc { + debug_assert!( + arrow_schema + .fields() + .iter() + .next() + .is_none_or(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()), + "Schema already has field IDs" + ); + + let fields_with_fallback_ids: Vec<_> = arrow_schema + .fields() + .iter() + .enumerate() + .map(|(pos, field)| { + let mut metadata = field.metadata().clone(); + let field_id = (pos + 1) as i32; // 1-indexed for Java compatibility + metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), field_id.to_string()); + + Field::new(field.name(), field.data_type().clone(), field.is_nullable()) + .with_metadata(metadata) + }) + .collect(); + + Arc::new(ArrowSchema::new_with_metadata( + fields_with_fallback_ids, + arrow_schema.metadata().clone(), + )) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::fs::File; + use std::sync::Arc; + + use arrow_array::cast::AsArray; + use arrow_array::{ArrayRef, RecordBatch, StringArray}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + use futures::TryStreamExt; + use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY, ProjectionMask}; + use parquet::basic::Compression; + use parquet::file::properties::WriterProperties; + use parquet::schema::parser::parse_message_type; + use parquet::schema::types::SchemaDescriptor; + use tempfile::TempDir; + + use crate::ErrorKind; + use crate::arrow::{ArrowReader, ArrowReaderBuilder}; + use crate::expr::{Bind, Reference}; + use crate::io::FileIO; + use crate::scan::{FileScanTask, FileScanTaskStream}; + use crate::spec::{DataFileFormat, 
Datum, NestedField, PrimitiveType, Schema, Type}; + + #[test] + fn test_arrow_projection_mask() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![1]) + .with_fields(vec![ + NestedField::required(1, "c1", Type::Primitive(PrimitiveType::String)).into(), + NestedField::optional(2, "c2", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional( + 3, + "c3", + Type::Primitive(PrimitiveType::Decimal { + precision: 38, + scale: 3, + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("c1", DataType::Utf8, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + // Type not supported + Field::new("c2", DataType::Duration(TimeUnit::Microsecond), true).with_metadata( + HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())]), + ), + // Precision is beyond the supported range + Field::new("c3", DataType::Decimal128(39, 3), true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "3".to_string(), + )])), + ])); + + let message_type = " +message schema { + required binary c1 (STRING) = 1; + optional int32 c2 (INTEGER(8,true)) = 2; + optional fixed_len_byte_array(17) c3 (DECIMAL(39,3)) = 3; +} + "; + let parquet_type = parse_message_type(message_type).expect("should parse schema"); + let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_type)); + + // Try projecting the fields c2 and c3 with the unsupported data types + let err = ArrowReader::get_arrow_projection_mask( + &[1, 2, 3], + &schema, + &parquet_schema, + &arrow_schema, + false, + ) + .unwrap_err(); + + assert_eq!(err.kind(), ErrorKind::DataInvalid); + assert_eq!( + err.to_string(), + "DataInvalid => Unsupported Arrow data type: Duration(µs)".to_string() + ); + + // Omitting field c2, we still get an error due to c3 being selected + let err = ArrowReader::get_arrow_projection_mask( + &[1, 3], + 
&schema, + &parquet_schema, + &arrow_schema, + false, + ) + .unwrap_err(); + + assert_eq!(err.kind(), ErrorKind::DataInvalid); + assert_eq!( + err.to_string(), + "DataInvalid => Failed to create decimal type, source: DataInvalid => Decimals with precision larger than 38 are not supported: 39".to_string() + ); + + // Finally avoid selecting fields with unsupported data types + let mask = ArrowReader::get_arrow_projection_mask( + &[1], + &schema, + &parquet_schema, + &arrow_schema, + false, + ) + .expect("Some ProjectionMask"); + assert_eq!(mask, ProjectionMask::leaves(&parquet_schema, vec![0])); + } + + /// Test schema evolution: reading old Parquet file (with only column 'a') + /// using a newer table schema (with columns 'a' and 'b'). + /// This tests that: + /// 1. get_arrow_projection_mask allows missing columns + /// 2. RecordBatchTransformer adds missing column 'b' with NULL values + #[tokio::test] + async fn test_schema_evolution_add_column() { + use arrow_array::{Array, Int32Array}; + + // New table schema: columns 'a' and 'b' (b was added later, file only has 'a') + let new_schema = Arc::new( + Schema::builder() + .with_schema_id(2) + .with_fields(vec![ + NestedField::required(1, "a", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "b", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + // Create Arrow schema for old Parquet file (only has column 'a') + let arrow_schema_old = Arc::new(ArrowSchema::new(vec![ + Field::new("a", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + // Write old Parquet file with only column 'a' + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let data_a = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef; + let to_write = RecordBatch::try_new(arrow_schema_old.clone(), 
vec![data_a]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let file = File::create(format!("{table_location}/old_file.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + // Read the old Parquet file using the NEW schema (with column 'b') + let reader = ArrowReaderBuilder::new(file_io).build(); + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/old_file.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/old_file.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: new_schema.clone(), + project_field_ids: vec![1, 2], // Request both columns 'a' and 'b' + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + // Verify we got the correct data + assert_eq!(result.len(), 1); + let batch = &result[0]; + + // Should have 2 columns now + assert_eq!(batch.num_columns(), 2); + assert_eq!(batch.num_rows(), 3); + + // Column 'a' should have the original data + let col_a = batch + .column(0) + .as_primitive::(); + assert_eq!(col_a.values(), &[1, 2, 3]); + + // Column 'b' should be all NULLs (it didn't exist in the old file) + let col_b = batch + .column(1) + .as_primitive::(); + assert_eq!(col_b.null_count(), 3); + assert!(col_b.is_null(0)); + assert!(col_b.is_null(1)); + assert!(col_b.is_null(2)); + } + + /// Test reading Parquet files without field ID metadata (e.g., migrated tables). + /// This exercises the position-based fallback path. 
+ /// + /// Corresponds to Java's ParquetSchemaUtil.addFallbackIds() + pruneColumnsFallback() + /// in /parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java + #[tokio::test] + async fn test_read_parquet_file_without_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "age", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + // Parquet file from a migrated table - no field ID metadata + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int32, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let name_data = vec!["Alice", "Bob", "Charlie"]; + let age_data = vec![30, 25, 35]; + + use arrow_array::Int32Array; + let name_col = Arc::new(StringArray::from(name_data.clone())) as ArrayRef; + let age_col = Arc::new(Int32Array::from(age_data.clone())) as ArrayRef; + + let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![name_col, age_col]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: 
schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 3); + assert_eq!(batch.num_columns(), 2); + + // Verify position-based mapping: field_id 1 → position 0, field_id 2 → position 1 + let name_array = batch.column(0).as_string::(); + assert_eq!(name_array.value(0), "Alice"); + assert_eq!(name_array.value(1), "Bob"); + assert_eq!(name_array.value(2), "Charlie"); + + let age_array = batch + .column(1) + .as_primitive::(); + assert_eq!(age_array.value(0), 30); + assert_eq!(age_array.value(1), 25); + assert_eq!(age_array.value(2), 35); + } + + /// Test reading Parquet files without field IDs with partial projection. + /// Only a subset of columns are requested, verifying position-based fallback + /// handles column selection correctly. 
+ #[tokio::test] + async fn test_read_parquet_without_field_ids_partial_projection() { + use arrow_array::Int32Array; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "col1", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "col2", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(3, "col3", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(4, "col4", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("col1", DataType::Utf8, false), + Field::new("col2", DataType::Int32, false), + Field::new("col3", DataType::Utf8, false), + Field::new("col4", DataType::Int32, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let col1_data = Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef; + let col2_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; + let col3_data = Arc::new(StringArray::from(vec!["c", "d"])) as ArrayRef; + let col4_data = Arc::new(Int32Array::from(vec![30, 40])) as ArrayRef; + + let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![ + col1_data, col2_data, col3_data, col4_data, + ]) + .unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + 
record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 3], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 2); + + let col1_array = batch.column(0).as_string::(); + assert_eq!(col1_array.value(0), "a"); + assert_eq!(col1_array.value(1), "b"); + + let col3_array = batch.column(1).as_string::(); + assert_eq!(col3_array.value(0), "c"); + assert_eq!(col3_array.value(1), "d"); + } + + /// Test reading Parquet files without field IDs with schema evolution. + /// The Iceberg schema has more fields than the Parquet file, testing that + /// missing columns are filled with NULLs. 
+ #[tokio::test] + async fn test_read_parquet_without_field_ids_schema_evolution() { + use arrow_array::{Array, Int32Array}; + + // Schema with field 3 added after the file was written + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "age", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(3, "city", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int32, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; + let age_data = Arc::new(Int32Array::from(vec![30, 25])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![name_data, age_data]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2, 3], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + 
case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 3); + + let name_array = batch.column(0).as_string::(); + assert_eq!(name_array.value(0), "Alice"); + assert_eq!(name_array.value(1), "Bob"); + + let age_array = batch + .column(1) + .as_primitive::(); + assert_eq!(age_array.value(0), 30); + assert_eq!(age_array.value(1), 25); + + // Verify missing column filled with NULLs + let city_array = batch.column(2).as_string::(); + assert_eq!(city_array.null_count(), 2); + assert!(city_array.is_null(0)); + assert!(city_array.is_null(1)); + } + + /// Test reading Parquet files without field IDs that have multiple row groups. + /// This ensures the position-based fallback works correctly across row group boundaries. + #[tokio::test] + async fn test_read_parquet_without_field_ids_multiple_row_groups() { + use arrow_array::Int32Array; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "value", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("value", DataType::Int32, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + // Small row group size to create multiple row groups + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_write_batch_size(2) + .set_max_row_group_row_count(Some(2)) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = 
ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); + + // Write 6 rows in 3 batches (will create 3 row groups) + for batch_num in 0..3 { + let name_data = Arc::new(StringArray::from(vec![ + format!("name_{}", batch_num * 2), + format!("name_{}", batch_num * 2 + 1), + ])) as ArrayRef; + let value_data = + Arc::new(Int32Array::from(vec![batch_num * 2, batch_num * 2 + 1])) as ArrayRef; + + let batch = + RecordBatch::try_new(arrow_schema.clone(), vec![name_data, value_data]).unwrap(); + writer.write(&batch).expect("Writing batch"); + } + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + + let mut all_names = Vec::new(); + let mut all_values = Vec::new(); + + for batch in &result { + let name_array = batch.column(0).as_string::(); + let value_array = batch + .column(1) + .as_primitive::(); + + for i in 0..batch.num_rows() { + all_names.push(name_array.value(i).to_string()); + all_values.push(value_array.value(i)); + } + } + + assert_eq!(all_names.len(), 6); + assert_eq!(all_values.len(), 6); + + for i in 0..6 { + assert_eq!(all_names[i], format!("name_{i}")); + assert_eq!(all_values[i], i as i32); + } + } + + /// Test reading Parquet files without field IDs with nested types (struct). 
+ /// Java's pruneColumnsFallback() projects entire top-level columns including nested content. + /// This test verifies that a top-level struct field is projected correctly with all its nested fields. + #[tokio::test] + async fn test_read_parquet_without_field_ids_with_struct() { + use arrow_array::{Int32Array, StructArray}; + use arrow_schema::Fields; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required( + 2, + "person", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::required( + 3, + "name", + Type::Primitive(PrimitiveType::String), + ) + .into(), + NestedField::required(4, "age", Type::Primitive(PrimitiveType::Int)) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new( + "person", + DataType::Struct(Fields::from(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int32, false), + ])), + false, + ), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let id_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; + let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; + let age_data = Arc::new(Int32Array::from(vec![30, 25])) as ArrayRef; + let person_data = Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("name", DataType::Utf8, false)), + name_data, + ), + ( + Arc::new(Field::new("age", DataType::Int32, false)), + age_data, + ), + ])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![id_data, person_data]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = 
File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 2); + + let id_array = batch + .column(0) + .as_primitive::(); + assert_eq!(id_array.value(0), 1); + assert_eq!(id_array.value(1), 2); + + let person_array = batch.column(1).as_struct(); + assert_eq!(person_array.num_columns(), 2); + + let name_array = person_array.column(0).as_string::(); + assert_eq!(name_array.value(0), "Alice"); + assert_eq!(name_array.value(1), "Bob"); + + let age_array = person_array + .column(1) + .as_primitive::(); + assert_eq!(age_array.value(0), 30); + assert_eq!(age_array.value(1), 25); + } + + /// Test reading Parquet files without field IDs with schema evolution - column added in the middle. + /// When a new column is inserted between existing columns in the schema order, + /// the fallback projection must correctly map field IDs to output positions. 
+ #[tokio::test] + async fn test_read_parquet_without_field_ids_schema_evolution_add_column_in_middle() { + use arrow_array::{Array, Int32Array}; + + let arrow_schema_old = Arc::new(ArrowSchema::new(vec![ + Field::new("col0", DataType::Int32, true), + Field::new("col1", DataType::Int32, true), + ])); + + // New column added between existing columns: col0 (id=1), newCol (id=5), col1 (id=2) + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "col0", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(5, "newCol", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "col1", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let col0_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; + let col1_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema_old.clone(), vec![col0_data, col1_data]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 5, 2], + predicate: None, + deletes: vec![], + partition: None, + 
partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 3); + + let result_col0 = batch + .column(0) + .as_primitive::(); + assert_eq!(result_col0.value(0), 1); + assert_eq!(result_col0.value(1), 2); + + // New column should be NULL (doesn't exist in old file) + let result_newcol = batch + .column(1) + .as_primitive::(); + assert_eq!(result_newcol.null_count(), 2); + assert!(result_newcol.is_null(0)); + assert!(result_newcol.is_null(1)); + + let result_col1 = batch + .column(2) + .as_primitive::(); + assert_eq!(result_col1.value(0), 10); + assert_eq!(result_col1.value(1), 20); + } + + /// Test reading Parquet files without field IDs with a filter that eliminates all row groups. + /// During development of field ID mapping, we saw a panic when row_selection_enabled=true and + /// all row groups are filtered out. 
+ #[tokio::test] + async fn test_read_parquet_without_field_ids_filter_eliminates_all_rows() { + use arrow_array::{Float64Array, Int32Array}; + + // Schema with fields that will use fallback IDs 1, 2, 3 + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(3, "value", Type::Primitive(PrimitiveType::Double)) + .into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + Field::new("value", DataType::Float64, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + // Write data where all ids are >= 10 + let id_data = Arc::new(Int32Array::from(vec![10, 11, 12])) as ArrayRef; + let name_data = Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef; + let value_data = Arc::new(Float64Array::from(vec![100.0, 200.0, 300.0])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![id_data, name_data, value_data]) + .unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + // Filter that eliminates all row groups: id < 5 + let predicate = Reference::new("id").less_than(Datum::int(5)); + + // Enable both row_group_filtering and row_selection - triggered the panic + let reader = ArrowReaderBuilder::new(file_io) + .with_row_group_filtering_enabled(true) + .with_row_selection_enabled(true) + .build(); + + 
let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2, 3], + predicate: Some(predicate.bind(schema, true).unwrap()), + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + // Should no longer panic + let result = reader + .read(tasks) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + // Should return empty results + assert!(result.is_empty() || result.iter().all(|batch| batch.num_rows() == 0)); + } + + /// Test bucket partitioning reads source column from data file (not partition metadata). + /// + /// This is an integration test verifying the complete ArrowReader pipeline with bucket partitioning. + /// It corresponds to TestRuntimeFiltering tests in Iceberg Java (e.g., testRenamedSourceColumnTable). 
+ /// + /// # Iceberg Spec Requirements + /// + /// Per the Iceberg spec "Column Projection" section: + /// > "Return the value from partition metadata if an **Identity Transform** exists for the field" + /// + /// This means: + /// - Identity transforms (e.g., `identity(dept)`) use constants from partition metadata + /// - Non-identity transforms (e.g., `bucket(4, id)`) must read source columns from data files + /// - Partition metadata for bucket transforms stores bucket numbers (0-3), NOT source values + /// + /// Java's PartitionUtil.constantsMap() implements this via: + /// ```java + /// if (field.transform().isIdentity()) { + /// idToConstant.put(field.sourceId(), converted); + /// } + /// ``` + /// + /// # What This Test Verifies + /// + /// This test ensures the full ArrowReader → RecordBatchTransformer pipeline correctly handles + /// bucket partitioning when FileScanTask provides partition_spec and partition_data: + /// + /// - Parquet file has field_id=1 named "id" with actual data [1, 5, 9, 13] + /// - FileScanTask specifies partition_spec with bucket(4, id) and partition_data with bucket=1 + /// - RecordBatchTransformer.constants_map() excludes bucket-partitioned field from constants + /// - ArrowReader correctly reads [1, 5, 9, 13] from the data file + /// - Values are NOT replaced with constant 1 from partition metadata + /// + /// # Why This Matters + /// + /// Without correct handling: + /// - Runtime filtering would break (e.g., `WHERE id = 5` would fail) + /// - Query results would be incorrect (all rows would have id=1) + /// - Bucket partitioning would be unusable for query optimization + /// + /// # References + /// - Iceberg spec: format/spec.md "Column Projection" + "Partition Transforms" + /// - Java test: spark/src/test/java/.../TestRuntimeFiltering.java + /// - Java impl: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java + #[tokio::test] + async fn test_bucket_partitioning_reads_source_column_from_file() { + use 
arrow_array::Int32Array; + + use crate::spec::{Literal, PartitionSpec, Struct, Transform}; + + // Iceberg schema with id and name columns + let schema = Arc::new( + Schema::builder() + .with_schema_id(0) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "name", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(), + ); + + // Partition spec: bucket(4, id) + let partition_spec = Arc::new( + PartitionSpec::builder(schema.clone()) + .with_spec_id(0) + .add_partition_field("id", "id_bucket", Transform::Bucket(4)) + .unwrap() + .build() + .unwrap(), + ); + + // Partition data: bucket value is 1 + let partition_data = Struct::from_iter(vec![Some(Literal::int(1))]); + + // Create Arrow schema with field IDs for Parquet file + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + Field::new("name", DataType::Utf8, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "2".to_string(), + )])), + ])); + + // Write Parquet file with data + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let id_data = Arc::new(Int32Array::from(vec![1, 5, 9, 13])) as ArrayRef; + let name_data = + Arc::new(StringArray::from(vec!["Alice", "Bob", "Charlie", "Dave"])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![id_data, name_data]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let file = File::create(format!("{}/data.parquet", &table_location)).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + // Read the 
Parquet file with partition spec and data + let reader = ArrowReaderBuilder::new(file_io).build(); + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/data.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/data.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: Some(partition_data), + partition_spec: Some(partition_spec), + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + // Verify we got the correct data + assert_eq!(result.len(), 1); + let batch = &result[0]; + + assert_eq!(batch.num_columns(), 2); + assert_eq!(batch.num_rows(), 4); + + // The id column MUST contain actual values from the Parquet file [1, 5, 9, 13], + // NOT the constant partition value 1 + let id_col = batch + .column(0) + .as_primitive::(); + assert_eq!(id_col.value(0), 1); + assert_eq!(id_col.value(1), 5); + assert_eq!(id_col.value(2), 9); + assert_eq!(id_col.value(3), 13); + + let name_col = batch.column(1).as_string::(); + assert_eq!(name_col.value(0), "Alice"); + assert_eq!(name_col.value(1), "Bob"); + assert_eq!(name_col.value(2), "Charlie"); + assert_eq!(name_col.value(3), "Dave"); + } + + /// Regression for : + /// predicate on a column after nested types in a migrated file (no field IDs). + /// Schema has struct, list, and map columns before the predicate target (`id`), + /// exercising the fallback field ID mapping across all nested type variants. 
+ #[tokio::test] + async fn test_predicate_on_migrated_file_with_nested_types() { + use serde::{Deserialize, Serialize}; + use serde_arrow::schema::{SchemaLike, TracingOptions}; + + #[derive(Serialize, Deserialize)] + struct Person { + name: String, + age: i32, + } + + #[derive(Serialize, Deserialize)] + struct Row { + person: Person, + people: Vec, + props: std::collections::BTreeMap, + id: i32, + } + + let rows = vec![ + Row { + person: Person { + name: "Alice".into(), + age: 30, + }, + people: vec![Person { + name: "Alice".into(), + age: 30, + }], + props: [("k1".into(), "v1".into())].into(), + id: 1, + }, + Row { + person: Person { + name: "Bob".into(), + age: 25, + }, + people: vec![Person { + name: "Bob".into(), + age: 25, + }], + props: [("k2".into(), "v2".into())].into(), + id: 2, + }, + Row { + person: Person { + name: "Carol".into(), + age: 40, + }, + people: vec![Person { + name: "Carol".into(), + age: 40, + }], + props: [("k3".into(), "v3".into())].into(), + id: 3, + }, + ]; + + let tracing_options = TracingOptions::default() + .map_as_struct(false) + .strings_as_large_utf8(false) + .sequence_as_large_list(false); + let fields = Vec::::from_type::(tracing_options).unwrap(); + let arrow_schema = Arc::new(ArrowSchema::new(fields.clone())); + let batch = serde_arrow::to_record_batch(&fields, &rows).unwrap(); + + // Fallback field IDs: person=1, people=2, props=3, id=4 + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required( + 1, + "person", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::required( + 5, + "name", + Type::Primitive(PrimitiveType::String), + ) + .into(), + NestedField::required(6, "age", Type::Primitive(PrimitiveType::Int)) + .into(), + ])), + ) + .into(), + NestedField::required( + 2, + "people", + Type::List(crate::spec::ListType { + element_field: NestedField::required( + 7, + "element", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::required( 
+ 8, + "name", + Type::Primitive(PrimitiveType::String), + ) + .into(), + NestedField::required( + 9, + "age", + Type::Primitive(PrimitiveType::Int), + ) + .into(), + ])), + ) + .into(), + }), + ) + .into(), + NestedField::required( + 3, + "props", + Type::Map(crate::spec::MapType { + key_field: NestedField::required( + 10, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::required( + 11, + "value", + Type::Primitive(PrimitiveType::String), + ) + .into(), + }), + ) + .into(), + NestedField::required(4, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/1.parquet"); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let file = File::create(&file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema, Some(props)).unwrap(); + writer.write(&batch).expect("Writing batch"); + writer.close().unwrap(); + + let predicate = Reference::new("id").greater_than(Datum::int(1)); + + let reader = ArrowReaderBuilder::new(FileIO::new_with_fs()) + .with_row_group_filtering_enabled(true) + .with_row_selection_enabled(true) + .build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), + start: 0, + length: 0, + record_count: None, + data_file_path: file_path, + data_file_format: DataFileFormat::Parquet, + schema: iceberg_schema.clone(), + project_field_ids: vec![4], + predicate: Some(predicate.bind(iceberg_schema, true).unwrap()), + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + let ids: Vec = 
result + .iter() + .flat_map(|b| { + b.column(0) + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect(); + assert_eq!(ids, vec![2, 3]); + } +} diff --git a/crates/iceberg/src/arrow/reader/row_filter.rs b/crates/iceberg/src/arrow/reader/row_filter.rs new file mode 100644 index 0000000000..52f7260cc6 --- /dev/null +++ b/crates/iceberg/src/arrow/reader/row_filter.rs @@ -0,0 +1,616 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Predicate-driven row filtering for `ArrowReader`: constructing Arrow `RowFilter`s +//! from Iceberg predicates, row-group selection based on column statistics, and +//! row-selection via the Parquet page index. Also includes byte-range row-group +//! filtering used for file splitting. 
+ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use parquet::arrow::ProjectionMask; +use parquet::arrow::arrow_reader::{ArrowPredicateFn, RowFilter, RowSelection}; +use parquet::file::metadata::ParquetMetaData; +use parquet::schema::types::SchemaDescriptor; + +use super::{ArrowReader, PredicateConverter}; +use crate::error::Result; +use crate::expr::BoundPredicate; +use crate::expr::visitors::bound_predicate_visitor::visit; +use crate::expr::visitors::page_index_evaluator::PageIndexEvaluator; +use crate::expr::visitors::row_group_metrics_evaluator::RowGroupMetricsEvaluator; +use crate::spec::Schema; +use crate::{Error, ErrorKind}; + +impl ArrowReader { + pub(super) fn get_row_filter( + predicates: &BoundPredicate, + parquet_schema: &SchemaDescriptor, + iceberg_field_ids: &HashSet, + field_id_map: &HashMap, + ) -> Result { + // Collect Parquet column indices from field ids. + // If the field id is not found in Parquet schema, it will be ignored due to schema evolution. + let mut column_indices = iceberg_field_ids + .iter() + .filter_map(|field_id| field_id_map.get(field_id).cloned()) + .collect::>(); + column_indices.sort(); + + // The converter that converts `BoundPredicates` to `ArrowPredicates` + let mut converter = PredicateConverter { + parquet_schema, + column_map: field_id_map, + column_indices: &column_indices, + }; + + // After collecting required leaf column indices used in the predicate, + // creates the projection mask for the Arrow predicates. 
+ let projection_mask = ProjectionMask::leaves(parquet_schema, column_indices.clone()); + let predicate_func = visit(&mut converter, predicates)?; + let arrow_predicate = ArrowPredicateFn::new(projection_mask, predicate_func); + Ok(RowFilter::new(vec![Box::new(arrow_predicate)])) + } + + pub(super) fn get_selected_row_group_indices( + predicate: &BoundPredicate, + parquet_metadata: &Arc, + field_id_map: &HashMap, + snapshot_schema: &Schema, + ) -> Result> { + let row_groups_metadata = parquet_metadata.row_groups(); + let mut results = Vec::with_capacity(row_groups_metadata.len()); + + for (idx, row_group_metadata) in row_groups_metadata.iter().enumerate() { + if RowGroupMetricsEvaluator::eval( + predicate, + row_group_metadata, + field_id_map, + snapshot_schema, + )? { + results.push(idx); + } + } + + Ok(results) + } + + pub(super) fn get_row_selection_for_filter_predicate( + predicate: &BoundPredicate, + parquet_metadata: &Arc, + selected_row_groups: &Option>, + field_id_map: &HashMap, + snapshot_schema: &Schema, + ) -> Result { + let Some(column_index) = parquet_metadata.column_index() else { + return Err(Error::new( + ErrorKind::Unexpected, + "Parquet file metadata does not contain a column index", + )); + }; + + let Some(offset_index) = parquet_metadata.offset_index() else { + return Err(Error::new( + ErrorKind::Unexpected, + "Parquet file metadata does not contain an offset index", + )); + }; + + // If all row groups were filtered out, return an empty RowSelection (select no rows) + if let Some(selected_row_groups) = selected_row_groups + && selected_row_groups.is_empty() + { + return Ok(RowSelection::from(Vec::new())); + } + + let mut selected_row_groups_idx = 0; + + let page_index = column_index + .iter() + .enumerate() + .zip(offset_index) + .zip(parquet_metadata.row_groups()); + + let mut results = Vec::new(); + for (((idx, column_index), offset_index), row_group_metadata) in page_index { + if let Some(selected_row_groups) = selected_row_groups { + // skip 
row groups that aren't present in selected_row_groups + if idx == selected_row_groups[selected_row_groups_idx] { + selected_row_groups_idx += 1; + } else { + continue; + } + } + + let selections_for_page = PageIndexEvaluator::eval( + predicate, + column_index, + offset_index, + row_group_metadata, + field_id_map, + snapshot_schema, + )?; + + results.push(selections_for_page); + + if let Some(selected_row_groups) = selected_row_groups + && selected_row_groups_idx == selected_row_groups.len() + { + break; + } + } + + Ok(results.into_iter().flatten().collect::>().into()) + } + + /// Filters row groups by byte range to support Iceberg's file splitting. + /// + /// Iceberg splits large files at row group boundaries, so we only read row groups + /// whose byte ranges overlap with [start, start+length). + pub(super) fn filter_row_groups_by_byte_range( + parquet_metadata: &Arc, + start: u64, + length: u64, + ) -> Result> { + let row_groups = parquet_metadata.row_groups(); + let mut selected = Vec::new(); + let end = start + length; + + // Row groups are stored sequentially after the 4-byte magic header. 
+ let mut current_byte_offset = 4u64; + + for (idx, row_group) in row_groups.iter().enumerate() { + let row_group_size = row_group.compressed_size() as u64; + let row_group_end = current_byte_offset + row_group_size; + + if current_byte_offset < end && start < row_group_end { + selected.push(idx); + } + + current_byte_offset = row_group_end; + } + + Ok(selected) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::fs::File; + use std::sync::Arc; + + use arrow_array::cast::AsArray; + use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, StringArray}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use futures::TryStreamExt; + use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; + use parquet::basic::Compression; + use parquet::file::properties::WriterProperties; + use tempfile::TempDir; + + use crate::arrow::{ArrowReader, ArrowReaderBuilder}; + use crate::expr::{Bind, Predicate, Reference}; + use crate::io::FileIO; + use crate::scan::{FileScanTask, FileScanTaskStream}; + use crate::spec::{DataFileFormat, Datum, NestedField, PrimitiveType, Schema, SchemaRef, Type}; + + async fn test_perform_read( + predicate: Predicate, + schema: SchemaRef, + table_location: String, + reader: ArrowReader, + ) -> Vec> { + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1], + predicate: Some(predicate.bind(schema, true).unwrap()), + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + result[0].columns()[0] + 
.as_string_opt::() + .unwrap() + .iter() + .map(|v| v.map(ToOwned::to_owned)) + .collect::>() + } + + fn setup_kleene_logic( + data_for_col_a: Vec>, + col_a_type: DataType, + ) -> (FileIO, SchemaRef, String, TempDir) { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "a", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("a", col_a_type.clone(), true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + + let file_io = FileIO::new_with_fs(); + + let col = match col_a_type { + DataType::Utf8 => Arc::new(StringArray::from(data_for_col_a)) as ArrayRef, + DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data_for_col_a)) as ArrayRef, + _ => panic!("unexpected col_a_type"), + }; + + let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![col]).unwrap(); + + // Write the Parquet files + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = + ArrowWriter::try_new(file, to_write.schema(), Some(props.clone())).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + + // writer must be closed to write footer + writer.close().unwrap(); + + (file_io, schema, table_location, tmp_dir) + } + + #[tokio::test] + async fn test_kleene_logic_or_behaviour() { + // a IS NULL OR a = 'foo' + let predicate = Reference::new("a") + .is_null() + .or(Reference::new("a").equal_to(Datum::string("foo"))); + + // Table data: [NULL, "foo", "bar"] + let data_for_col_a = vec![None, Some("foo".to_string()), Some("bar".to_string())]; + + // Expected: [NULL, "foo"]. 
+ let expected = vec![None, Some("foo".to_string())]; + + let (file_io, schema, table_location, _temp_dir) = + setup_kleene_logic(data_for_col_a, DataType::Utf8); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let result_data = test_perform_read(predicate, schema, table_location, reader).await; + + assert_eq!(result_data, expected); + } + + #[tokio::test] + async fn test_kleene_logic_and_behaviour() { + // a IS NOT NULL AND a != 'foo' + let predicate = Reference::new("a") + .is_not_null() + .and(Reference::new("a").not_equal_to(Datum::string("foo"))); + + // Table data: [NULL, "foo", "bar"] + let data_for_col_a = vec![None, Some("foo".to_string()), Some("bar".to_string())]; + + // Expected: ["bar"]. + let expected = vec![Some("bar".to_string())]; + + let (file_io, schema, table_location, _temp_dir) = + setup_kleene_logic(data_for_col_a, DataType::Utf8); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let result_data = test_perform_read(predicate, schema, table_location, reader).await; + + assert_eq!(result_data, expected); + } + + #[tokio::test] + async fn test_predicate_cast_literal() { + let predicates = vec![ + // a == 'foo' + (Reference::new("a").equal_to(Datum::string("foo")), vec![ + Some("foo".to_string()), + ]), + // a != 'foo' + ( + Reference::new("a").not_equal_to(Datum::string("foo")), + vec![Some("bar".to_string())], + ), + // STARTS_WITH(a, 'foo') + (Reference::new("a").starts_with(Datum::string("f")), vec![ + Some("foo".to_string()), + ]), + // NOT STARTS_WITH(a, 'foo') + ( + Reference::new("a").not_starts_with(Datum::string("f")), + vec![Some("bar".to_string())], + ), + // a < 'foo' + (Reference::new("a").less_than(Datum::string("foo")), vec![ + Some("bar".to_string()), + ]), + // a <= 'foo' + ( + Reference::new("a").less_than_or_equal_to(Datum::string("foo")), + vec![Some("foo".to_string()), Some("bar".to_string())], + ), + // a > 'foo' + ( + Reference::new("a").greater_than(Datum::string("bar")), + 
vec![Some("foo".to_string())], + ), + // a >= 'foo' + ( + Reference::new("a").greater_than_or_equal_to(Datum::string("foo")), + vec![Some("foo".to_string())], + ), + // a IN ('foo', 'bar') + ( + Reference::new("a").is_in([Datum::string("foo"), Datum::string("baz")]), + vec![Some("foo".to_string())], + ), + // a NOT IN ('foo', 'bar') + ( + Reference::new("a").is_not_in([Datum::string("foo"), Datum::string("baz")]), + vec![Some("bar".to_string())], + ), + ]; + + // Table data: ["foo", "bar"] + let data_for_col_a = vec![Some("foo".to_string()), Some("bar".to_string())]; + + let (file_io, schema, table_location, _temp_dir) = + setup_kleene_logic(data_for_col_a, DataType::LargeUtf8); + let reader = ArrowReaderBuilder::new(file_io).build(); + + for (predicate, expected) in predicates { + println!("testing predicate {predicate}"); + let result_data = test_perform_read( + predicate.clone(), + schema.clone(), + table_location.clone(), + reader.clone(), + ) + .await; + + assert_eq!(result_data, expected, "predicate={predicate}"); + } + } + + /// Verifies that file splits respect byte ranges and only read specific row groups. + #[tokio::test] + async fn test_file_splits_respect_byte_ranges() { + use arrow_array::Int32Array; + use parquet::file::reader::{FileReader, SerializedFileReader}; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/multi_row_group.parquet"); + + // Force each batch into its own row group for testing byte range filtering. 
+ let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( + (0..100).collect::>(), + ))]) + .unwrap(); + let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( + (100..200).collect::>(), + ))]) + .unwrap(); + let batch3 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( + (200..300).collect::>(), + ))]) + .unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_max_row_group_row_count(Some(100)) + .build(); + + let file = File::create(&file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); + writer.write(&batch1).expect("Writing batch 1"); + writer.write(&batch2).expect("Writing batch 2"); + writer.write(&batch3).expect("Writing batch 3"); + writer.close().unwrap(); + + // Read the file metadata to get row group byte positions + let file = File::open(&file_path).unwrap(); + let reader = SerializedFileReader::new(file).unwrap(); + let metadata = reader.metadata(); + + println!("File has {} row groups", metadata.num_row_groups()); + assert_eq!(metadata.num_row_groups(), 3, "Expected 3 row groups"); + + // Get byte positions for each row group + let row_group_0 = metadata.row_group(0); + let row_group_1 = metadata.row_group(1); + let row_group_2 = metadata.row_group(2); + + let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1" + let rg1_start = rg0_start + row_group_0.compressed_size() as u64; + let rg2_start = rg1_start + row_group_1.compressed_size() as u64; + let file_end = rg2_start + row_group_2.compressed_size() as u64; + + println!( + "Row group 0: {} rows, starts at byte {}, {} bytes compressed", + row_group_0.num_rows(), + rg0_start, + row_group_0.compressed_size() + ); + println!( + "Row group 1: {} rows, starts at byte {}, {} bytes compressed", + row_group_1.num_rows(), + rg1_start, + row_group_1.compressed_size() + ); + println!( + "Row group 2: {} 
rows, starts at byte {}, {} bytes compressed", + row_group_2.num_rows(), + rg2_start, + row_group_2.compressed_size() + ); + + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + // Task 1: read only the first row group + let task1 = FileScanTask { + file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), + start: rg0_start, + length: row_group_0.compressed_size() as u64, + record_count: Some(100), + data_file_path: file_path.clone(), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + // Task 2: read the second and third row groups + let task2 = FileScanTask { + file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), + start: rg1_start, + length: file_end - rg1_start, + record_count: Some(200), + data_file_path: file_path.clone(), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks1 = Box::pin(futures::stream::iter(vec![Ok(task1)])) as FileScanTaskStream; + let result1 = reader + .clone() + .read(tasks1) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + let total_rows_task1: usize = result1.iter().map(|b| b.num_rows()).sum(); + println!( + "Task 1 (bytes {}-{}) returned {} rows", + rg0_start, + rg0_start + row_group_0.compressed_size() as u64, + total_rows_task1 + ); + + let tasks2 = Box::pin(futures::stream::iter(vec![Ok(task2)])) as FileScanTaskStream; + let result2 = reader + .read(tasks2) + .unwrap() + .try_collect::>() + .await + .unwrap(); + + let total_rows_task2: usize = result2.iter().map(|b| b.num_rows()).sum(); + println!("Task 2 (bytes {rg1_start}-{file_end}) returned {total_rows_task2} rows"); + + 
assert_eq!( + total_rows_task1, 100, + "Task 1 should read only the first row group (100 rows), but got {total_rows_task1} rows" + ); + + assert_eq!( + total_rows_task2, 200, + "Task 2 should read only the second+third row groups (200 rows), but got {total_rows_task2} rows" + ); + + // Verify the actual data values are correct (not just the row count) + if total_rows_task1 > 0 { + let first_batch = &result1[0]; + let id_col = first_batch + .column(0) + .as_primitive::(); + let first_val = id_col.value(0); + let last_val = id_col.value(id_col.len() - 1); + println!("Task 1 data range: {first_val} to {last_val}"); + + assert_eq!(first_val, 0, "Task 1 should start with id=0"); + assert_eq!(last_val, 99, "Task 1 should end with id=99"); + } + + if total_rows_task2 > 0 { + let first_batch = &result2[0]; + let id_col = first_batch + .column(0) + .as_primitive::(); + let first_val = id_col.value(0); + println!("Task 2 first value: {first_val}"); + + assert_eq!(first_val, 100, "Task 2 should start with id=100, not id=0"); + } + } +} From 02e28d9ecca32d350979573004964e856bd1197c Mon Sep 17 00:00:00 2001 From: Jem Bishop <40360024+jembishop@users.noreply.github.com> Date: Fri, 24 Apr 2026 23:26:07 +0100 Subject: [PATCH 28/45] fix: allow missing sequence-number in v2 snapshots for v1-upgraded tables (#2127) ## Which issue does this PR close? Didn't make an issue sorry. Very small change. ## What changes are included in this PR? After upgrading to v2 table I got complaints that this field does not exist, so Added a default for sequence number for this struct. I think this should be ok, as this is treated as 0 for iceberg v1 wrt to v2 compat in other contexts? But would like some confirmation. ## Are these changes tested? Tested that it fixes my problem, yes. 
--- crates/iceberg/src/spec/snapshot.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/crates/iceberg/src/spec/snapshot.rs b/crates/iceberg/src/spec/snapshot.rs index 72b5417c47..3b8a3c934e 100644 --- a/crates/iceberg/src/spec/snapshot.rs +++ b/crates/iceberg/src/spec/snapshot.rs @@ -291,6 +291,7 @@ pub(super) mod _serde { pub snapshot_id: i64, #[serde(skip_serializing_if = "Option::is_none")] pub parent_snapshot_id: Option, + #[serde(default)] pub sequence_number: i64, pub timestamp_ms: i64, pub manifest_list: String, From bcc2b5a444fe97e0f4e00ebca707dbaa0c0ed5e1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Apr 2026 08:56:38 +0800 Subject: [PATCH 29/45] chore(deps): Bump tokio from 1.51.1 to 1.52.1 (#2374) Bumps [tokio](https://github.com/tokio-rs/tokio) from 1.51.1 to 1.52.1.
Release notes

Sourced from tokio's releases.

Tokio v1.52.1

1.52.1 (April 16th, 2026)

Fixed

  • runtime: revert #7757 to fix a regression (#8056) that causes spawn_blocking to hang (#8057)

#7757: tokio-rs/tokio#7757 #8056: tokio-rs/tokio#8056 #8057: tokio-rs/tokio#8057

Tokio v1.52.0

1.52.0 (April 14th, 2026)

Added

  • io: AioSource::register_borrowed for I/O safety support (#7992)
  • net: add try_io function to unix::pipe sender and receiver types (#8030)

Added (unstable)

  • runtime: Builder::enable_eager_driver_handoff setting to enable eager handoff of the I/O and time drivers before polling tasks (#8010)
  • taskdump: add trace_with() for customized task dumps (#8025)
  • taskdump: allow impl FnMut() in trace_with instead of just fn() (#8040)
  • fs: support io_uring in AsyncRead for File (#7907)

Changed

  • runtime: improve spawn_blocking scalability with sharded queue (#7757)
  • runtime: use compare_exchange_weak() in worker queue (#8028)

Fixed

  • runtime: overflow second half of tasks when local queue is filled instead of first half (#8029)

Documented

  • docs: fix typo in oneshot::Sender::send docs (#8026)
  • docs: hide #[tokio::main] attribute in the docs of sync::watch (#8035)
  • net: add docs on ConnectionRefused errors with UDP sockets (#7870)

#7757: tokio-rs/tokio#7757 #7870: tokio-rs/tokio#7870 #7907: tokio-rs/tokio#7907 #7992: tokio-rs/tokio#7992 #8010: tokio-rs/tokio#8010 #8025: tokio-rs/tokio#8025 #8026: tokio-rs/tokio#8026 #8028: tokio-rs/tokio#8028 #8029: tokio-rs/tokio#8029

... (truncated)

Commits
  • 905c146 chore: prepare to release v1.52.1 (#8059)
  • 56aaa43 rt: revert #7757 to fix regression in spawn_blocking (#8057)
  • 57ff47a ci: update trybuild to expect output from rustc 1.95.0 (#8058)
  • 812de3e ci: bump taiki-e/cache-cargo-install-action from 1 to 3 (#8053)
  • ba82e73 ci: use Dependabot to keep github actions up to date (#8052)
  • 2e85f9d ci: replace cirrus-ci with freebsd-vm (#8041)
  • a7e1cd8 ci: update GitHub Actions workflows to use latest tool versions (#8047)
  • 5f7be0a chore: perpare 1.52.0 (#8045)
  • 36d12d2 taskdump: allow impl FnMut() in taskdumps instead of just fn() (#8040)
  • f943312 fs: support io-uring in AsyncRead for File (#7907)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=tokio&package-manager=cargo&previous-version=1.51.1&new-version=1.52.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 528aaf023f..d9448e52fe 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -157,7 +157,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -168,7 +168,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -1324,7 +1324,7 @@ version = "3.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] @@ -2518,7 +2518,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -2671,7 +2671,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -3302,7 +3302,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.3", + "socket2 0.5.10", "tokio", "tower-service", "tracing", @@ -3873,7 +3873,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -4422,7 +4422,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + 
"windows-sys 0.59.0", ] [[package]] @@ -5191,7 +5191,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.6.3", + "socket2 0.5.10", "thiserror 2.0.18", "tokio", "tracing", @@ -5228,7 +5228,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.3", + "socket2 0.5.10", "tracing", "windows-sys 0.60.2", ] @@ -5693,7 +5693,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -6217,7 +6217,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -6656,7 +6656,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -6792,9 +6792,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.51.1" +version = "1.52.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f66bf9585cda4b724d3e78ab34b73fb2bbaba9011b9bfdf69dc836382ea13b8c" +checksum = "b67dee974fe86fd92cc45b7a95fdd2f99a36a6d7b0d431a231178d3d670bbcc6" dependencies = [ "bytes", "libc", @@ -7552,7 +7552,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] From 7e84157818bb09e9d6bdfc7d3bdc4061a6d078b6 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Apr 2026 09:20:03 +0800 Subject: [PATCH 30/45] chore(deps): Bump datafusion-cli from 53.0.0 to 53.1.0 (#2373) Bumps [datafusion-cli](https://github.com/apache/datafusion) from 53.0.0 to 53.1.0.
Commits
  • eae7bf4 [branch-53] Update version and add Changelog (#21559)
  • 637acc4 [branch-53] chore: update deps for cargo audit (#21415) (#21587)
  • 01f2d91 [branch-53] Restore Sort unparser guard for correct ORDER BY placement (#2065...
  • 242fb76 [branch-53] fix: foreign inner ffi types (#21439) (#21524)
  • 6fc7114 [branch-53] fix: FilterExec should drop projection when apply projection push...
  • 61d8483 [branch-53] fix: use datafusion_expr instead of datafusion crate in spark bit...
  • 3224e0c [branch-53] fix: use spill writer's schema instead of the first batch schema ...
  • d24faa0 [branch-53] chore: Optimize schema rewriter usages (#21158) (#21183)
  • c45c2ce [branch-53] Substrait join consumer should not merge nullability of join keys...
  • 2c3a360 [branch-53] Fix push_down_filter for children with non-empty fetch fields (#2...
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=datafusion-cli&package-manager=cargo&previous-version=53.0.0&new-version=53.1.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- Cargo.lock | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d9448e52fe..66a5599493 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1086,7 +1086,7 @@ version = "3.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c" dependencies = [ - "darling 0.23.0", + "darling 0.20.11", "ident_case", "prettyplease", "proc-macro2", @@ -1747,9 +1747,9 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8735220c84a731c3917dce75ec837a8376eddf5462b0c5dbaf5a2e354c9b6e05" +checksum = "84a22c001ad1ac11cda09dab69b151eef5b1a992e23bc524ab0d1e63e5dea327" dependencies = [ "arrow", "async-trait", @@ -5078,7 +5078,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", - "itertools 0.14.0", + "itertools 0.13.0", "log", "multimap", "petgraph", @@ -5097,7 +5097,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools 0.13.0", "proc-macro2", "quote", "syn", From 406ca384da07a8da50aff791573dbf5158414d70 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Apr 2026 09:22:08 +0800 Subject: [PATCH 31/45] chore(deps): Bump crate-ci/typos from 1.45.0 to 1.45.1 (#2372) Bumps [crate-ci/typos](https://github.com/crate-ci/typos) from 1.45.0 to 1.45.1.
Release notes

Sourced from crate-ci/typos's releases.

v1.45.1

[1.45.1] - 2026-04-13

Fixes

  • (action) Use a temp dir for caching
Changelog

Sourced from crate-ci/typos's changelog.

Change Log

All notable changes to this project will be documented in this file.

The format is based on Keep a Changelog and this project adheres to Semantic Versioning.

[Unreleased] - ReleaseDate

[1.45.1] - 2026-04-13

Fixes

  • (action) Use a temp dir for caching

[1.45.0] - 2026-04-01

Features

  • Updated the dictionary with the March 2026 changes

[1.44.0] - 2026-02-27

Features

[1.43.5] - 2026-02-16

Fixes

  • (pypi) Hopefully fix the sdist build

[1.43.4] - 2026-02-09

Fixes

  • Don't correct pincher

[1.43.3] - 2026-02-06

Fixes

  • (action) Adjust how typos are reported to github

[1.43.2] - 2026-02-05

Fixes

  • Don't correct certifi in Python

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=crate-ci/typos&package-manager=github_actions&previous-version=1.45.0&new-version=1.45.1)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci_typos.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_typos.yml b/.github/workflows/ci_typos.yml index 089ddfe8e2..fff347e638 100644 --- a/.github/workflows/ci_typos.yml +++ b/.github/workflows/ci_typos.yml @@ -47,4 +47,4 @@ jobs: with: persist-credentials: false - name: Check typos - uses: crate-ci/typos@02ea592e44b3a53c302f697cddca7641cd051c3d # v1.45.0 + uses: crate-ci/typos@cf5f1c29a8ac336af8568821ec41919923b05a83 # v1.45.1 From 7e3f8276b5081bd6b10d5ee8d7528d0aa3db4fc4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Apr 2026 09:22:49 +0800 Subject: [PATCH 32/45] chore(deps): Bump astral-sh/setup-uv from 8.0.0 to 8.1.0 (#2371) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [astral-sh/setup-uv](https://github.com/astral-sh/setup-uv) from 8.0.0 to 8.1.0.
Release notes

Sourced from astral-sh/setup-uv's releases.

v8.1.0 🌈 New input no-project

Changes

This adds a new boolean input no-project. It only makes sense to use in combination with activate-environment: true, and it appends --no-project to the uv venv call. This is useful, for example, if you have a pyproject.toml file with parts that uv cannot parse.

🚀 Enhancements

  • Add input no-project in combination with activate-environment @​eifinger (#856)

🧰 Maintenance

📚 Documentation

⬆️ Dependency updates

  • chore(deps): bump release-drafter/release-drafter from 7.1.1 to 7.2.0 @dependabot[bot] (#855)
Commits
  • 0880764 fix: grant contents:write to validate-release job (#860)
  • 717d6ab Add a release-gate step to the release workflow (#859)
  • 5a911eb Draft commitish releases (#858)
  • 080c31e Add action-types.yml to instructions (#857)
  • b3e97d2 Add input no-project in combination with activate-environment (#856)
  • 7dd591d chore(deps): bump release-drafter/release-drafter from 7.1.1 to 7.2.0 (#855)
  • 1541b77 chore: update known checksums for 0.11.7 (#853)
  • cdfb2ee Refactor version resolving (#852)
  • cb84d12 chore: update known checksums for 0.11.6 (#850)
  • 1912cc6 chore: update known checksums for 0.11.5 (#845)
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=astral-sh/setup-uv&package-manager=github_actions&previous-version=8.0.0&new-version=8.1.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/bindings_python_ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/bindings_python_ci.yml b/.github/workflows/bindings_python_ci.yml index 4483a53310..842fce7f83 100644 --- a/.github/workflows/bindings_python_ci.yml +++ b/.github/workflows/bindings_python_ci.yml @@ -63,7 +63,7 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: persist-credentials: false - - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 + - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 with: version: "0.9.3" enable-cache: true @@ -100,7 +100,7 @@ jobs: working-directory: "bindings/python" command: build args: --out dist -i python3.12 # Explicitly set interpreter; manylinux containers have multiple Pythons and maturin may pick an older one - - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # v8.0.0 + - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 with: version: "0.9.3" enable-cache: true From 94426495aafbf328de3fa614113a776f542b82a0 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Apr 2026 09:23:35 +0800 Subject: [PATCH 33/45] chore(deps): Bump taiki-e/install-action from 2.75.7 to 2.75.18 (#2370) Bumps [taiki-e/install-action](https://github.com/taiki-e/install-action) from 2.75.7 to 2.75.18.
Release notes

Sourced from taiki-e/install-action's releases.

2.75.18

  • Update vacuum@latest to 0.26.1.

  • Update wasm-tools@latest to 1.247.0.

  • Update mise@latest to 2026.4.16.

  • Update espup@latest to 0.17.1.

  • Update trivy@latest to 0.70.0.

2.75.17

  • Update tombi@latest to 0.9.18.

  • Update mise@latest to 2026.4.15.

2.75.16

  • Update uv@latest to 0.11.7.

  • Update mise@latest to 2026.4.14.

  • Update vacuum@latest to 0.25.9.

  • Update cargo-machete@latest to 0.9.2.

  • Update cargo-deny@latest to 0.19.4.

2.75.15

  • Update cargo-nextest@latest to 0.9.133.

  • Update biome@latest to 2.4.12.

2.75.14

2.75.13

  • Update zizmor@latest to 1.24.1.

2.75.12

  • Update typos@latest to 1.45.1.

  • Update cargo-xwin@latest to 0.21.5.

  • Update cargo-binstall@latest to 1.18.1.

2.75.11

... (truncated)

Changelog

Sourced from taiki-e/install-action's changelog.

Changelog

All notable changes to this project will be documented in this file.

This project adheres to Semantic Versioning.

[Unreleased]

  • Update mise@latest to 2026.4.20.

[2.75.22] - 2026-04-25

  • Update tombi@latest to 0.9.22.

  • Update biome@latest to 2.4.13.

[2.75.21] - 2026-04-24

  • Update mise@latest to 2026.4.19.

  • Update tombi@latest to 0.9.21.

  • Update syft@latest to 1.43.0.

[2.75.20] - 2026-04-23

  • Update prek@latest to 0.3.10.

  • Update cargo-xwin@latest to 0.22.0.

[2.75.19] - 2026-04-21

  • Update wasmtime@latest to 44.0.0.

  • Update tombi@latest to 0.9.20.

  • Update martin@latest to 1.6.0.

  • Update just@latest to 1.50.0.

  • Update mise@latest to 2026.4.18.

  • Update rclone@latest to 1.73.5.

[2.75.18] - 2026-04-19

... (truncated)

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=taiki-e/install-action&package-manager=github_actions&previous-version=2.75.7&new-version=2.75.18)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1949015462..8b31386e47 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -163,7 +163,7 @@ jobs: - name: Install cargo-nextest if: matrix.test-suite.name == 'default' - uses: taiki-e/install-action@0abfcd587b70a713fdaa7fb502c885e2112acb15 # v2.75.7 + uses: taiki-e/install-action@055f5df8c3f65ea01cd41e9dc855becd88953486 # v2.75.18 with: tool: cargo-nextest From d7c647ff792649460dc3ac38193bcbe6882990da Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Apr 2026 10:31:58 +0800 Subject: [PATCH 34/45] chore(deps): Bump datafusion-sqllogictest from 53.0.0 to 53.1.0 (#2369) --- Cargo.lock | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 66a5599493..d659a6a7f8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2340,9 +2340,9 @@ dependencies = [ [[package]] name = "datafusion-spark" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "923a8b871962a9d860f036f743a20af50ff04729f1da2468ed220dab4f61c97d" +checksum = "e059dcf8544da0d6598d0235be3cc29c209094a5976b2e4822e4a2cf91c2b5c5" dependencies = [ "arrow", "bigdecimal", @@ -2386,9 +2386,9 @@ dependencies = [ [[package]] name = "datafusion-sqllogictest" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a43746bd59e7f2655be4c5553ede4a1ceb1cd34005932fa9e2bd0641c714c46e" +checksum = "04e5a4a7a49143a68936992b6dbb0db44121c635e9992b2482817278f1e69c56" dependencies = [ "arrow", "async-trait", @@ -2412,9 +2412,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "53.0.0" +version = 
"53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5e5656a7e63d51dd3e5af3dbd347ea83bbe993a77c66b854b74961570d16490" +checksum = "98494539a5468979cc42d86c7bc5f0f8cb71ee5c742694c26fc34efdd29dd2e5" dependencies = [ "async-recursion", "async-trait", From 00eb1601f6216f8cb31d19aaf7921eba436a66af Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 27 Apr 2026 14:07:19 +0800 Subject: [PATCH 35/45] chore(deps): Bump zizmorcore/zizmor-action from 0.5.2 to 0.5.3 (#2368) Bumps [zizmorcore/zizmor-action](https://github.com/zizmorcore/zizmor-action) from 0.5.2 to 0.5.3.
Release notes

Sourced from zizmorcore/zizmor-action's releases.

v0.5.3

What's Changed

  • 1.24.0 and 1.24.1 are now available via the action
  • 1.24.1 is now the default version of zizmor used by the action

Full Changelog: https://github.com/zizmorcore/zizmor-action/compare/v0.5.2...v0.5.3

Commits

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=zizmorcore/zizmor-action&package-manager=github_actions&previous-version=0.5.2&new-version=0.5.3)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: blackmwk --- .github/workflows/zizmor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/zizmor.yml b/.github/workflows/zizmor.yml index 313835fcbe..9306853937 100644 --- a/.github/workflows/zizmor.yml +++ b/.github/workflows/zizmor.yml @@ -39,6 +39,6 @@ jobs: persist-credentials: false - name: Run zizmor 🌈 - uses: zizmorcore/zizmor-action@71321a20a9ded102f6e9ce5718a2fcec2c4f70d8 # v0.5.2 + uses: zizmorcore/zizmor-action@b1d7e1fb5de872772f31590499237e7cce841e8e # v0.5.3 with: advanced-security: false From 36ae6460c4792af11169fc984a644989a6a9e4fd Mon Sep 17 00:00:00 2001 From: Kevin Liu Date: Mon, 27 Apr 2026 16:07:59 -0400 Subject: [PATCH 36/45] docs: bump version for python binding (#2377) ## Which issue does this PR close? - Closes #. ## What changes are included in this PR? Include bumping `bindings/python/Cargo.toml` version in release docs ## Are these changes tested? --- website/src/release.md | 1 + 1 file changed, 1 insertion(+) diff --git a/website/src/release.md b/website/src/release.md index 7549b8ef4d..4f4043b8f3 100644 --- a/website/src/release.md +++ b/website/src/release.md @@ -108,6 +108,7 @@ Bump all components' version in the project to the new iceberg version. Please note that this version is the exact version of the release, not the release candidate version. - rust core: bump version in `Cargo.toml` +- python binding: bump version in `bindings/python/Cargo.toml` ### Update docs From 1ad4bfd39319508e79960d16dad1b1cdf965c5f4 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 28 Apr 2026 04:25:27 -0400 Subject: [PATCH 37/45] feat(reader): Add read_with_metrics() for scan I/O metrics (#2349) ## Which issue does this PR close? - Closes #. ## What changes are included in this PR? Add always-on per-scan I/O metrics to `ArrowReader`. 
**Motivation:** Downstream engines need per-scan byte counts for their UIs. For example, DataFusion Comet uses this to populate `bytes_scanned` on its Iceberg scan operator, which flows through to Spark UI via `TaskMetrics.inputMetrics.setBytesRead()`. This must be per-scan, not global. Concurrent scans against the same `FileIO` need independent counters. The approach matches DataFusion's pattern of wrapping `AsyncFileReader` with a counting layer and is storage-backend agnostic. **`ArrowReader::read()` now returns `ScanResult`** - `ScanResult` wraps the record batch stream and `ScanMetrics`. Accessors: `stream()`, `metrics()`, `into_parts()`. - Metrics are always collected. One `fetch_add(Relaxed)` per I/O request, negligible overhead. - Counter is created fresh per `read()` call, so cloned readers get independent metrics. **New file: `crates/iceberg/src/arrow/scan_metrics.rs`** - `CountingFileRead`: generic wrapper that increments a shared `AtomicU64` on each `read()`. - `ScanMetrics`: public handle exposing `bytes_read()`. - `ScanResult`: public struct returned by `ArrowReader::read()`. **`FileRead` blanket impl for `Box<dyn FileRead>`** - Enables generic `CountingFileRead` to wrap the boxed reader returned by `FileIO::reader()`. **Single `open_parquet_file` with counting** - All Parquet opens (data files and delete files) go through the same `open_parquet_file` wrapped with `CountingFileRead`, so `bytes_read` reflects total scan I/O. - `build_parquet_reader()`: shared internals for reader construction and metadata loading. **`FileScanTaskReader` struct (refactor)** - Extracted `process_file_scan_task`'s parameters into a `Clone` struct with a `process(self, task)` method, resolving a `clippy::too_many_arguments` violation. Struct and impl are co-located. **Re-exports** - `ScanMetrics` and `ScanResult` re-exported from `iceberg::arrow` and `iceberg::scan`. ## Are these changes tested? 
`test_scan_metrics_bytes_read` in `reader.rs`: asserts `bytes_read() == 0` before stream consumption (the stream is lazy) and `bytes_read() > 0` after. `test_scan_metrics_includes_delete_file_bytes`: reads the same data file with and without a positional delete file and asserts `bytes_read` is strictly greater when deletes are present. All existing reader and scan tests pass (updated to use `ScanResult::stream()`). --------- Co-authored-by: Claude Opus 4.6 (1M context) Co-authored-by: blackmwk --- .../src/arrow/caching_delete_file_loader.rs | 21 ++- .../iceberg/src/arrow/delete_file_loader.rs | 18 ++- crates/iceberg/src/arrow/mod.rs | 2 + crates/iceberg/src/arrow/reader/pipeline.rs | 148 ++++++++++-------- .../src/arrow/reader/positional_deletes.rs | 3 + crates/iceberg/src/arrow/reader/projection.rs | 10 ++ crates/iceberg/src/arrow/reader/row_filter.rs | 3 + crates/iceberg/src/arrow/scan_metrics.rs | 96 ++++++++++++ crates/iceberg/src/io/file_io.rs | 7 + crates/iceberg/src/scan/mod.rs | 12 +- 10 files changed, 245 insertions(+), 75 deletions(-) create mode 100644 crates/iceberg/src/arrow/scan_metrics.rs diff --git a/crates/iceberg/src/arrow/caching_delete_file_loader.rs b/crates/iceberg/src/arrow/caching_delete_file_loader.rs index ae97534d83..231971fd54 100644 --- a/crates/iceberg/src/arrow/caching_delete_file_loader.rs +++ b/crates/iceberg/src/arrow/caching_delete_file_loader.rs @@ -25,6 +25,7 @@ use tokio::sync::oneshot::{Receiver, channel}; use super::delete_filter::{DeleteFilter, PosDelLoadAction}; use crate::arrow::delete_file_loader::BasicDeleteFileLoader; +use crate::arrow::scan_metrics::ScanMetrics; use crate::arrow::{arrow_primitive_to_literal, arrow_schema_to_schema}; use crate::delete_vector::DeleteVector; use crate::expr::Predicate::AlwaysTrue; @@ -77,13 +78,22 @@ enum ParsedDeleteFileContext { #[allow(unused_variables)] impl CachingDeleteFileLoader { pub(crate) fn new(file_io: FileIO, concurrency_limit_data_files: usize) -> Self { + let scan_metrics = 
ScanMetrics::new(); CachingDeleteFileLoader { - basic_delete_file_loader: BasicDeleteFileLoader::new(file_io), + basic_delete_file_loader: BasicDeleteFileLoader::new(file_io, scan_metrics), concurrency_limit_data_files, delete_filter: DeleteFilter::default(), } } + pub(crate) fn with_scan_metrics(mut self, scan_metrics: ScanMetrics) -> Self { + self.basic_delete_file_loader = BasicDeleteFileLoader::new( + self.basic_delete_file_loader.file_io().clone(), + scan_metrics, + ); + self + } + /// Initiates loading of all deletes for all the specified tasks /// /// Returned future completes once all positional deletes and delete vectors @@ -612,7 +622,8 @@ mod tests { let eq_delete_file_path = setup_write_equality_delete_file_1(table_location); - let basic_delete_file_loader = BasicDeleteFileLoader::new(file_io.clone()); + let basic_delete_file_loader = + BasicDeleteFileLoader::new(file_io.clone(), ScanMetrics::new()); let record_batch_stream = basic_delete_file_loader .parquet_to_batch_stream( &eq_delete_file_path, @@ -808,7 +819,8 @@ mod tests { }; let file_io = FileIO::new_with_fs(); - let basic_delete_file_loader = BasicDeleteFileLoader::new(file_io.clone()); + let basic_delete_file_loader = + BasicDeleteFileLoader::new(file_io.clone(), ScanMetrics::new()); let batch_stream = basic_delete_file_loader .parquet_to_batch_stream( @@ -994,7 +1006,8 @@ mod tests { writer.write(&record_batch).unwrap(); writer.close().unwrap(); - let basic_delete_file_loader = BasicDeleteFileLoader::new(file_io.clone()); + let basic_delete_file_loader = + BasicDeleteFileLoader::new(file_io.clone(), ScanMetrics::new()); let record_batch_stream = basic_delete_file_loader .parquet_to_batch_stream(&path, std::fs::metadata(&path).unwrap().len()) .await diff --git a/crates/iceberg/src/arrow/delete_file_loader.rs b/crates/iceberg/src/arrow/delete_file_loader.rs index 0be62ad496..134b029613 100644 --- a/crates/iceberg/src/arrow/delete_file_loader.rs +++ 
b/crates/iceberg/src/arrow/delete_file_loader.rs @@ -23,6 +23,7 @@ use parquet::arrow::ParquetRecordBatchStreamBuilder; use crate::arrow::ArrowReader; use crate::arrow::reader::ParquetReadOptions; use crate::arrow::record_batch_transformer::RecordBatchTransformerBuilder; +use crate::arrow::scan_metrics::ScanMetrics; use crate::io::FileIO; use crate::scan::{ArrowRecordBatchStream, FileScanTaskDeleteFile}; use crate::spec::{Schema, SchemaRef}; @@ -45,13 +46,22 @@ pub trait DeleteFileLoader { #[derive(Clone, Debug)] pub(crate) struct BasicDeleteFileLoader { file_io: FileIO, + scan_metrics: ScanMetrics, } #[allow(unused_variables)] impl BasicDeleteFileLoader { - pub fn new(file_io: FileIO) -> Self { - BasicDeleteFileLoader { file_io } + pub fn new(file_io: FileIO, scan_metrics: ScanMetrics) -> Self { + BasicDeleteFileLoader { + file_io, + scan_metrics, + } } + + pub(crate) fn file_io(&self) -> &FileIO { + &self.file_io + } + /// Loads a RecordBatchStream for a given datafile. pub(crate) async fn parquet_to_batch_stream( &self, @@ -69,6 +79,7 @@ impl BasicDeleteFileLoader { &self.file_io, file_size_in_bytes, parquet_read_options, + self.scan_metrics.bytes_read_counter(), ) .await?; @@ -137,7 +148,8 @@ mod tests { let table_location = tmp_dir.path(); let file_io = FileIO::new_with_fs(); - let delete_file_loader = BasicDeleteFileLoader::new(file_io.clone()); + let scan_metrics = ScanMetrics::new(); + let delete_file_loader = BasicDeleteFileLoader::new(file_io.clone(), scan_metrics); let file_scan_tasks = setup(table_location); diff --git a/crates/iceberg/src/arrow/mod.rs b/crates/iceberg/src/arrow/mod.rs index 7823320452..bf53633cfc 100644 --- a/crates/iceberg/src/arrow/mod.rs +++ b/crates/iceberg/src/arrow/mod.rs @@ -32,9 +32,11 @@ mod reader; /// RecordBatch projection utilities pub mod record_batch_projector; pub(crate) mod record_batch_transformer; +mod scan_metrics; mod value; pub use reader::*; +pub use scan_metrics::{ScanMetrics, ScanResult}; pub use value::*; /// 
Partition value calculator for computing partition values pub mod partition_value_calculator; diff --git a/crates/iceberg/src/arrow/reader/pipeline.rs b/crates/iceberg/src/arrow/reader/pipeline.rs index 94059fc62b..8ecee294c4 100644 --- a/crates/iceberg/src/arrow/reader/pipeline.rs +++ b/crates/iceberg/src/arrow/reader/pipeline.rs @@ -21,6 +21,7 @@ //! of transformed Arrow `RecordBatch`es. use std::sync::Arc; +use std::sync::atomic::AtomicU64; use futures::{StreamExt, TryStreamExt}; use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; @@ -33,8 +34,9 @@ use super::{ use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; use crate::arrow::int96::coerce_int96_timestamps; use crate::arrow::record_batch_transformer::RecordBatchTransformerBuilder; +use crate::arrow::scan_metrics::{CountingFileRead, ScanMetrics, ScanResult}; use crate::error::Result; -use crate::io::{FileIO, FileMetadata}; +use crate::io::{FileIO, FileMetadata, FileRead}; use crate::metadata_columns::{RESERVED_FIELD_ID_FILE, is_metadata_field}; use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream}; use crate::spec::Datum; @@ -42,32 +44,28 @@ use crate::{Error, ErrorKind}; impl ArrowReader { /// Take a stream of FileScanTasks and reads all the files. - /// Returns a stream of Arrow RecordBatches containing the data from the files - pub fn read(self, tasks: FileScanTaskStream) -> Result { - let file_io = self.file_io.clone(); - let batch_size = self.batch_size; + /// Returns a [`ScanResult`] containing the record batch stream and scan metrics. 
+ pub fn read(self, tasks: FileScanTaskStream) -> Result { let concurrency_limit_data_files = self.concurrency_limit_data_files; - let row_group_filtering_enabled = self.row_group_filtering_enabled; - let row_selection_enabled = self.row_selection_enabled; - let parquet_read_options = self.parquet_read_options; + let scan_metrics = ScanMetrics::new(); + + let task_reader = FileScanTaskReader { + batch_size: self.batch_size, + file_io: self.file_io, + delete_file_loader: self + .delete_file_loader + .with_scan_metrics(scan_metrics.clone()), + row_group_filtering_enabled: self.row_group_filtering_enabled, + row_selection_enabled: self.row_selection_enabled, + parquet_read_options: self.parquet_read_options, + scan_metrics: scan_metrics.clone(), + }; // Fast-path for single concurrency to avoid overhead of try_flatten_unordered let stream: ArrowRecordBatchStream = if concurrency_limit_data_files == 1 { Box::pin( tasks - .and_then(move |task| { - let file_io = file_io.clone(); - - Self::process_file_scan_task( - task, - batch_size, - file_io, - self.delete_file_loader.clone(), - row_group_filtering_enabled, - row_selection_enabled, - parquet_read_options, - ) - }) + .and_then(move |task| task_reader.clone().process(task)) .map_err(|err| { Error::new(ErrorKind::Unexpected, "file scan task generate failed") .with_source(err) @@ -77,19 +75,7 @@ impl ArrowReader { } else { Box::pin( tasks - .map_ok(move |task| { - let file_io = file_io.clone(); - - Self::process_file_scan_task( - task, - batch_size, - file_io, - self.delete_file_loader.clone(), - row_group_filtering_enabled, - row_selection_enabled, - parquet_read_options, - ) - }) + .map_ok(move |task| task_reader.clone().process(task)) .map_err(|err| { Error::new(ErrorKind::Unexpected, "file scan task generate failed") .with_source(err) @@ -99,32 +85,41 @@ impl ArrowReader { ) }; - Ok(stream) + Ok(ScanResult::new(stream, scan_metrics)) } +} - async fn process_file_scan_task( - task: FileScanTask, - batch_size: Option, - 
file_io: FileIO, - delete_file_loader: CachingDeleteFileLoader, - row_group_filtering_enabled: bool, - row_selection_enabled: bool, - parquet_read_options: ParquetReadOptions, - ) -> Result { +/// Per-scan state for processing [`FileScanTask`]s. Created once per +/// [`ArrowReader::read`] call and cloned per task. +#[derive(Clone)] +struct FileScanTaskReader { + batch_size: Option, + file_io: FileIO, + delete_file_loader: CachingDeleteFileLoader, + row_group_filtering_enabled: bool, + row_selection_enabled: bool, + parquet_read_options: ParquetReadOptions, + scan_metrics: ScanMetrics, +} + +impl FileScanTaskReader { + async fn process(self, task: FileScanTask) -> Result { let should_load_page_index = - (row_selection_enabled && task.predicate.is_some()) || !task.deletes.is_empty(); - let mut parquet_read_options = parquet_read_options; + (self.row_selection_enabled && task.predicate.is_some()) || !task.deletes.is_empty(); + let mut parquet_read_options = self.parquet_read_options; parquet_read_options.preload_page_index = should_load_page_index; - let delete_filter_rx = - delete_file_loader.load_deletes(&task.deletes, Arc::clone(&task.schema)); + let delete_filter_rx = self + .delete_file_loader + .load_deletes(&task.deletes, Arc::clone(&task.schema)); // Open the Parquet file once, loading its metadata - let (parquet_file_reader, arrow_metadata) = Self::open_parquet_file( + let (parquet_file_reader, arrow_metadata) = ArrowReader::open_parquet_file( &task.data_file_path, - &file_io, + &self.file_io, task.file_size_in_bytes, parquet_read_options, + self.scan_metrics.bytes_read_counter(), ) .await?; @@ -222,7 +217,7 @@ impl ArrowReader { // - If file has embedded IDs: field-ID-based projection (missing_field_ids=false) // - If name mapping applied: field-ID-based projection (missing_field_ids=true but IDs now match) // - If fallback IDs: position-based projection (missing_field_ids=true) - let projection_mask = Self::get_arrow_projection_mask( + let projection_mask = 
ArrowReader::get_arrow_projection_mask( &project_field_ids_without_metadata, &task.schema, record_batch_stream_builder.parquet_schema(), @@ -255,7 +250,7 @@ impl ArrowReader { let mut record_batch_transformer = record_batch_transformer_builder.build(); - if let Some(batch_size) = batch_size { + if let Some(batch_size) = self.batch_size { record_batch_stream_builder = record_batch_stream_builder.with_batch_size(batch_size); } @@ -296,7 +291,7 @@ impl ArrowReader { // Filter row groups based on byte range from task.start and task.length. // If both start and length are 0, read the entire file (backwards compatibility). if task.start != 0 || task.length != 0 { - let byte_range_filtered_row_groups = Self::filter_row_groups_by_byte_range( + let byte_range_filtered_row_groups = ArrowReader::filter_row_groups_by_byte_range( record_batch_stream_builder.metadata(), task.start, task.length, @@ -305,12 +300,12 @@ impl ArrowReader { } if let Some(predicate) = final_predicate { - let (iceberg_field_ids, field_id_map) = Self::build_field_id_set_and_map( + let (iceberg_field_ids, field_id_map) = ArrowReader::build_field_id_set_and_map( record_batch_stream_builder.parquet_schema(), &predicate, )?; - let row_filter = Self::get_row_filter( + let row_filter = ArrowReader::get_row_filter( &predicate, record_batch_stream_builder.parquet_schema(), &iceberg_field_ids, @@ -318,8 +313,8 @@ impl ArrowReader { )?; record_batch_stream_builder = record_batch_stream_builder.with_row_filter(row_filter); - if row_group_filtering_enabled { - let predicate_filtered_row_groups = Self::get_selected_row_group_indices( + if self.row_group_filtering_enabled { + let predicate_filtered_row_groups = ArrowReader::get_selected_row_group_indices( &predicate, record_batch_stream_builder.metadata(), &field_id_map, @@ -341,8 +336,8 @@ impl ArrowReader { }; } - if row_selection_enabled { - row_selection = Some(Self::get_row_selection_for_filter_predicate( + if self.row_selection_enabled { + row_selection = 
Some(ArrowReader::get_row_selection_for_filter_predicate( &predicate, record_batch_stream_builder.metadata(), &selected_row_group_indices, @@ -358,7 +353,7 @@ impl ArrowReader { let delete_row_selection = { let positional_delete_indexes = positional_delete_indexes.lock().unwrap(); - Self::build_deletes_row_selection( + ArrowReader::build_deletes_row_selection( record_batch_stream_builder.metadata().row_groups(), &selected_row_group_indices, &positional_delete_indexes, @@ -400,18 +395,34 @@ impl ArrowReader { Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream) } +} - /// Opens a Parquet file and loads its metadata, returning both the reader and metadata. - /// The reader can be reused to build a `ParquetRecordBatchStreamBuilder` without - /// reopening the file. +impl ArrowReader { + /// Opens a Parquet file and loads its metadata, wrapping the reader with + /// [`CountingFileRead`] so all I/O is accumulated into `bytes_read`. pub(crate) async fn open_parquet_file( data_file_path: &str, file_io: &FileIO, file_size_in_bytes: u64, parquet_read_options: ParquetReadOptions, + bytes_read: &Arc, ) -> Result<(ArrowFileReader, ArrowReaderMetadata)> { let parquet_file = file_io.new_input(data_file_path)?; - let parquet_reader = parquet_file.reader().await?; + let counting_reader = + CountingFileRead::new(parquet_file.reader().await?, Arc::clone(bytes_read)); + Self::build_parquet_reader( + Box::new(counting_reader), + file_size_in_bytes, + parquet_read_options, + ) + .await + } + + async fn build_parquet_reader( + parquet_reader: Box, + file_size_in_bytes: u64, + parquet_read_options: ParquetReadOptions, + ) -> Result<(ArrowFileReader, ArrowReaderMetadata)> { let mut reader = ArrowFileReader::new( FileMetadata { size: file_size_in_bytes, @@ -497,7 +508,13 @@ mod tests { }; let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - reader.read(tasks).unwrap().try_collect().await.unwrap() + reader + .read(tasks) + .unwrap() + .stream() + 
.try_collect() + .await + .unwrap() } // ArrowWriter cannot write INT96, so we use SerializedFileWriter directly. @@ -748,6 +765,7 @@ mod tests { let result = reader .read(tasks_stream) .unwrap() + .stream() .try_collect::>() .await .unwrap(); diff --git a/crates/iceberg/src/arrow/reader/positional_deletes.rs b/crates/iceberg/src/arrow/reader/positional_deletes.rs index eea031852b..b2993572c5 100644 --- a/crates/iceberg/src/arrow/reader/positional_deletes.rs +++ b/crates/iceberg/src/arrow/reader/positional_deletes.rs @@ -461,6 +461,7 @@ mod tests { let result = reader .read(tasks) .unwrap() + .stream() .try_collect::>() .await .unwrap(); @@ -681,6 +682,7 @@ mod tests { let result = reader .read(tasks) .unwrap() + .stream() .try_collect::>() .await .unwrap(); @@ -895,6 +897,7 @@ mod tests { let result = reader .read(tasks) .unwrap() + .stream() .try_collect::>() .await .unwrap(); diff --git a/crates/iceberg/src/arrow/reader/projection.rs b/crates/iceberg/src/arrow/reader/projection.rs index d3fa00b84b..deae027e14 100644 --- a/crates/iceberg/src/arrow/reader/projection.rs +++ b/crates/iceberg/src/arrow/reader/projection.rs @@ -602,6 +602,7 @@ message schema { let result = reader .read(tasks) .unwrap() + .stream() .try_collect::>() .await .unwrap(); @@ -704,6 +705,7 @@ message schema { let result = reader .read(tasks) .unwrap() + .stream() .try_collect::>() .await .unwrap(); @@ -805,6 +807,7 @@ message schema { let result = reader .read(tasks) .unwrap() + .stream() .try_collect::>() .await .unwrap(); @@ -895,6 +898,7 @@ message schema { let result = reader .read(tasks) .unwrap() + .stream() .try_collect::>() .await .unwrap(); @@ -999,6 +1003,7 @@ message schema { let result = reader .read(tasks) .unwrap() + .stream() .try_collect::>() .await .unwrap(); @@ -1132,6 +1137,7 @@ message schema { let result = reader .read(tasks) .unwrap() + .stream() .try_collect::>() .await .unwrap(); @@ -1232,6 +1238,7 @@ message schema { let result = reader .read(tasks) .unwrap() + 
.stream() .try_collect::>() .await .unwrap(); @@ -1346,6 +1353,7 @@ message schema { let result = reader .read(tasks) .unwrap() + .stream() .try_collect::>() .await .unwrap(); @@ -1488,6 +1496,7 @@ message schema { let result = reader .read(tasks) .unwrap() + .stream() .try_collect::>() .await .unwrap(); @@ -1699,6 +1708,7 @@ message schema { let result = reader .read(tasks) .unwrap() + .stream() .try_collect::>() .await .unwrap(); diff --git a/crates/iceberg/src/arrow/reader/row_filter.rs b/crates/iceberg/src/arrow/reader/row_filter.rs index 52f7260cc6..80432a0437 100644 --- a/crates/iceberg/src/arrow/reader/row_filter.rs +++ b/crates/iceberg/src/arrow/reader/row_filter.rs @@ -241,6 +241,7 @@ mod tests { let result = reader .read(tasks) .unwrap() + .stream() .try_collect::>() .await .unwrap(); @@ -555,6 +556,7 @@ mod tests { .clone() .read(tasks1) .unwrap() + .stream() .try_collect::>() .await .unwrap(); @@ -571,6 +573,7 @@ mod tests { let result2 = reader .read(tasks2) .unwrap() + .stream() .try_collect::>() .await .unwrap(); diff --git a/crates/iceberg/src/arrow/scan_metrics.rs b/crates/iceberg/src/arrow/scan_metrics.rs new file mode 100644 index 0000000000..4331a53fcb --- /dev/null +++ b/crates/iceberg/src/arrow/scan_metrics.rs @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Scan metrics and I/O byte counting for Parquet reads of data files and delete files. + +use std::ops::Range; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; + +use bytes::Bytes; + +use crate::error::Result; +use crate::io::FileRead; +use crate::scan::ArrowRecordBatchStream; + +/// Wraps a [`FileRead`] to count bytes read via a shared atomic counter. +pub(crate) struct CountingFileRead { + inner: F, + bytes_read: Arc, +} + +impl CountingFileRead { + pub(crate) fn new(inner: F, bytes_read: Arc) -> Self { + Self { inner, bytes_read } + } +} + +#[async_trait::async_trait] +impl FileRead for CountingFileRead { + async fn read(&self, range: Range) -> Result { + debug_assert!(range.end >= range.start); + self.bytes_read + .fetch_add(range.end - range.start, Ordering::Relaxed); + self.inner.read(range).await + } +} + +/// Metrics collected during an Iceberg scan. +#[derive(Clone, Debug)] +pub struct ScanMetrics { + bytes_read: Arc, +} + +impl ScanMetrics { + pub(crate) fn new() -> Self { + Self { + bytes_read: Arc::new(AtomicU64::new(0)), + } + } + + pub(crate) fn bytes_read_counter(&self) -> &Arc { + &self.bytes_read + } + + /// Total bytes requested from storage for data files and delete files during this scan. + pub fn bytes_read(&self) -> u64 { + self.bytes_read.load(Ordering::Relaxed) + } +} + +/// Result of [`ArrowReader::read`](super::ArrowReader::read), containing the +/// record batch stream and metrics collected during the scan. +pub struct ScanResult { + stream: ArrowRecordBatchStream, + metrics: ScanMetrics, +} + +impl ScanResult { + pub(crate) fn new(stream: ArrowRecordBatchStream, metrics: ScanMetrics) -> Self { + Self { stream, metrics } + } + + /// Consumes the result, returning only the record batch stream. + pub fn stream(self) -> ArrowRecordBatchStream { + self.stream + } + + /// Returns a reference to the scan metrics. 
+ pub fn metrics(&self) -> &ScanMetrics { + &self.metrics + } +} diff --git a/crates/iceberg/src/io/file_io.rs b/crates/iceberg/src/io/file_io.rs index 594b070e03..227d8f4d5b 100644 --- a/crates/iceberg/src/io/file_io.rs +++ b/crates/iceberg/src/io/file_io.rs @@ -255,6 +255,13 @@ pub trait FileRead: Send + Sync + Unpin + 'static { async fn read(&self, range: Range) -> crate::Result; } +#[async_trait::async_trait] +impl + Send + Sync + Unpin + 'static> FileRead for T { + async fn read(&self, range: Range) -> crate::Result { + self.as_ref().read(range).await + } +} + /// Input file is used for reading from files. #[derive(Debug)] pub struct InputFile { diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 4a1e27bdc1..27f479183a 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -32,6 +32,7 @@ use futures::{SinkExt, StreamExt, TryStreamExt}; pub use task::*; use crate::arrow::ArrowReaderBuilder; +pub use crate::arrow::{ScanMetrics, ScanResult}; use crate::delete_file_index::DeleteFileIndex; use crate::expr::visitors::inclusive_metrics_evaluator::InclusiveMetricsEvaluator; use crate::expr::{Bind, BoundPredicate, Predicate}; @@ -441,7 +442,10 @@ impl TableScan { arrow_reader_builder = arrow_reader_builder.with_batch_size(batch_size); } - arrow_reader_builder.build().read(self.plan_files().await?) + arrow_reader_builder + .build() + .read(self.plan_files().await?) + .map(|result| result.stream()) } /// Returns a reference to the column names of the table scan. 
@@ -1364,13 +1368,15 @@ pub mod tests { let batch_stream = reader .clone() .read(Box::pin(stream::iter(vec![Ok(plan_task.remove(0))]))) - .unwrap(); + .unwrap() + .stream(); let batch_1: Vec<_> = batch_stream.try_collect().await.unwrap(); let reader = ArrowReaderBuilder::new(fixture.table.file_io().clone()).build(); let batch_stream = reader .read(Box::pin(stream::iter(vec![Ok(plan_task.remove(0))]))) - .unwrap(); + .unwrap() + .stream(); let batch_2: Vec<_> = batch_stream.try_collect().await.unwrap(); assert_eq!(batch_1, batch_2); From 88ca8b6fdc2146d48e9bbcc7c7cb1bfda096af5a Mon Sep 17 00:00:00 2001 From: Xander Date: Tue, 28 Apr 2026 10:42:22 +0100 Subject: [PATCH 38/45] feat(encryption) [3/N] Support encryption: KMS (#2339) ## Which issue does this PR close? Part of #2034 ## What changes are included in this PR? Adds the `KeyManagementClient` trait and an in-memory implementation for testing. - `KeyManagementClient` trait mirrors Java's `KeyManagementClient` [interface](https://github.com/apache/iceberg/blob/main/core/src/main/java/org/apache/iceberg/encryption/KeyManagementClient.java). - `MemoryKeyManagementClient`, a testing-only KMS that stores master keys in memory and uses them to wrap/unwrap keys. Supports configurable key sizes and explicit key bytes for cross-language interop tests. ## Are these changes tested? 
Unit tests cover `MemoryKeyManagementClient`. --- crates/iceberg/src/encryption/crypto.rs | 9 +- crates/iceberg/src/encryption/kms/client.rs | 98 +++++++ crates/iceberg/src/encryption/kms/memory.rs | 296 ++++++++++++++++++++ crates/iceberg/src/encryption/kms/mod.rs | 27 ++ crates/iceberg/src/encryption/mod.rs | 8 +- crates/iceberg/src/error.rs | 6 + 6 files changed, 436 insertions(+), 8 deletions(-) create mode 100644 crates/iceberg/src/encryption/kms/client.rs create mode 100644 crates/iceberg/src/encryption/kms/memory.rs create mode 100644 crates/iceberg/src/encryption/kms/mod.rs diff --git a/crates/iceberg/src/encryption/crypto.rs b/crates/iceberg/src/encryption/crypto.rs index 0b34580db8..0f6a9eff43 100644 --- a/crates/iceberg/src/encryption/crypto.rs +++ b/crates/iceberg/src/encryption/crypto.rs @@ -43,7 +43,7 @@ use crate::{Error, ErrorKind, Result}; /// containing `SensitiveBytes` can safely derive or implement `Debug` /// without risk of leaking key material. #[derive(Clone, PartialEq, Eq)] -struct SensitiveBytes(Zeroizing>); +pub struct SensitiveBytes(Zeroizing>); impl SensitiveBytes { /// Wraps the given bytes as sensitive material. @@ -57,13 +57,11 @@ impl SensitiveBytes { } /// Returns the number of bytes. - #[allow(dead_code)] // Encryption work is ongoing so currently unused pub fn len(&self) -> usize { self.0.len() } /// Returns `true` if the byte slice is empty. - #[allow(dead_code)] // Encryption work is ongoing so currently unused pub fn is_empty(&self) -> bool { self.0.is_empty() } @@ -85,9 +83,10 @@ impl fmt::Display for SensitiveBytes { /// /// The Iceberg spec supports 128, 192, and 256-bit keys for AES-GCM. /// See: -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] pub enum AesKeySize { - /// 128-bit AES key (16 bytes) + /// 128-bit AES key (16 bytes). Default per the Iceberg spec. 
+ #[default] Bits128 = 128, /// 192-bit AES key (24 bytes) Bits192 = 192, diff --git a/crates/iceberg/src/encryption/kms/client.rs b/crates/iceberg/src/encryption/kms/client.rs new file mode 100644 index 0000000000..85cd511758 --- /dev/null +++ b/crates/iceberg/src/encryption/kms/client.rs @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Key management client trait for encryption key operations. +//! +//! Mirrors the Java `KeyManagementClient` interface from the Apache Iceberg Java implementation. + +use async_trait::async_trait; + +use crate::Result; +use crate::encryption::SensitiveBytes; + +/// Result of a server-side key generation operation. +/// +/// Returned by [`KeyManagementClient::generate_key`] when the KMS supports +/// atomic key generation and wrapping. +pub struct GeneratedKey { + key: SensitiveBytes, + wrapped_key: Vec, +} + +impl GeneratedKey { + /// Creates a new `GeneratedKey` from plaintext key bytes and wrapped key bytes. + pub fn new(key: SensitiveBytes, wrapped_key: Vec) -> Self { + Self { key, wrapped_key } + } + + /// Returns the plaintext key bytes. Zeroized on drop, redacted in Debug. 
+ pub fn key(&self) -> &SensitiveBytes { + &self.key + } + + /// Returns the wrapped (encrypted) key bytes. + pub fn wrapped_key(&self) -> &[u8] { + &self.wrapped_key + } +} + +/// Pluggable interface for key management systems (AWS KMS, Azure Key Vault, etc.). +#[async_trait] +pub trait KeyManagementClient: Send + Sync + std::fmt::Debug { + /// Wrap (encrypt) a key using a wrapping key managed by the KMS. + async fn wrap_key(&self, key: &[u8], wrapping_key_id: &str) -> Result>; + + /// Unwrap (decrypt) a previously wrapped key. + async fn unwrap_key(&self, wrapped_key: &[u8], wrapping_key_id: &str) + -> Result; + + /// Whether this KMS supports server-side key generation. + /// + /// If `true`, callers can use [`generate_key`](Self::generate_key) for atomic + /// key generation and wrapping, which is more secure than generating a key + /// locally and then wrapping it. + fn supports_key_generation(&self) -> bool; + + /// Generate a new key and wrap it atomically on the server side. + /// + /// This is only supported when [`supports_key_generation`](Self::supports_key_generation) + /// returns `true`. 
+ async fn generate_key(&self, wrapping_key_id: &str) -> Result; +} + +#[async_trait] +impl + Send + Sync + std::fmt::Debug> KeyManagementClient for T { + async fn wrap_key(&self, key: &[u8], wrapping_key_id: &str) -> Result> { + self.as_ref().wrap_key(key, wrapping_key_id).await + } + + async fn unwrap_key( + &self, + wrapped_key: &[u8], + wrapping_key_id: &str, + ) -> Result { + self.as_ref().unwrap_key(wrapped_key, wrapping_key_id).await + } + + fn supports_key_generation(&self) -> bool { + self.as_ref().supports_key_generation() + } + + async fn generate_key(&self, wrapping_key_id: &str) -> Result { + self.as_ref().generate_key(wrapping_key_id).await + } +} diff --git a/crates/iceberg/src/encryption/kms/memory.rs b/crates/iceberg/src/encryption/kms/memory.rs new file mode 100644 index 0000000000..65319831dd --- /dev/null +++ b/crates/iceberg/src/encryption/kms/memory.rs @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! In-memory KMS implementation for testing and development. +//! +//! **WARNING**: This implementation is NOT suitable for production use. +//! Keys are stored in memory only and will be lost when the process exits. 
+ +use std::collections::HashMap; +use std::fmt; +use std::sync::{Arc, RwLock}; + +use async_trait::async_trait; + +use super::KeyManagementClient; +use crate::encryption::{AesGcmCipher, AesKeySize, SecureKey, SensitiveBytes}; +use crate::error::lock_error; +use crate::{Error, ErrorKind, Result}; + +/// In-memory KMS for testing. Not suitable for production use. +/// +/// ``` +/// use iceberg::encryption::KeyManagementClient; +/// use iceberg::encryption::kms::MemoryKeyManagementClient; +/// +/// # async fn example() -> iceberg::Result<()> { +/// let kms = MemoryKeyManagementClient::new(); +/// kms.add_master_key("my-master-key")?; +/// +/// let dek = vec![0u8; 16]; +/// let wrapped = kms.wrap_key(&dek, "my-master-key").await?; +/// let unwrapped = kms.unwrap_key(&wrapped, "my-master-key").await?; +/// assert_eq!(dek.as_slice(), unwrapped.as_bytes()); +/// # Ok(()) +/// # } +/// ``` +#[derive(Clone, Default)] +pub struct MemoryKeyManagementClient { + master_keys: Arc>>, + master_key_size: AesKeySize, +} + +impl fmt::Debug for MemoryKeyManagementClient { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("MemoryKeyManagementClient") + .field("master_key_size", &self.master_key_size) + .field("key_count", &self.key_count()) + .finish() + } +} + +impl MemoryKeyManagementClient { + /// Creates a new in-memory KMS with 128-bit AES keys. + pub fn new() -> Self { + Self::default() + } + + /// Creates a new in-memory KMS with the specified master key size. + pub fn with_master_key_size(master_key_size: AesKeySize) -> Self { + Self { + master_keys: Arc::new(RwLock::new(HashMap::new())), + master_key_size, + } + } + + /// Adds a randomly generated master key with the given ID. + pub fn add_master_key(&self, key_id: impl Into) -> Result<()> { + let key = SecureKey::generate(self.master_key_size); + self.insert_key(key_id.into(), SensitiveBytes::new(key.as_bytes())) + } + + /// Adds a master key with explicit key bytes. 
+ /// + /// Use this to seed the KMS with known key material, e.g. for + /// cross-language integration tests where both Java and Rust must + /// share the same master key bytes. + pub fn add_master_key_bytes( + &self, + key_id: impl Into, + key_bytes: SensitiveBytes, + ) -> Result<()> { + Self::check_key_length(&key_bytes)?; + self.insert_key(key_id.into(), key_bytes) + } + + /// Check the key length is valid by constructing a SecureKey. + fn check_key_length(key_bytes: &SensitiveBytes) -> Result<()> { + SecureKey::new(key_bytes.as_bytes())?; + Ok(()) + } + + fn insert_key(&self, key_id: String, key: SensitiveBytes) -> Result<()> { + let mut keys = self.master_keys.write().map_err(lock_error)?; + + if keys.contains_key(&key_id) { + return Err(Error::new( + ErrorKind::DataInvalid, + format!("Master key already exists: {key_id}"), + )); + } + + keys.insert(key_id, key); + Ok(()) + } + + fn get_master_key(&self, key_id: &str) -> Result { + let keys = self.master_keys.read().map_err(lock_error)?; + + keys.get(key_id).cloned().ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Master key not found: {key_id}"), + ) + }) + } + + /// Number of master keys stored. + pub fn key_count(&self) -> usize { + self.master_keys.read().map(|keys| keys.len()).unwrap_or(0) + } + + /// Whether a master key with the given ID exists. 
+ pub fn has_key(&self, key_id: &str) -> bool { + self.master_keys + .read() + .map(|keys| keys.contains_key(key_id)) + .unwrap_or(false) + } +} + +#[async_trait] +impl KeyManagementClient for MemoryKeyManagementClient { + async fn wrap_key(&self, key: &[u8], wrapping_key_id: &str) -> Result> { + let master_key_bytes = self.get_master_key(wrapping_key_id)?; + let master_key = SecureKey::new(master_key_bytes.as_bytes())?; + let cipher = AesGcmCipher::new(master_key); + + cipher.encrypt(key, None) + } + + async fn unwrap_key( + &self, + wrapped_key: &[u8], + wrapping_key_id: &str, + ) -> Result { + let master_key_bytes = self.get_master_key(wrapping_key_id)?; + let master_key = SecureKey::new(master_key_bytes.as_bytes())?; + let cipher = AesGcmCipher::new(master_key); + + Ok(SensitiveBytes::new(cipher.decrypt(wrapped_key, None)?)) + } + + fn supports_key_generation(&self) -> bool { + false + } + + async fn generate_key(&self, _wrapping_key_id: &str) -> Result { + Err(Error::new( + ErrorKind::FeatureUnsupported, + "MemoryKeyManagementClient does not support server-side key generation", + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_wrap_unwrap_roundtrip() { + let kms = MemoryKeyManagementClient::new(); + kms.add_master_key("master-1").unwrap(); + let dek = vec![0u8; 16]; + + let wrapped = kms.wrap_key(&dek, "master-1").await.unwrap(); + let unwrapped = kms.unwrap_key(&wrapped, "master-1").await.unwrap(); + assert_eq!(unwrapped.as_bytes(), dek.as_slice()); + } + + #[tokio::test] + async fn test_wrap_unknown_key_fails() { + let kms = MemoryKeyManagementClient::new(); + let dek = vec![0u8; 16]; + + let result = kms.wrap_key(&dek, "nonexistent").await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_wrong_master_key_fails_unwrap() { + let kms = MemoryKeyManagementClient::new(); + kms.add_master_key("master-1").unwrap(); + kms.add_master_key("master-2").unwrap(); + let dek = vec![0u8; 16]; + + let wrapped = 
kms.wrap_key(&dek, "master-1").await.unwrap(); + + let result = kms.unwrap_key(&wrapped, "master-2").await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_does_not_support_key_generation() { + let kms = MemoryKeyManagementClient::new(); + assert!(!kms.supports_key_generation()); + + let result = kms.generate_key("master-1").await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_multiple_master_keys() { + let kms = MemoryKeyManagementClient::new(); + kms.add_master_key("master-1").unwrap(); + kms.add_master_key("master-2").unwrap(); + let dek1 = vec![1u8; 16]; + let dek2 = vec![2u8; 16]; + + let wrapped1 = kms.wrap_key(&dek1, "master-1").await.unwrap(); + let wrapped2 = kms.wrap_key(&dek2, "master-2").await.unwrap(); + + let unwrapped1 = kms.unwrap_key(&wrapped1, "master-1").await.unwrap(); + let unwrapped2 = kms.unwrap_key(&wrapped2, "master-2").await.unwrap(); + + assert_eq!(unwrapped1.as_bytes(), dek1.as_slice()); + assert_eq!(unwrapped2.as_bytes(), dek2.as_slice()); + } + + #[tokio::test] + async fn test_add_master_key() { + let kms = MemoryKeyManagementClient::new(); + + kms.add_master_key("my-key").unwrap(); + assert!(kms.has_key("my-key")); + assert_eq!(kms.key_count(), 1); + + let result = kms.add_master_key("my-key"); + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_add_master_key_bytes() { + let kms = MemoryKeyManagementClient::new(); + let key_bytes = SensitiveBytes::new([42u8; 16]); + + kms.add_master_key_bytes("my-key", key_bytes).unwrap(); + assert!(kms.has_key("my-key")); + + let dek = vec![7u8; 16]; + let wrapped = kms.wrap_key(&dek, "my-key").await.unwrap(); + let unwrapped = kms.unwrap_key(&wrapped, "my-key").await.unwrap(); + assert_eq!(unwrapped.as_bytes(), dek.as_slice()); + } + + #[tokio::test] + async fn test_add_master_key_bytes_invalid_length() { + let kms = MemoryKeyManagementClient::new(); + + let result = kms.add_master_key_bytes("my-key", SensitiveBytes::new([0u8; 7])); + 
assert!(result.is_err()); + } + + #[tokio::test] + async fn test_with_master_key_size() { + let kms = MemoryKeyManagementClient::with_master_key_size(AesKeySize::Bits256); + kms.add_master_key("master-256").unwrap(); + + let dek = vec![0u8; 16]; + let wrapped = kms.wrap_key(&dek, "master-256").await.unwrap(); + let unwrapped = kms.unwrap_key(&wrapped, "master-256").await.unwrap(); + assert_eq!(unwrapped.as_bytes(), dek.as_slice()); + } + + #[tokio::test] + async fn test_clone_shares_state() { + let kms1 = MemoryKeyManagementClient::new(); + let kms2 = kms1.clone(); + + kms1.add_master_key("shared-key").unwrap(); + assert!(kms2.has_key("shared-key")); + } +} diff --git a/crates/iceberg/src/encryption/kms/mod.rs b/crates/iceberg/src/encryption/kms/mod.rs new file mode 100644 index 0000000000..160e692550 --- /dev/null +++ b/crates/iceberg/src/encryption/kms/mod.rs @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Key Management System trait and implementations. +//! +//! This module provides the [`KeyManagementClient`] trait for pluggable KMS +//! integration and implementations for different key management systems. 
+ +mod client; +mod memory; + +pub use client::{GeneratedKey, KeyManagementClient}; +pub use memory::MemoryKeyManagementClient; diff --git a/crates/iceberg/src/encryption/mod.rs b/crates/iceberg/src/encryption/mod.rs index 9888a153c7..38edb72f53 100644 --- a/crates/iceberg/src/encryption/mod.rs +++ b/crates/iceberg/src/encryption/mod.rs @@ -17,15 +17,17 @@ //! Encryption module for Apache Iceberg. //! -//! This module provides core cryptographic primitives for encrypting -//! and decrypting data in Iceberg tables. +//! This module provides core cryptographic primitives and key management +//! for encrypting and decrypting data in Iceberg tables. mod crypto; mod file_decryptor; mod file_encryptor; +pub mod kms; mod stream; -pub use crypto::{AesGcmCipher, AesKeySize, SecureKey}; +pub use crypto::{AesGcmCipher, AesKeySize, SecureKey, SensitiveBytes}; pub use file_decryptor::AesGcmFileDecryptor; pub use file_encryptor::AesGcmFileEncryptor; +pub use kms::{GeneratedKey, KeyManagementClient}; pub use stream::{AesGcmFileRead, AesGcmFileWrite}; diff --git a/crates/iceberg/src/error.rs b/crates/iceberg/src/error.rs index a0399a8082..02c3eee8fc 100644 --- a/crates/iceberg/src/error.rs +++ b/crates/iceberg/src/error.rs @@ -18,6 +18,7 @@ use std::backtrace::{Backtrace, BacktraceStatus}; use std::fmt; use std::fmt::{Debug, Display, Formatter}; +use std::sync::PoisonError; use chrono::{DateTime, TimeZone as _, Utc}; @@ -422,6 +423,11 @@ define_from_err!( define_from_err!(std::io::Error, ErrorKind::Unexpected, "IO Operation failed"); +/// Converts a [`PoisonError`] from a poisoned lock into an [`Error`]. +pub(crate) fn lock_error(e: PoisonError) -> Error { + Error::new(ErrorKind::Unexpected, format!("Lock poisoned: {e}")) +} + /// Converts a timestamp in milliseconds to `DateTime`, handling errors. 
/// /// # Arguments From 40cc78767eaaf5c2a7463cc28212b8a0972ad629 Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Tue, 28 Apr 2026 18:23:18 -0700 Subject: [PATCH 39/45] chore: Improve release process (#2381) ## Which issue does this PR close? - Closes #. ## What changes are included in this PR? - Improve release process doc ## Are these changes tested? not needed --- website/src/release.md | 51 ++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 24 deletions(-) diff --git a/website/src/release.md b/website/src/release.md index 4f4043b8f3..79c2bca8a0 100644 --- a/website/src/release.md +++ b/website/src/release.md @@ -160,6 +160,7 @@ dist ├── apache-iceberg-rust-0.2.0.tar.gz.asc └── apache-iceberg-rust-0.2.0.tar.gz.sha512 ``` +It is recommended to verify the artifacts yourself before uploading them to the SVN dist repo; see [How to verify a release](#how-to-verify-a-release). ### Upload artifacts to the SVN dist repo @@ -176,7 +177,9 @@ svn co https://dist.apache.org/repos/dist/dev/iceberg/ /tmp/iceberg-dist-dev Then, upload the artifacts: -> The `${release_version}` here should be like `0.2.0-rc.1` +> The `${release_version}` here should be like `0.2.0-rc1` + +An example of uploaded artifacts can be found at: https://dist.apache.org/repos/dist/dev/iceberg/apache-iceberg-rust-0.9.1-rc3/ ```shell # create a directory named by version @@ -190,7 +193,8 @@ cd /tmp/iceberg-dist-dev/ # check svn status svn status - +``` +```shell # add to svn svn add apache-iceberg-rust-${release_version} @@ -220,11 +224,11 @@ Title: Content: ``` -Hello, Apache Iceberg Rust Community, +Hello Apache Iceberg Rust Community, -This is a call for a vote to release Apache Iceberg rust version ${iceberg_version}. +This is a call for a vote to release Apache Iceberg Rust version ${iceberg_version}. -The tag to be voted on is v${release_version}. +The tag to be voted on is: v${release_version}.
The release candidate: @@ -238,30 +242,30 @@ Git tag for the release: https://github.com/apache/iceberg-rust/releases/tag/v${release_version} -Please download, verify, and test. +Please download, verify, and test the release candidate. -The VOTE will be open for at least 72 hours and until the necessary -number of votes are reached. +This vote will be open for at least 72 hours and will remain open until the required number of votes is reached. -[ ] +1 approve -[ ] +0 no opinion -[ ] -1 disapprove with the reason +Please vote accordingly: +[ ] +1 Approve +[ ] +0 No opinion +[ ] -1 Disapprove (please provide a reason) -To learn more about Apache Iceberg, please see https://rust.iceberg.apache.org/ +To learn more about Apache Iceberg, please visit: +https://rust.iceberg.apache.org/ Checklist for reference: - -[ ] Download links are valid. -[ ] Checksums and signatures. -[ ] LICENSE/NOTICE files exist -[ ] No unexpected binary files +[ ] Download links are valid +[ ] Checksums and signatures are correct +[ ] LICENSE and NOTICE files are present +[ ] No unexpected binary files are included [ ] All source files have ASF headers -[ ] Can compile from source +[ ] The project builds successfully from source -More details please refer to https://rust.iceberg.apache.org/release.html#how-to-verify-a-release. - -Thanks +For more details, please refer to: +https://rust.iceberg.apache.org/release.html#how-to-verify-a-release +Thanks, ${name} ``` @@ -278,7 +282,7 @@ Title: Content: ``` -Hello, Apache Iceberg Rust Community, +Hello Apache Iceberg Rust Community, The vote to release Apache Iceberg Rust ${release_version} has passed. 
@@ -296,8 +300,7 @@ Non-Binding votes: Vote thread: ${vote_thread_url} -Thanks - +Thanks, ${name} ``` From 0d97497bb9dd46a6e3c3304e7c3855384b0119f2 Mon Sep 17 00:00:00 2001 From: Matt Butrovich Date: Tue, 28 Apr 2026 21:24:14 -0400 Subject: [PATCH 40/45] docs(arrow): clarify ScanMetrics::bytes_read includes delete files (#2379) ## Which issue does this PR close? - Closes #. ## What changes are included in this PR? Fix the `ScanMetrics::bytes_read` docstring. The counter is shared with `CachingDeleteFileLoader`, so it already includes delete-file reads, not just data files. Docstring now reflects that. ## Are these changes tested? Docstring-only change, no behavior change, so no new tests. --- crates/iceberg/src/arrow/scan_metrics.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/arrow/scan_metrics.rs b/crates/iceberg/src/arrow/scan_metrics.rs index 4331a53fcb..642190c57d 100644 --- a/crates/iceberg/src/arrow/scan_metrics.rs +++ b/crates/iceberg/src/arrow/scan_metrics.rs @@ -66,7 +66,7 @@ impl ScanMetrics { &self.bytes_read } - /// Total bytes read from storage for data files during this scan. + /// Total bytes read from storage during this scan, including data files and delete files. pub fn bytes_read(&self) -> u64 { self.bytes_read.load(Ordering::Relaxed) } From 7290ef0b831a13d3f626f2f5f5315e5de59b8dee Mon Sep 17 00:00:00 2001 From: Xander Date: Wed, 29 Apr 2026 09:29:57 +0100 Subject: [PATCH 41/45] feat(encryption) [4/N] Support encryption: StandardKeyMetadata (#2340) ## Which issue does this PR close? Part of #2034 ## What changes are included in this PR? Adds `StandardKeyMetadata`, the Avro-serialized key metadata format that lives inside Iceberg's `key_metadata` binary fields (data file field 131, manifest list field 519). The Iceberg spec leaves the format of these bytes as ["implementation-specific"](https://iceberg.apache.org/spec/#data-file-fields). 
This implementation is byte-compatible with Java's `org.apache.iceberg.encryption.StandardKeyMetadata` for cross-language interop. ## Are these changes tested? Yes --------- Co-authored-by: Claude Opus 4.6 (1M context) --- crates/iceberg/src/encryption/key_metadata.rs | 286 ++++++++++++++++++ crates/iceberg/src/encryption/mod.rs | 2 + 2 files changed, 288 insertions(+) create mode 100644 crates/iceberg/src/encryption/key_metadata.rs diff --git a/crates/iceberg/src/encryption/key_metadata.rs b/crates/iceberg/src/encryption/key_metadata.rs new file mode 100644 index 0000000000..4ef66ce394 --- /dev/null +++ b/crates/iceberg/src/encryption/key_metadata.rs @@ -0,0 +1,286 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Avro-serialized key metadata format compatible with Java's +//! `org.apache.iceberg.encryption.StandardKeyMetadata`. + +use std::fmt; + +use super::SensitiveBytes; +use crate::{Error, ErrorKind, Result}; + +/// Standard key metadata for Iceberg table encryption. +/// +/// Contains the Data Encryption Key (DEK), AAD prefix, and optional file +/// length. Byte-compatible with Java's `StandardKeyMetadata` via Avro +/// serialization. 
+/// +/// Wire format: `[version byte (0x01)] [Avro binary datum]` +#[derive(Clone, PartialEq, Eq)] +pub struct StandardKeyMetadata { + encryption_key: SensitiveBytes, + aad_prefix: Option>, + file_length: Option, +} + +impl fmt::Debug for StandardKeyMetadata { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("StandardKeyMetadata") + .field("encryption_key", &self.encryption_key) + .field( + "aad_prefix", + &self + .aad_prefix + .as_ref() + .map(|b| format!("[{} bytes]", b.len())), + ) + .field("file_length", &self.file_length) + .finish() + } +} + +impl StandardKeyMetadata { + /// Creates a new `StandardKeyMetadata`. + pub fn new(encryption_key: &[u8]) -> Self { + Self { + encryption_key: SensitiveBytes::new(encryption_key), + aad_prefix: None, + file_length: None, + } + } + + /// Adds an AAD prefix. + pub fn with_aad_prefix(mut self, aad_prefix: &[u8]) -> Self { + self.aad_prefix = Some(aad_prefix.into()); + self + } + + /// Adds a file length. + pub fn with_file_length(mut self, length: u64) -> Self { + self.file_length = Some(length); + self + } + + /// Returns the plaintext Data Encryption Key. + pub fn encryption_key(&self) -> &SensitiveBytes { + &self.encryption_key + } + + /// Returns the AAD prefix. + pub fn aad_prefix(&self) -> Option<&[u8]> { + self.aad_prefix.as_deref() + } + + /// Returns the optional file length. + pub fn file_length(&self) -> Option { + self.file_length + } + + /// Encodes to Java-compatible format: `[0x01] [Avro binary datum]` + pub fn encode(&self) -> Result> { + _serde::StandardKeyMetadataV1::from(self).encode() + } + + /// Decodes from Java-compatible format. 
+ pub fn decode(bytes: &[u8]) -> Result { + _serde::StandardKeyMetadataV1::decode(bytes).map(Self::from) + } +} + +mod _serde { + use std::io::Cursor; + use std::sync::{Arc, LazyLock}; + + use apache_avro::{Schema as AvroSchema, from_avro_datum, from_value, to_avro_datum, to_value}; + use serde::{Deserialize, Serialize}; + + use super::*; + use crate::avro::schema_to_avro_schema; + use crate::spec::{NestedField, PrimitiveType, Schema, Type}; + + pub(super) const V1: u8 = 1; + + /// Avro schema for StandardKeyMetadata V1, derived from Iceberg schema. + pub(super) static AVRO_SCHEMA_V1: LazyLock = LazyLock::new(|| { + let schema = Schema::builder() + .with_fields(vec![ + Arc::new(NestedField::required( + 0, + "encryption_key", + Type::Primitive(PrimitiveType::Binary), + )), + Arc::new(NestedField::optional( + 1, + "aad_prefix", + Type::Primitive(PrimitiveType::Binary), + )), + Arc::new(NestedField::optional( + 2, + "file_length", + Type::Primitive(PrimitiveType::Long), + )), + ]) + .build() + .expect("Failed to build StandardKeyMetadata Iceberg schema"); + + schema_to_avro_schema("StandardKeyMetadata", &schema) + .expect("Failed to convert StandardKeyMetadata to Avro schema") + }); + + /// Serde struct for Avro serialization of [`StandardKeyMetadata`] V1. + /// Field names must match [`AVRO_SCHEMA_V1`] exactly. 
+ #[derive(Serialize, Deserialize)] + pub(super) struct StandardKeyMetadataV1 { + pub encryption_key: serde_bytes::ByteBuf, + pub aad_prefix: Option, + pub file_length: Option, + } + + impl StandardKeyMetadataV1 { + pub(super) fn encode(&self) -> Result> { + let value = to_value(self) + .and_then(|v| v.resolve(&AVRO_SCHEMA_V1)) + .map_err(|e| { + Error::new(ErrorKind::Unexpected, "Failed to encode key metadata") + .with_source(e) + })?; + + let datum = to_avro_datum(&AVRO_SCHEMA_V1, value).map_err(|e| { + Error::new(ErrorKind::Unexpected, "Failed to encode key metadata").with_source(e) + })?; + + let mut result = Vec::with_capacity(1 + datum.len()); + result.push(V1); + result.extend_from_slice(&datum); + Ok(result.into_boxed_slice()) + } + + pub(super) fn decode(bytes: &[u8]) -> Result { + if bytes.is_empty() { + return Err(Error::new( + ErrorKind::DataInvalid, + "Empty key metadata buffer", + )); + } + + let version = bytes[0]; + if version != V1 { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + format!("Cannot resolve schema for version: {version}"), + )); + } + + let mut reader = Cursor::new(&bytes[1..]); + let value = from_avro_datum(&AVRO_SCHEMA_V1, &mut reader, None).map_err(|e| { + Error::new(ErrorKind::DataInvalid, "Failed to decode key metadata").with_source(e) + })?; + + from_value(&value).map_err(|e| { + Error::new( + ErrorKind::DataInvalid, + "Failed to decode key metadata fields", + ) + .with_source(e) + }) + } + } + + impl From<&StandardKeyMetadata> for StandardKeyMetadataV1 { + fn from(metadata: &StandardKeyMetadata) -> Self { + Self { + encryption_key: serde_bytes::ByteBuf::from(metadata.encryption_key.as_bytes()), + aad_prefix: metadata + .aad_prefix + .as_ref() + .map(|b| serde_bytes::ByteBuf::from(b.as_ref())), + file_length: metadata.file_length, + } + } + } + + impl From for StandardKeyMetadata { + fn from(v1: StandardKeyMetadataV1) -> Self { + Self { + encryption_key: SensitiveBytes::new(v1.encryption_key.into_vec()), + 
aad_prefix: v1.aad_prefix.map(|b| b.into_vec().into_boxed_slice()), + file_length: v1.file_length, + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_roundtrip() { + let key = b"0123456789012345"; + let aad = b"1234567890123456"; + + let metadata = StandardKeyMetadata::new(key).with_aad_prefix(aad); + let serialized = metadata.encode().unwrap(); + let parsed = StandardKeyMetadata::decode(&serialized).unwrap(); + + assert_eq!(parsed.encryption_key().as_bytes(), key); + assert_eq!(parsed.aad_prefix(), Some(aad.as_slice())); + assert_eq!(parsed.file_length(), None); + } + + #[test] + fn test_roundtrip_with_length() { + let key = b"0123456789012345"; + let aad = b"1234567890123456"; + + let file_length = 100_000; + let metadata = StandardKeyMetadata::new(key) + .with_aad_prefix(aad) + .with_file_length(file_length); + let serialized = metadata.encode().unwrap(); + let parsed = StandardKeyMetadata::decode(&serialized).unwrap(); + + assert_eq!(parsed.encryption_key().as_bytes(), key); + assert_eq!(parsed.aad_prefix(), Some(aad.as_slice())); + assert_eq!(parsed.file_length(), Some(file_length)); + } + + #[test] + fn test_unsupported_version() { + let result = StandardKeyMetadata::decode(&[0x02]); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.kind(), ErrorKind::FeatureUnsupported); + } + + #[test] + fn test_empty_buffer() { + let result = StandardKeyMetadata::decode(&[]); + assert!(result.is_err()); + assert_eq!(result.unwrap_err().kind(), ErrorKind::DataInvalid); + } + + #[test] + fn test_roundtrip_without_aad() { + let metadata = StandardKeyMetadata::new(&[1, 2, 3, 4]); + let serialized = metadata.encode().unwrap(); + let parsed = StandardKeyMetadata::decode(&serialized).unwrap(); + + assert_eq!(parsed.encryption_key().as_bytes(), &[1, 2, 3, 4]); + assert_eq!(parsed.aad_prefix(), None); + } +} diff --git a/crates/iceberg/src/encryption/mod.rs b/crates/iceberg/src/encryption/mod.rs index 38edb72f53..773d781d6d 
100644 --- a/crates/iceberg/src/encryption/mod.rs +++ b/crates/iceberg/src/encryption/mod.rs @@ -23,11 +23,13 @@ mod crypto; mod file_decryptor; mod file_encryptor; +pub(crate) mod key_metadata; pub mod kms; mod stream; pub use crypto::{AesGcmCipher, AesKeySize, SecureKey, SensitiveBytes}; pub use file_decryptor::AesGcmFileDecryptor; pub use file_encryptor::AesGcmFileEncryptor; +pub use key_metadata::StandardKeyMetadata; pub use kms::{GeneratedKey, KeyManagementClient}; pub use stream::{AesGcmFileRead, AesGcmFileWrite}; From 7be70373bcbe542c98f719f9475fd4f731ffa11a Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 29 Apr 2026 12:15:52 +0200 Subject: [PATCH 42/45] Reintroduce spawn --- crates/iceberg/src/arrow/reader/pipeline.rs | 44 ++++++++++++++++----- 1 file changed, 34 insertions(+), 10 deletions(-) diff --git a/crates/iceberg/src/arrow/reader/pipeline.rs b/crates/iceberg/src/arrow/reader/pipeline.rs index 5019b98097..7bb7feb9d3 100644 --- a/crates/iceberg/src/arrow/reader/pipeline.rs +++ b/crates/iceberg/src/arrow/reader/pipeline.rs @@ -23,13 +23,15 @@ use std::sync::atomic::AtomicU64; use std::sync::{Arc, Mutex}; +use arrow_array::RecordBatch; +use futures::channel::mpsc::channel; use futures::{StreamExt, TryStreamExt}; use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, ParquetRecordBatchStreamBuilder}; use super::{ ArrowFileReader, ArrowReader, ParquetReadOptions, add_fallback_field_ids_to_arrow_schema, - apply_name_mapping_to_arrow_schema, + apply_name_mapping_to_arrow_schema, process_record_batch_stream, }; use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; use crate::arrow::int96::coerce_int96_timestamps; @@ -43,6 +45,7 @@ use crate::io::{FileIO, FileMetadata, FileRead}; use crate::metadata_columns::{ RESERVED_FIELD_ID_FILE, RESERVED_FIELD_ID_POS, is_metadata_field, row_pos_field, }; +use crate::runtime::spawn; use crate::scan::{ArrowRecordBatchStream, 
FileScanTask, FileScanTaskStream}; use crate::spec::{Datum, NameMapping, PartitionSpec, SchemaRef, Struct}; use crate::{Error, ErrorKind}; @@ -78,16 +81,37 @@ impl ArrowReader { .try_flatten(), ) } else { - Box::pin( - tasks - .map_ok(move |task| task_reader.clone().process(task)) - .map_err(|err| { - Error::new(ErrorKind::Unexpected, "file scan task generate failed") - .with_source(err) + // Multi-concurrency path: spawn each file's IO-heavy processing as an independent + // tokio task for true parallelism, streaming results through a channel. + let (tx, rx) = channel::>(concurrency_limit_data_files); + + // Outer spawn: runs the task coordination loop without blocking the caller. + spawn(async move { + let _ = tasks + .try_for_each_concurrent(concurrency_limit_data_files, |task| { + let task_reader = task_reader.clone(); + let tx = tx.clone(); + + async move { + // Inner spawn: each file's IO operations run on their own tokio task. + spawn(async move { + let record_batch_stream = task_reader.process(task).await; + process_record_batch_stream( + record_batch_stream, + tx, + "failed to read record batch", + ) + .await; + }) + .await; + + Ok(()) + } }) - .try_buffer_unordered(concurrency_limit_data_files) - .try_flatten_unordered(concurrency_limit_data_files), - ) + .await; + }); + + Box::pin(rx) as ArrowRecordBatchStream }; Ok(ScanResult::new(stream, scan_metrics)) From 2bc7125ee9e547ea4a20b7bd0071941776f23552 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 29 Apr 2026 12:48:15 +0200 Subject: [PATCH 43/45] fix(azdls): restore configured_scheme validation lost in silent merge The merge from upstream silently dropped our fork's AzureStorageScheme validation in azdls_create_operator / match_path_with_config because our main hadn't touched those files since the common ancestor. Restore configured_scheme as Option in OpenDalStorage::Azdls (optional to support the resolving storage, which auto-detects the scheme from the path and passes None). 
Direct usage via OpenDalStorageFactory::Azdls still requires an explicit scheme and wraps it in Some(...) when building the storage. Also restore the 3-element test tuples in test_azdls_create_operator that upstream's new test cases were missing. Co-Authored-By: Claude Sonnet 4.6 --- crates/storage/opendal/src/azdls.rs | 35 ++++++++++++++++++++----- crates/storage/opendal/src/lib.rs | 32 ++++++++++++++++------ crates/storage/opendal/src/resolving.rs | 1 + 3 files changed, 54 insertions(+), 14 deletions(-) diff --git a/crates/storage/opendal/src/azdls.rs b/crates/storage/opendal/src/azdls.rs index ce9a94d88d..c1d61c6b5a 100644 --- a/crates/storage/opendal/src/azdls.rs +++ b/crates/storage/opendal/src/azdls.rs @@ -121,12 +121,16 @@ pub(crate) fn azdls_config_parse(mut properties: HashMap) -> Res /// /// The path is expected to include the scheme in a format like: /// `abfss://@.dfs.core.windows.net/mydir/myfile.parquet`. +/// +/// When `configured_scheme` is `None`, scheme validation is skipped (used by +/// the resolving storage, which auto-detects the scheme from the path). pub(crate) fn azdls_create_operator<'a>( absolute_path: &'a str, config: &AzdlsConfig, + configured_scheme: Option<&AzureStorageScheme>, ) -> Result<(opendal::Operator, &'a str)> { let path = absolute_path.parse::()?; - match_path_with_config(&path, config)?; + match_path_with_config(&path, config, configured_scheme)?; let op = azdls_config_build(config, &path)?; @@ -192,7 +196,22 @@ impl FromStr for AzureStorageScheme { } /// Validates whether the given path matches what's configured for the backend. -pub(crate) fn match_path_with_config(path: &AzureStoragePath, config: &AzdlsConfig) -> Result<()> { +/// +/// When `configured_scheme` is `None`, scheme validation is skipped. 
+pub(crate) fn match_path_with_config( + path: &AzureStoragePath, + config: &AzdlsConfig, + configured_scheme: Option<&AzureStorageScheme>, +) -> Result<()> { + if let Some(configured_scheme) = configured_scheme { + ensure_data_valid!( + &path.scheme == configured_scheme, + "Storage::Azdls: Scheme mismatch: configured {}, passed {}", + configured_scheme, + path.scheme + ); + } + if let Some(ref configured_account_name) = config.account_name { ensure_data_valid!( &path.account_name == configured_account_name, @@ -506,6 +525,7 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, + AzureStorageScheme::Abfss, ), Some(("myfs", "/path/to/file.parquet")), ), @@ -518,6 +538,7 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, + AzureStorageScheme::Abfss, ), None, ), @@ -531,6 +552,7 @@ mod tests { endpoint: Some("http://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, + AzureStorageScheme::Abfss, ), None, ), @@ -543,6 +565,7 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.chinacloudapi.cn".to_string()), ..Default::default() }, + AzureStorageScheme::Abfss, ), None, ), @@ -556,20 +579,20 @@ mod tests { endpoint: None, ..Default::default() }, + AzureStorageScheme::Abfs, ), Some(("myfs", "/path/to/file.parquet")), ), ( - "scheme differs from a previously-configured one is accepted", + "scheme differs from configured scheme", ( - // No configured scheme exists anymore; both abfss and wasbs - // should be accepted by the same storage. 
"wasbs://myfs@myaccount.blob.core.windows.net/path/to/file.parquet", AzdlsConfig { account_name: Some("myaccount".to_string()), endpoint: Some("https://myaccount.blob.core.windows.net".to_string()), ..Default::default() }, + AzureStorageScheme::Wasbs, ), Some(("myfs", "/path/to/file.parquet")), ), @@ -590,7 +613,7 @@ mod tests { ]; for (name, input, expected) in test_cases { - let result = azdls_create_operator(input.0, &input.1); + let result = azdls_create_operator(input.0, &input.1, Some(&input.2)); match expected { Some((expected_filesystem, expected_path)) => { assert!(result.is_ok(), "Test case {name} failed: {result:?}"); diff --git a/crates/storage/opendal/src/lib.rs b/crates/storage/opendal/src/lib.rs index a0336868e3..3e29f40388 100644 --- a/crates/storage/opendal/src/lib.rs +++ b/crates/storage/opendal/src/lib.rs @@ -119,7 +119,10 @@ pub enum OpenDalStorageFactory { Oss, /// Azure Data Lake Storage factory. #[cfg(feature = "opendal-azdls")] - Azdls, + Azdls { + /// The configured Azure storage scheme. + configured_scheme: AzureStorageScheme, + }, } #[typetag::serde(name = "OpenDalStorageFactory")] @@ -149,9 +152,12 @@ impl StorageFactory for OpenDalStorageFactory { config: oss_config_parse(config.props().clone())?.into(), })), #[cfg(feature = "opendal-azdls")] - OpenDalStorageFactory::Azdls => Ok(Arc::new(OpenDalStorage::Azdls { - config: azdls_config_parse(config.props().clone())?.into(), - })), + OpenDalStorageFactory::Azdls { configured_scheme } => { + Ok(Arc::new(OpenDalStorage::Azdls { + configured_scheme: Some(configured_scheme.clone()), + config: azdls_config_parse(config.props().clone())?.into(), + })) + } #[cfg(all( not(feature = "opendal-memory"), not(feature = "opendal-fs"), @@ -212,9 +218,12 @@ pub enum OpenDalStorage { /// Accepts paths of the form /// `abfs[s]://@.dfs./` or /// `wasb[s]://@.blob./`. - /// The scheme is derived from the path at call time. #[cfg(feature = "opendal-azdls")] Azdls { + /// The expected Azure storage scheme. 
When set, paths must use this + /// exact scheme; `None` disables scheme validation (used by the + /// resolving storage, which auto-detects the scheme from the path). + configured_scheme: Option, /// Azure DLS configuration. config: Arc, }, @@ -310,7 +319,10 @@ impl OpenDalStorage { } } #[cfg(feature = "opendal-azdls")] - OpenDalStorage::Azdls { config } => azdls_create_operator(path, config)?, + OpenDalStorage::Azdls { + configured_scheme, + config, + } => azdls_create_operator(path, config, configured_scheme.as_ref())?, #[cfg(all( not(feature = "opendal-s3"), not(feature = "opendal-fs"), @@ -402,9 +414,12 @@ impl OpenDalStorage { } } #[cfg(feature = "opendal-azdls")] - OpenDalStorage::Azdls { config } => { + OpenDalStorage::Azdls { + configured_scheme, + config, + } => { let azure_path = path.parse::()?; - match_path_with_config(&azure_path, config)?; + match_path_with_config(&azure_path, config, configured_scheme.as_ref())?; let relative_path_len = azure_path.path.len(); Ok(&path[path.len() - relative_path_len..]) } @@ -693,6 +708,7 @@ mod tests { #[test] fn test_relativize_path_azdls() { let storage = OpenDalStorage::Azdls { + configured_scheme: Some(AzureStorageScheme::Abfss), config: Arc::new(AzdlsConfig { account_name: Some("myaccount".to_string()), endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), diff --git a/crates/storage/opendal/src/resolving.rs b/crates/storage/opendal/src/resolving.rs index 64a16b18d2..0c04108347 100644 --- a/crates/storage/opendal/src/resolving.rs +++ b/crates/storage/opendal/src/resolving.rs @@ -114,6 +114,7 @@ fn build_storage_for_scheme( Scheme::Azdls => { let config = crate::azdls::azdls_config_parse(props.clone())?; Ok(OpenDalStorage::Azdls { + configured_scheme: None, config: Arc::new(config), }) } From 39cc6d9ebc40f217d75912f1fbb44fbbc2e5e7e1 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 29 Apr 2026 12:56:27 +0200 Subject: [PATCH 44/45] test(azdls): restore scheme-mismatch rejection test Re-adds 
the test case that was removed by upstream in #2338 when they dropped configured_scheme entirely. Since we kept the scheme validation, this test should exist to cover it. Co-Authored-By: Claude Sonnet 4.6 --- crates/storage/opendal/src/azdls.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/crates/storage/opendal/src/azdls.rs b/crates/storage/opendal/src/azdls.rs index c1d61c6b5a..0bd0b96541 100644 --- a/crates/storage/opendal/src/azdls.rs +++ b/crates/storage/opendal/src/azdls.rs @@ -596,6 +596,19 @@ mod tests { ), Some(("myfs", "/path/to/file.parquet")), ), + ( + "different scheme is rejected", + ( + "wasbs://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet", + AzdlsConfig { + account_name: Some("myaccount".to_string()), + endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), + ..Default::default() + }, + AzureStorageScheme::Abfss, + ), + None, + ), ( "azurite endpoint with explicit configuration", ( From 721097035e564e0fe66b3bf1f63450449776e4e7 Mon Sep 17 00:00:00 2001 From: Gerald Berger Date: Wed, 29 Apr 2026 13:01:59 +0200 Subject: [PATCH 45/45] refactor(azdls): drop configured_scheme from opendal crate storage impls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The scheme check added no real value — Azure accepts any scheme variant against the same endpoint. Upstream removed it in #2338 for the same reason. Dropping it eliminates the Option workaround in OpenDalStorage::Azdls and simplifies OpenDalStorageFactory::Azdls back to a unit variant. 
Co-Authored-By: Claude Sonnet 4.6 --- crates/storage/opendal/src/azdls.rs | 47 +++---------------------- crates/storage/opendal/src/lib.rs | 31 ++++------------ crates/storage/opendal/src/resolving.rs | 1 - 3 files changed, 11 insertions(+), 68 deletions(-) diff --git a/crates/storage/opendal/src/azdls.rs b/crates/storage/opendal/src/azdls.rs index 0bd0b96541..bab414a95c 100644 --- a/crates/storage/opendal/src/azdls.rs +++ b/crates/storage/opendal/src/azdls.rs @@ -121,16 +121,12 @@ pub(crate) fn azdls_config_parse(mut properties: HashMap) -> Res /// /// The path is expected to include the scheme in a format like: /// `abfss://@.dfs.core.windows.net/mydir/myfile.parquet`. -/// -/// When `configured_scheme` is `None`, scheme validation is skipped (used by -/// the resolving storage, which auto-detects the scheme from the path). pub(crate) fn azdls_create_operator<'a>( absolute_path: &'a str, config: &AzdlsConfig, - configured_scheme: Option<&AzureStorageScheme>, ) -> Result<(opendal::Operator, &'a str)> { let path = absolute_path.parse::()?; - match_path_with_config(&path, config, configured_scheme)?; + match_path_with_config(&path, config)?; let op = azdls_config_build(config, &path)?; @@ -196,22 +192,7 @@ impl FromStr for AzureStorageScheme { } /// Validates whether the given path matches what's configured for the backend. -/// -/// When `configured_scheme` is `None`, scheme validation is skipped. 
-pub(crate) fn match_path_with_config( - path: &AzureStoragePath, - config: &AzdlsConfig, - configured_scheme: Option<&AzureStorageScheme>, -) -> Result<()> { - if let Some(configured_scheme) = configured_scheme { - ensure_data_valid!( - &path.scheme == configured_scheme, - "Storage::Azdls: Scheme mismatch: configured {}, passed {}", - configured_scheme, - path.scheme - ); - } - +pub(crate) fn match_path_with_config(path: &AzureStoragePath, config: &AzdlsConfig) -> Result<()> { if let Some(ref configured_account_name) = config.account_name { ensure_data_valid!( &path.account_name == configured_account_name, @@ -525,7 +506,6 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, ), Some(("myfs", "/path/to/file.parquet")), ), @@ -538,7 +518,6 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, ), None, ), @@ -552,7 +531,6 @@ mod tests { endpoint: Some("http://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, ), None, ), @@ -565,7 +543,6 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.chinacloudapi.cn".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, ), None, ), @@ -579,12 +556,11 @@ mod tests { endpoint: None, ..Default::default() }, - AzureStorageScheme::Abfs, ), Some(("myfs", "/path/to/file.parquet")), ), ( - "scheme differs from configured scheme", + "different scheme is accepted", ( "wasbs://myfs@myaccount.blob.core.windows.net/path/to/file.parquet", AzdlsConfig { @@ -592,23 +568,9 @@ mod tests { endpoint: Some("https://myaccount.blob.core.windows.net".to_string()), ..Default::default() }, - AzureStorageScheme::Wasbs, ), Some(("myfs", "/path/to/file.parquet")), ), - ( - "different scheme is rejected", - ( - "wasbs://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet", - AzdlsConfig { - account_name: 
Some("myaccount".to_string()), - endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), - ..Default::default() - }, - AzureStorageScheme::Abfss, - ), - None, - ), ( "azurite endpoint with explicit configuration", ( @@ -619,14 +581,13 @@ mod tests { account_key: Some("secret".to_string()), ..Default::default() }, - AzureStorageScheme::Wasb, ), Some(("testfs", "/path/to/data.parquet")), ), ]; for (name, input, expected) in test_cases { - let result = azdls_create_operator(input.0, &input.1, Some(&input.2)); + let result = azdls_create_operator(input.0, &input.1); match expected { Some((expected_filesystem, expected_path)) => { assert!(result.is_ok(), "Test case {name} failed: {result:?}"); diff --git a/crates/storage/opendal/src/lib.rs b/crates/storage/opendal/src/lib.rs index 3e29f40388..7fdf9e6965 100644 --- a/crates/storage/opendal/src/lib.rs +++ b/crates/storage/opendal/src/lib.rs @@ -119,10 +119,7 @@ pub enum OpenDalStorageFactory { Oss, /// Azure Data Lake Storage factory. #[cfg(feature = "opendal-azdls")] - Azdls { - /// The configured Azure storage scheme. - configured_scheme: AzureStorageScheme, - }, + Azdls, } #[typetag::serde(name = "OpenDalStorageFactory")] @@ -152,12 +149,9 @@ impl StorageFactory for OpenDalStorageFactory { config: oss_config_parse(config.props().clone())?.into(), })), #[cfg(feature = "opendal-azdls")] - OpenDalStorageFactory::Azdls { configured_scheme } => { - Ok(Arc::new(OpenDalStorage::Azdls { - configured_scheme: Some(configured_scheme.clone()), - config: azdls_config_parse(config.props().clone())?.into(), - })) - } + OpenDalStorageFactory::Azdls => Ok(Arc::new(OpenDalStorage::Azdls { + config: azdls_config_parse(config.props().clone())?.into(), + })), #[cfg(all( not(feature = "opendal-memory"), not(feature = "opendal-fs"), @@ -220,10 +214,6 @@ pub enum OpenDalStorage { /// `wasb[s]://@.blob./`. #[cfg(feature = "opendal-azdls")] Azdls { - /// The expected Azure storage scheme. 
When set, paths must use this - /// exact scheme; `None` disables scheme validation (used by the - /// resolving storage, which auto-detects the scheme from the path). - configured_scheme: Option, /// Azure DLS configuration. config: Arc, }, @@ -319,10 +309,7 @@ impl OpenDalStorage { } } #[cfg(feature = "opendal-azdls")] - OpenDalStorage::Azdls { - configured_scheme, - config, - } => azdls_create_operator(path, config, configured_scheme.as_ref())?, + OpenDalStorage::Azdls { config } => azdls_create_operator(path, config)?, #[cfg(all( not(feature = "opendal-s3"), not(feature = "opendal-fs"), @@ -414,12 +401,9 @@ impl OpenDalStorage { } } #[cfg(feature = "opendal-azdls")] - OpenDalStorage::Azdls { - configured_scheme, - config, - } => { + OpenDalStorage::Azdls { config } => { let azure_path = path.parse::()?; - match_path_with_config(&azure_path, config, configured_scheme.as_ref())?; + match_path_with_config(&azure_path, config)?; let relative_path_len = azure_path.path.len(); Ok(&path[path.len() - relative_path_len..]) } @@ -708,7 +692,6 @@ mod tests { #[test] fn test_relativize_path_azdls() { let storage = OpenDalStorage::Azdls { - configured_scheme: Some(AzureStorageScheme::Abfss), config: Arc::new(AzdlsConfig { account_name: Some("myaccount".to_string()), endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), diff --git a/crates/storage/opendal/src/resolving.rs b/crates/storage/opendal/src/resolving.rs index 0c04108347..64a16b18d2 100644 --- a/crates/storage/opendal/src/resolving.rs +++ b/crates/storage/opendal/src/resolving.rs @@ -114,7 +114,6 @@ fn build_storage_for_scheme( Scheme::Azdls => { let config = crate::azdls::azdls_config_parse(props.clone())?; Ok(OpenDalStorage::Azdls { - configured_scheme: None, config: Arc::new(config), }) }