diff --git a/.cargo/audit.toml b/.cargo/audit.toml index 09e2d35c50..71354ea3a5 100644 --- a/.cargo/audit.toml +++ b/.cargo/audit.toml @@ -33,4 +33,9 @@ ignore = [ # # Introduced by object_store, see https://github.com/apache/arrow-rs-object-store/issues/564 "RUSTSEC-2025-0134", + # `rand` unsoundness with custom logger using `rand::rng()` + # + # Direct dependency upgraded to 0.9.3+. Transitive rand 0.8.5 remains + # from reqsign/sqllogictest/rustc-hash — no 0.8.x patch exists. + "RUSTSEC-2026-0097", ] diff --git a/.github/workflows/asf-allowlist-check.yml b/.github/workflows/asf-allowlist-check.yml index d4e84c5922..65dbe8bcbe 100644 --- a/.github/workflows/asf-allowlist-check.yml +++ b/.github/workflows/asf-allowlist-check.yml @@ -43,5 +43,4 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: persist-credentials: false - # Intentionally unpinned to always use the latest allowlist from the ASF. - - uses: apache/infrastructure-actions/allowlist-check@main # zizmor: ignore[unpinned-uses] + - uses: apache/infrastructure-actions/allowlist-check@4e9c961f587f72b170874b6f5cd4ac15f7f26eb8 # main diff --git a/.github/workflows/audit.yml b/.github/workflows/audit.yml index 68731cbed3..3f9865ed8a 100644 --- a/.github/workflows/audit.yml +++ b/.github/workflows/audit.yml @@ -37,7 +37,10 @@ on: - cron: '0 0 * * *' permissions: + # All other permissions are set to none contents: read + checks: write + issues: write jobs: security_audit: diff --git a/.github/workflows/bindings_python_ci.yml b/.github/workflows/bindings_python_ci.yml index a02ae9f0af..842fce7f83 100644 --- a/.github/workflows/bindings_python_ci.yml +++ b/.github/workflows/bindings_python_ci.yml @@ -63,7 +63,7 @@ jobs: - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6 with: persist-credentials: false - - uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # v7.3.1 + - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 
with: version: "0.9.3" enable-cache: true @@ -95,12 +95,12 @@ jobs: - uses: actions/setup-python@a309ff8b426b58ec0e2a45f0f869d46889d02405 # v6 with: python-version: 3.12 - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: working-directory: "bindings/python" command: build args: --out dist -i python3.12 # Explicitly set interpreter; manylinux containers have multiple Pythons and maturin may pick an older one - - uses: astral-sh/setup-uv@5a095e7a2014a4212f075830d4f7277575a9d098 # v7.3.1 + - uses: astral-sh/setup-uv@08807647e7069bb48b6ef5acd8ec9567f424441b # v8.1.0 with: version: "0.9.3" enable-cache: true diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 071d6dbcbf..8b31386e47 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -163,7 +163,7 @@ jobs: - name: Install cargo-nextest if: matrix.test-suite.name == 'default' - uses: taiki-e/install-action@0fde6d128a3d980ceac30be8c8b8739abd963b81 # v2.70.0 + uses: taiki-e/install-action@055f5df8c3f65ea01cd41e9dc855becd88953486 # v2.75.18 with: tool: cargo-nextest diff --git a/.github/workflows/ci_typos.yml b/.github/workflows/ci_typos.yml index 9373c7295d..fff347e638 100644 --- a/.github/workflows/ci_typos.yml +++ b/.github/workflows/ci_typos.yml @@ -47,4 +47,4 @@ jobs: with: persist-credentials: false - name: Check typos - uses: crate-ci/typos@631208b7aac2daa8b707f55e7331f9112b0e062d # v1.44.0 + uses: crate-ci/typos@cf5f1c29a8ac336af8568821ec41919923b05a83 # v1.45.1 diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index fe0459aeb7..7e9c8208c8 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -46,11 +46,11 @@ jobs: persist-credentials: false - name: Initialize CodeQL - uses: github/codeql-action/init@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + uses: 
github/codeql-action/init@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 with: languages: actions - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@c10b8064de6f491fea524254123dbe5e09572f13 # v4.35.1 + uses: github/codeql-action/analyze@95e58e9a2cdfd71adc6e0353d5c52f41a045d225 # v4.35.2 with: category: "/language:actions" diff --git a/.github/workflows/release_python.yml b/.github/workflows/release_python.yml index c9817e064c..f93f299d56 100644 --- a/.github/workflows/release_python.yml +++ b/.github/workflows/release_python.yml @@ -61,21 +61,21 @@ jobs: exit 1 fi echo "✅ Release tag format is valid: $RELEASE_TAG" - + # Strip 'v' prefix for cargo version CARGO_VERSION="${RELEASE_TAG#v}" echo "Cargo version (without v prefix): $CARGO_VERSION" - + # For manual triggers, validate that the tag matches the version in Cargo.toml if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then # Extract base version (without -rc.X suffix) for comparison with Cargo.toml BASE_VERSION="${CARGO_VERSION%-rc.*}" echo "Base version (for Cargo.toml comparison): $BASE_VERSION" - + # Read version from Cargo.toml and validate it matches CARGO_TOML_VERSION=$(grep '^version = ' bindings/python/Cargo.toml | head -1 | sed 's/version = "\(.*\)"/\1/') echo "Version in bindings/python/Cargo.toml: $CARGO_TOML_VERSION" - + if [ "$BASE_VERSION" != "$CARGO_TOML_VERSION" ]; then echo "❌ Version mismatch!" 
echo " Release tag base version: $BASE_VERSION" @@ -85,7 +85,7 @@ jobs: fi echo "✅ Version matches bindings/python/Cargo.toml" fi - + # Check if this is a release candidate if [[ "$RELEASE_TAG" =~ -rc\.[0-9]+$ ]]; then IS_RC="true" @@ -94,7 +94,7 @@ jobs: IS_RC="false" echo "This is a stable release" fi - + # Set outputs for other jobs to use echo "cargo-version=$CARGO_VERSION" >> $GITHUB_OUTPUT echo "is-rc=$IS_RC" >> $GITHUB_OUTPUT @@ -110,7 +110,7 @@ jobs: - name: Install toml-cli if: ${{ needs.validate-release-tag.outputs.is-rc == 'true' }} run: cargo install toml-cli - + - name: Set cargo version for RC if: ${{ needs.validate-release-tag.outputs.is-rc == 'true' }} working-directory: "bindings/python" @@ -124,13 +124,13 @@ jobs: env: NEEDS_VALIDATE_RELEASE_TAG_OUTPUTS_CARGO_VERSION: ${{ needs.validate-release-tag.outputs.cargo-version }} - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: working-directory: "bindings/python" command: sdist args: -o dist - name: Upload sdist - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-sdist path: bindings/python/dist @@ -159,7 +159,7 @@ jobs: - name: Install toml-cli if: ${{ needs.validate-release-tag.outputs.is-rc == 'true' }} run: cargo install toml-cli - + - name: Set cargo version for RC if: ${{ needs.validate-release-tag.outputs.is-rc == 'true' }} working-directory: "bindings/python" @@ -184,7 +184,7 @@ jobs: uses: ./.github/actions/setup-builder with: rust-version: ${{ steps.get-msrv.outputs.msrv }} - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: target: ${{ matrix.target }} manylinux: ${{ matrix.manylinux || 'auto' }} @@ -192,7 +192,7 @@ jobs: command: 
build args: --release -o dist -i python3.12 # Explicitly set interpreter; manylinux containers have multiple Pythons and maturin may pick an older one - name: Upload wheels - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-${{ matrix.os }}-${{ matrix.target }} path: bindings/python/dist diff --git a/.github/workflows/release_python_nightly.yml b/.github/workflows/release_python_nightly.yml index 55695784e9..26b034554c 100644 --- a/.github/workflows/release_python_nightly.yml +++ b/.github/workflows/release_python_nightly.yml @@ -48,14 +48,14 @@ jobs: with: timestamp: ${{ needs.set-version.outputs.TIMESTAMP }} - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: working-directory: "bindings/python" command: sdist args: -o dist - name: Upload sdist - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-sdist path: bindings/python/dist @@ -98,7 +98,7 @@ jobs: with: rust-version: ${{ steps.get-msrv.outputs.msrv }} - - uses: PyO3/maturin-action@04ac600d27cdf7a9a280dadf7147097c42b757ad # v1.50.1 + - uses: PyO3/maturin-action@e83996d129638aa358a18fbd1dfb82f0b0fb5d3b # v1.51.0 with: target: ${{ matrix.target }} manylinux: ${{ matrix.manylinux || 'auto' }} @@ -107,7 +107,7 @@ jobs: args: --release -o dist -i python3.12 # Explicitly set interpreter; manylinux containers have multiple Pythons and maturin may pick an older one - name: Upload wheels - uses: actions/upload-artifact@bbbca2ddaa5d8feaa63e36b76fdaad77386f024f # v7.0.0 + uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1 with: name: wheels-${{ matrix.os }}-${{ matrix.target }} path: bindings/python/dist diff --git 
a/.github/workflows/zizmor.yml b/.github/workflows/zizmor.yml index 313835fcbe..9306853937 100644 --- a/.github/workflows/zizmor.yml +++ b/.github/workflows/zizmor.yml @@ -39,6 +39,6 @@ jobs: persist-credentials: false - name: Run zizmor 🌈 - uses: zizmorcore/zizmor-action@71321a20a9ded102f6e9ce5718a2fcec2c4f70d8 # v0.5.2 + uses: zizmorcore/zizmor-action@b1d7e1fb5de872772f31590499237e7cce841e8e # v0.5.3 with: advanced-security: false diff --git a/.typos.toml b/.typos.toml index 407ce8168c..36996a553a 100644 --- a/.typos.toml +++ b/.typos.toml @@ -18,5 +18,9 @@ [type.rust] extend-ignore-identifiers-re = ["^bimap$"] +[default.extend-words] +ags = "ags" +AGS = "AGS" + [files] extend-exclude = ["**/testdata", "CHANGELOG.md"] diff --git a/Cargo.lock b/Cargo.lock index d3b5bb6646..2dad4ba41d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -133,7 +133,7 @@ version = "1.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -144,7 +144,7 @@ checksum = "291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" dependencies = [ "anstyle", "once_cell_polyfill", - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -169,7 +169,7 @@ dependencies = [ "miniz_oxide", "num-bigint", "quad-rand", - "rand 0.9.2", + "rand 0.9.4", "regex-lite", "serde", "serde_bytes", @@ -1062,7 +1062,7 @@ version = "3.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c" dependencies = [ - "darling 0.23.0", + "darling 0.20.11", "ident_case", "prettyplease", "proc-macro2", @@ -1103,6 +1103,20 @@ name = "bytemuck" version = "1.25.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" +dependencies = [ + "bytemuck_derive", +] + 
+[[package]] +name = "bytemuck_derive" +version = "1.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9abbd1bc6865053c427f7198e6af43bfdedc55ab791faed4fbd361d789575ff" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] [[package]] name = "byteorder" @@ -1286,7 +1300,7 @@ version = "3.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faf9468729b8cbcea668e36183cb69d317348c2e08e994829fb56ebfdfbaac34" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] @@ -1605,9 +1619,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de9f8117889ba9503440f1dd79ebab32ba52ccf1720bb83cd718a29d4edc0d16" +checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ "arrow", "arrow-schema", @@ -1649,7 +1663,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand 0.9.2", + "rand 0.9.4", "regex", "sqlparser", "tempfile", @@ -1661,9 +1675,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be893b73a13671f310ffcc8da2c546b81efcc54c22e0382c0a28aa3537017137" +checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ "arrow", "async-trait", @@ -1686,9 +1700,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830487b51ed83807d6b32d6325f349c3144ae0c9bf772cf2a712db180c31d5e6" +checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ "arrow", "async-trait", @@ -1709,9 +1723,9 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "53.0.0" +version = "53.1.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "8735220c84a731c3917dce75ec837a8376eddf5462b0c5dbaf5a2e354c9b6e05" +checksum = "84a22c001ad1ac11cda09dab69b151eef5b1a992e23bc524ab0d1e63e5dea327" dependencies = [ "arrow", "async-trait", @@ -1737,9 +1751,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d7663f3af955292f8004e74bcaf8f7ea3d66cc38438749615bb84815b61a293" +checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash", "apache-avro", @@ -1764,9 +1778,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f590205c7e32fe1fea48dd53ffb406e56ae0e7a062213a3ac848db8771641bd" +checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def" dependencies = [ "futures", "log", @@ -1775,9 +1789,9 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fde1e030a9dc87b743c806fbd631f5ecfa2ccaa4ffb61fa19144a07fea406b79" +checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ "arrow", "async-compression", @@ -1801,7 +1815,7 @@ dependencies = [ "liblzma", "log", "object_store", - "rand 0.9.2", + "rand 0.9.4", "tokio", "tokio-util", "url", @@ -1810,9 +1824,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331ebae7055dc108f9b54994b93dff91f3a17445539efe5b74e89264f7b36e15" +checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ "arrow", "arrow-ipc", @@ -1834,9 +1848,9 @@ dependencies = [ [[package]] name = 
"datafusion-datasource-avro" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49dda81c79b6ba57b1853a9158abc66eb85a3aa1cede0c517dabec6d8a4ed3aa" +checksum = "a579c3bd290c66ea4b269493e75e8a3ed42c9c895a651f10210a29538aee50c4" dependencies = [ "apache-avro", "arrow", @@ -1854,9 +1868,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e0d475088325e2986876aa27bb30d0574f72a22955a527d202f454681d55c5c" +checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ "arrow", "async-trait", @@ -1877,9 +1891,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea1520d81f31770f3ad6ee98b391e75e87a68a5bb90de70064ace5e0a7182fe8" +checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ "arrow", "async-trait", @@ -1901,9 +1915,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95be805d0742ab129720f4c51ad9242cd872599cdb076098b03f061fcdc7f946" +checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" dependencies = [ "arrow", "async-trait", @@ -1931,15 +1945,15 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c93ad9e37730d2c7196e68616f3f2dd3b04c892e03acd3a8eeca6e177f3c06a" +checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee" [[package]] name = "datafusion-execution" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "9437d3cd5d363f9319f8122182d4d233427de79c7eb748f23054c9aaa0fdd8df" +checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ "arrow", "arrow-buffer", @@ -1954,16 +1968,16 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand 0.9.2", + "rand 0.9.4", "tempfile", "url", ] [[package]] name = "datafusion-expr" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67164333342b86521d6d93fa54081ee39839894fb10f7a700c099af96d7552cf" +checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ "arrow", "async-trait", @@ -1984,9 +1998,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab05fdd00e05d5a6ee362882546d29d6d3df43a6c55355164a7fbee12d163bc9" +checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ "arrow", "datafusion-common", @@ -1997,9 +2011,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04fb863482d987cf938db2079e07ab0d3bb64595f28907a6c2f8671ad71cca7e" +checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ "arrow", "arrow-buffer", @@ -2020,7 +2034,7 @@ dependencies = [ "md-5", "memchr", "num-traits", - "rand 0.9.2", + "rand 0.9.4", "regex", "sha2", "unicode-segmentation", @@ -2029,9 +2043,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "829856f4e14275fb376c104f27cbf3c3b57a9cfe24885d98677525f5e43ce8d6" +checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash", "arrow", @@ -2051,9 
+2065,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08af79cc3d2aa874a362fb97decfcbd73d687190cb096f16a6c85a7780cce311" +checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" dependencies = [ "ahash", "arrow", @@ -2064,9 +2078,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "465ae3368146d49c2eda3e2c0ef114424c87e8a6b509ab34c1026ace6497e790" +checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" dependencies = [ "arrow", "arrow-ord", @@ -2089,9 +2103,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6156e6b22fcf1784112fc0173f3ae6e78c8fdb4d3ed0eace9543873b437e2af6" +checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ "arrow", "async-trait", @@ -2105,9 +2119,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca7baec14f866729012efb89011a6973f3a346dc8090c567bfcd328deff551c1" +checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ "arrow", "datafusion-common", @@ -2123,9 +2137,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "159228c3280d342658466bb556dc24de30047fe1d7e559dc5d16ccc5324166f9" +checksum = "8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ 
-2133,9 +2147,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5427e5da5edca4d21ea1c7f50e1c9421775fe33d7d5726e5641a833566e7578" +checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies = [ "datafusion-doc", "quote", @@ -2144,9 +2158,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89099eefcd5b223ec685c36a41d35c69239236310d71d339f2af0fa4383f3f46" +checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ "arrow", "chrono", @@ -2164,9 +2178,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f222df5195d605d79098ef37bdd5323bff0131c9d877a24da6ec98dfca9fe36" +checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash", "arrow", @@ -2188,9 +2202,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-adapter" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40838625d63d9c12549d81979db3dd675d159055eb9135009ba272ab0e8d0f64" +checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ "arrow", "datafusion-common", @@ -2203,9 +2217,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eacbcc4cfd502558184ed58fa3c72e775ec65bf077eef5fd2b3453db676f893c" +checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash", "arrow", @@ -2220,9 +2234,9 @@ dependencies = [ [[package]] name = 
"datafusion-physical-optimizer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d501d0e1d0910f015677121601ac177ec59272ef5c9324d1147b394988f40941" +checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ "arrow", "datafusion-common", @@ -2239,9 +2253,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "463c88ad6f1ecab1810f4c9f046898bee035b370137eb79b2b2db925e270631d" +checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash", "arrow", @@ -2271,9 +2285,9 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2857618a0ecbd8cd0cf29826889edd3a25774ec26b2995fc3862095c95d88fc6" +checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ "arrow", "datafusion-common", @@ -2288,9 +2302,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef8637e35022c5c775003b3ab1debc6b4a8f0eb41b069bdd5475dd3aa93f6eba" +checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e" dependencies = [ "async-trait", "datafusion-common", @@ -2302,9 +2316,9 @@ dependencies = [ [[package]] name = "datafusion-spark" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "923a8b871962a9d860f036f743a20af50ff04729f1da2468ed220dab4f61c97d" +checksum = "e059dcf8544da0d6598d0235be3cc29c209094a5976b2e4822e4a2cf91c2b5c5" dependencies = [ "arrow", "bigdecimal", @@ -2320,7 +2334,7 @@ dependencies = [ "datafusion-functions-nested", "log", "percent-encoding", - "rand 
0.9.2", + "rand 0.9.4", "serde_json", "sha1", "sha2", @@ -2329,9 +2343,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12d9e9f16a1692a11c94bcc418191fa15fd2b4d72a0c1a0c607db93c0b84dd81" +checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ "arrow", "bigdecimal", @@ -2348,9 +2362,9 @@ dependencies = [ [[package]] name = "datafusion-sqllogictest" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a43746bd59e7f2655be4c5553ede4a1ceb1cd34005932fa9e2bd0641c714c46e" +checksum = "04e5a4a7a49143a68936992b6dbb0db44121c635e9992b2482817278f1e69c56" dependencies = [ "arrow", "async-trait", @@ -2374,9 +2388,9 @@ dependencies = [ [[package]] name = "datafusion-substrait" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d5e5656a7e63d51dd3e5af3dbd347ea83bbe993a77c66b854b74961570d16490" +checksum = "98494539a5468979cc42d86c7bc5f0f8cb71ee5c742694c26fc34efdd29dd2e5" dependencies = [ "async-recursion", "async-trait", @@ -2480,7 +2494,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -2633,7 +2647,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -3035,6 +3049,7 @@ version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" dependencies = [ + "bytemuck", "cfg-if", "crunchy", "num-traits", @@ -3265,7 +3280,7 @@ dependencies = [ "libc", "percent-encoding", "pin-project-lite", - "socket2 0.6.3", + "socket2 0.5.10", "tokio", 
"tower-service", "tracing", @@ -3335,12 +3350,13 @@ dependencies = [ "ordered-float 4.6.0", "parquet", "pretty_assertions", - "rand 0.8.5", + "rand 0.9.4", "regex", "reqsign", "reqwest", "roaring", "serde", + "serde_arrow", "serde_bytes", "serde_derive", "serde_json", @@ -3449,14 +3465,19 @@ name = "iceberg-catalog-s3tables" version = "0.9.0" dependencies = [ "anyhow", + "arrow-array", + "arrow-schema", "async-trait", "aws-config", "aws-sdk-s3tables", + "futures", "iceberg", "iceberg-storage-opendal", "iceberg_test_utils", "itertools 0.13.0", + "parquet", "tokio", + "uuid", ] [[package]] @@ -3834,7 +3855,7 @@ dependencies = [ "portable-atomic", "portable-atomic-util", "serde_core", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -4114,6 +4135,21 @@ dependencies = [ "twox-hash", ] +[[package]] +name = "marrow" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5240d6977234968ff9ad254bfa73aa397fb51e41dcb22b1eb85835e9295485b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "bytemuck", + "half", + "serde", +] + [[package]] name = "md-5" version = "0.10.6" @@ -4241,7 +4277,7 @@ dependencies = [ "hyper-util", "log", "pin-project-lite", - "rand 0.9.2", + "rand 0.9.4", "regex", "serde_json", "serde_urlencoded", @@ -4370,7 +4406,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.61.2", + "windows-sys 0.59.0", ] [[package]] @@ -4499,7 +4535,7 @@ dependencies = [ "parking_lot", "percent-encoding", "quick-xml 0.39.2", - "rand 0.10.0", + "rand 0.10.1", "reqwest", "ring", "rustls-pki-types", @@ -5026,7 +5062,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" dependencies = [ "heck", - "itertools 0.14.0", + "itertools 0.13.0", "log", 
"multimap", "petgraph", @@ -5045,7 +5081,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" dependencies = [ "anyhow", - "itertools 0.14.0", + "itertools 0.13.0", "proc-macro2", "quote", "syn", @@ -5139,7 +5175,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", - "socket2 0.6.3", + "socket2 0.5.10", "thiserror 2.0.18", "tokio", "tracing", @@ -5155,7 +5191,7 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.2", + "rand 0.9.4", "ring", "rustc-hash", "rustls", @@ -5176,7 +5212,7 @@ dependencies = [ "cfg_aliases", "libc", "once_cell", - "socket2 0.6.3", + "socket2 0.5.10", "tracing", "windows-sys 0.60.2", ] @@ -5235,9 +5271,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.9.2" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.5", @@ -5245,9 +5281,9 @@ dependencies = [ [[package]] name = "rand" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc266eb313df6c5c09c1c7b1fbe2510961e5bcd3add930c1e31f7ed9da0feff8" +checksum = "d2e8e8bcc7961af1fdac401278c6a831614941f6164ee3bf4ce61b7edb162207" dependencies = [ "chacha20", "getrandom 0.4.2", @@ -5641,7 +5677,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -5683,9 +5719,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.10" +version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ 
"aws-lc-rs", "ring", @@ -5877,6 +5913,21 @@ dependencies = [ "serde", ] +[[package]] +name = "serde_arrow" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2784e59a0315568e850cb01ddadf458f8c09e28d8cfc4880c2cc08f5dc3444e0" +dependencies = [ + "arrow-array", + "arrow-schema", + "bytemuck", + "chrono", + "half", + "marrow", + "serde", +] + [[package]] name = "serde_bytes" version = "0.11.19" @@ -6150,7 +6201,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a766e1110788c36f4fa1c2b71b387a7815aa65f88ce0229841826633d93723e" dependencies = [ "libc", - "windows-sys 0.61.2", + "windows-sys 0.60.2", ] [[package]] @@ -6589,7 +6640,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix", - "windows-sys 0.61.2", + "windows-sys 0.52.0", ] [[package]] @@ -6725,9 +6776,9 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.50.0" +version = "1.52.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "27ad5e34374e03cfffefc301becb44e9dc3c17584f414349ebe29ed26661822d" +checksum = "b67dee974fe86fd92cc45b7a95fdd2f99a36a6d7b0d431a231178d3d670bbcc6" dependencies = [ "bytes", "libc", @@ -6742,9 +6793,9 @@ dependencies = [ [[package]] name = "tokio-macros" -version = "2.6.1" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c55a2eff8b69ce66c84f85e1da1c233edc36ceb85a2058d11b0d6a3c7e7569c" +checksum = "385a6cb71ab9ab790c5fe8d67f1645e6c450a7ce006a33de03daa956cf70a496" dependencies = [ "proc-macro2", "quote", @@ -7231,7 +7282,7 @@ dependencies = [ "nix 0.29.0", "once_cell", "pin-project", - "rand 0.9.2", + "rand 0.9.4", "socket2 0.5.10", "thiserror 2.0.18", "tokio", @@ -7481,7 +7532,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - 
"windows-sys 0.61.2", + "windows-sys 0.48.0", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 778e69c9d9..7f612c44bf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -66,7 +66,7 @@ cfg-if = "1" chrono = "0.4.41" clap = { version = "4.5.48", features = ["derive", "cargo"] } dashmap = "6" -datafusion = "53.0.0" +datafusion = "53.1.0" datafusion-cli = "53.0.0" datafusion-sqllogictest = "53.0.0" derive_builder = "0.20" @@ -108,7 +108,7 @@ ordered-float = "4" parquet = "58" pilota = "0.11.10" pretty_assertions = "1.4" -rand = "0.8.5" +rand = "0.9.3" regex = "1.11.3" reqwest = { version = "0.12.12", default-features = false, features = ["json"] } roaring = { version = "0.11" } diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock index 1b5c06f492..72ea322d7b 100644 --- a/bindings/python/Cargo.lock +++ b/bindings/python/Cargo.lock @@ -163,7 +163,7 @@ dependencies = [ "miniz_oxide", "num-bigint", "quad-rand", - "rand 0.9.2", + "rand 0.9.4", "regex-lite", "serde", "serde_bytes", @@ -1052,9 +1052,9 @@ dependencies = [ [[package]] name = "datafusion" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de9f8117889ba9503440f1dd79ebab32ba52ccf1720bb83cd718a29d4edc0d16" +checksum = "93db0e623840612f7f2cd757f7e8a8922064192363732c88692e0870016e141b" dependencies = [ "arrow", "arrow-schema", @@ -1095,7 +1095,7 @@ dependencies = [ "object_store", "parking_lot", "parquet", - "rand 0.9.2", + "rand 0.9.4", "regex", "sqlparser", "tempfile", @@ -1107,9 +1107,9 @@ dependencies = [ [[package]] name = "datafusion-catalog" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "be893b73a13671f310ffcc8da2c546b81efcc54c22e0382c0a28aa3537017137" +checksum = "37cefde60b26a7f4ff61e9d2ff2833322f91df2b568d7238afe67bde5bdffb66" dependencies = [ "arrow", "async-trait", @@ -1132,9 +1132,9 @@ dependencies = [ [[package]] name = "datafusion-catalog-listing" 
-version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830487b51ed83807d6b32d6325f349c3144ae0c9bf772cf2a712db180c31d5e6" +checksum = "17e112307715d6a7a331111a4c2330ff54bc237183511c319e3708a4cff431fb" dependencies = [ "arrow", "async-trait", @@ -1155,9 +1155,9 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0d7663f3af955292f8004e74bcaf8f7ea3d66cc38438749615bb84815b61a293" +checksum = "d72a11ca44a95e1081870d3abb80c717496e8a7acb467a1d3e932bb636af5cc2" dependencies = [ "ahash", "arrow", @@ -1180,9 +1180,9 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f590205c7e32fe1fea48dd53ffb406e56ae0e7a062213a3ac848db8771641bd" +checksum = "89f4afaed29670ec4fd6053643adc749fe3f4bc9d1ce1b8c5679b22c67d12def" dependencies = [ "futures", "log", @@ -1191,9 +1191,9 @@ dependencies = [ [[package]] name = "datafusion-datasource" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fde1e030a9dc87b743c806fbd631f5ecfa2ccaa4ffb61fa19144a07fea406b79" +checksum = "e9fb386e1691355355a96419978a0022b7947b44d4a24a6ea99f00b6b485cbb6" dependencies = [ "arrow", "async-compression", @@ -1217,7 +1217,7 @@ dependencies = [ "liblzma", "log", "object_store", - "rand 0.9.2", + "rand 0.9.4", "tokio", "tokio-util", "url", @@ -1226,9 +1226,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-arrow" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "331ebae7055dc108f9b54994b93dff91f3a17445539efe5b74e89264f7b36e15" +checksum = "ffa6c52cfed0734c5f93754d1c0175f558175248bf686c944fb05c373e5fc096" dependencies = [ "arrow", "arrow-ipc", @@ -1250,9 +1250,9 
@@ dependencies = [ [[package]] name = "datafusion-datasource-csv" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e0d475088325e2986876aa27bb30d0574f72a22955a527d202f454681d55c5c" +checksum = "503f29e0582c1fc189578d665ff57d9300da1f80c282777d7eb67bb79fb8cdca" dependencies = [ "arrow", "async-trait", @@ -1273,9 +1273,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-json" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea1520d81f31770f3ad6ee98b391e75e87a68a5bb90de70064ace5e0a7182fe8" +checksum = "e33804749abc8d0c8cb7473228483cb8070e524c6f6086ee1b85a64debe2b3d2" dependencies = [ "arrow", "async-trait", @@ -1297,9 +1297,9 @@ dependencies = [ [[package]] name = "datafusion-datasource-parquet" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95be805d0742ab129720f4c51ad9242cd872599cdb076098b03f061fcdc7f946" +checksum = "32a8e0365e0e08e8ff94d912f0ababcf9065a1a304018ba90b1fc83c855b4997" dependencies = [ "arrow", "async-trait", @@ -1327,15 +1327,15 @@ dependencies = [ [[package]] name = "datafusion-doc" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c93ad9e37730d2c7196e68616f3f2dd3b04c892e03acd3a8eeca6e177f3c06a" +checksum = "8de6ac0df1662b9148ad3c987978b32cbec7c772f199b1d53520c8fa764a87ee" [[package]] name = "datafusion-execution" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9437d3cd5d363f9319f8122182d4d233427de79c7eb748f23054c9aaa0fdd8df" +checksum = "c03c7fbdaefcca4ef6ffe425a5fc2325763bfb426599bb0bf4536466efabe709" dependencies = [ "arrow", "arrow-buffer", @@ -1349,16 +1349,16 @@ dependencies = [ "log", "object_store", "parking_lot", - "rand 0.9.2", + "rand 0.9.4", "tempfile", "url", ] [[package]] name = 
"datafusion-expr" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67164333342b86521d6d93fa54081ee39839894fb10f7a700c099af96d7552cf" +checksum = "574b9b6977fedbd2a611cbff12e5caf90f31640ad9dc5870f152836d94bad0dd" dependencies = [ "arrow", "async-trait", @@ -1379,9 +1379,9 @@ dependencies = [ [[package]] name = "datafusion-expr-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ab05fdd00e05d5a6ee362882546d29d6d3df43a6c55355164a7fbee12d163bc9" +checksum = "7d7c3adf3db8bf61e92eb90cb659c8e8b734593a8f7c8e12a843c7ddba24b87e" dependencies = [ "arrow", "datafusion-common", @@ -1422,9 +1422,9 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "04fb863482d987cf938db2079e07ab0d3bb64595f28907a6c2f8671ad71cca7e" +checksum = "f28aa4e10384e782774b10e72aca4d93ef7b31aa653095d9d4536b0a3dbc51b6" dependencies = [ "arrow", "arrow-buffer", @@ -1445,7 +1445,7 @@ dependencies = [ "md-5", "memchr", "num-traits", - "rand 0.9.2", + "rand 0.9.4", "regex", "sha2", "unicode-segmentation", @@ -1454,9 +1454,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "829856f4e14275fb376c104f27cbf3c3b57a9cfe24885d98677525f5e43ce8d6" +checksum = "00aa6217e56098ba84e0a338176fe52f0a84cca398021512c6c8c5eff806d0ad" dependencies = [ "ahash", "arrow", @@ -1476,9 +1476,9 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08af79cc3d2aa874a362fb97decfcbd73d687190cb096f16a6c85a7780cce311" +checksum = "b511250349407db7c43832ab2de63f5557b19a20dfd236b39ca2c04468b50d47" 
dependencies = [ "ahash", "arrow", @@ -1489,9 +1489,9 @@ dependencies = [ [[package]] name = "datafusion-functions-nested" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "465ae3368146d49c2eda3e2c0ef114424c87e8a6b509ab34c1026ace6497e790" +checksum = "ef13a858e20d50f0a9bb5e96e7ac82b4e7597f247515bccca4fdd2992df0212a" dependencies = [ "arrow", "arrow-ord", @@ -1514,9 +1514,9 @@ dependencies = [ [[package]] name = "datafusion-functions-table" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6156e6b22fcf1784112fc0173f3ae6e78c8fdb4d3ed0eace9543873b437e2af6" +checksum = "72b40d3f5bbb3905f9ccb1ce9485a9595c77b69758a7c24d3ba79e334ff51e7e" dependencies = [ "arrow", "async-trait", @@ -1530,9 +1530,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca7baec14f866729012efb89011a6973f3a346dc8090c567bfcd328deff551c1" +checksum = "d4e88ec9d57c9b685d02f58bfee7be62d72610430ddcedb82a08e5d9925dbfb6" dependencies = [ "arrow", "datafusion-common", @@ -1548,9 +1548,9 @@ dependencies = [ [[package]] name = "datafusion-functions-window-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "159228c3280d342658466bb556dc24de30047fe1d7e559dc5d16ccc5324166f9" +checksum = "8307bb93519b1a91913723a1130cfafeee3f72200d870d88e91a6fc5470ede5c" dependencies = [ "datafusion-common", "datafusion-physical-expr-common", @@ -1558,9 +1558,9 @@ dependencies = [ [[package]] name = "datafusion-macros" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5427e5da5edca4d21ea1c7f50e1c9421775fe33d7d5726e5641a833566e7578" +checksum = "2e367e6a71051d0ebdd29b2f85d12059b38b1d1f172c6906e80016da662226bd" dependencies 
= [ "datafusion-doc", "quote", @@ -1569,9 +1569,9 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89099eefcd5b223ec685c36a41d35c69239236310d71d339f2af0fa4383f3f46" +checksum = "e929015451a67f77d9d8b727b2bf3a40c4445fdef6cdc53281d7d97c76888ace" dependencies = [ "arrow", "chrono", @@ -1589,9 +1589,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0f222df5195d605d79098ef37bdd5323bff0131c9d877a24da6ec98dfca9fe36" +checksum = "4b1e68aba7a4b350401cfdf25a3d6f989ad898a7410164afe9ca52080244cb59" dependencies = [ "ahash", "arrow", @@ -1613,9 +1613,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-adapter" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40838625d63d9c12549d81979db3dd675d159055eb9135009ba272ab0e8d0f64" +checksum = "ea22315f33cf2e0adc104e8ec42e285f6ed93998d565c65e82fec6a9ee9f9db4" dependencies = [ "arrow", "datafusion-common", @@ -1628,9 +1628,9 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eacbcc4cfd502558184ed58fa3c72e775ec65bf077eef5fd2b3453db676f893c" +checksum = "b04b45ea8ad3ac2d78f2ea2a76053e06591c9629c7a603eda16c10649ecf4362" dependencies = [ "ahash", "arrow", @@ -1645,9 +1645,9 @@ dependencies = [ [[package]] name = "datafusion-physical-optimizer" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d501d0e1d0910f015677121601ac177ec59272ef5c9324d1147b394988f40941" +checksum = "7cb13397809a425918f608dfe8653f332015a3e330004ab191b4404187238b95" dependencies = [ "arrow", "datafusion-common", @@ -1664,9 
+1664,9 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "463c88ad6f1ecab1810f4c9f046898bee035b370137eb79b2b2db925e270631d" +checksum = "5edc023675791af9d5fb4cc4c24abf5f7bd3bd4dcf9e5bd90ea1eff6976dcc79" dependencies = [ "ahash", "arrow", @@ -1719,7 +1719,7 @@ dependencies = [ "datafusion-proto-common", "object_store", "prost", - "rand 0.9.2", + "rand 0.9.4", ] [[package]] @@ -1735,9 +1735,9 @@ dependencies = [ [[package]] name = "datafusion-pruning" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2857618a0ecbd8cd0cf29826889edd3a25774ec26b2995fc3862095c95d88fc6" +checksum = "ac8c76860e355616555081cab5968cec1af7a80701ff374510860bcd567e365a" dependencies = [ "arrow", "datafusion-common", @@ -1752,9 +1752,9 @@ dependencies = [ [[package]] name = "datafusion-session" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ef8637e35022c5c775003b3ab1debc6b4a8f0eb41b069bdd5475dd3aa93f6eba" +checksum = "5412111aa48e2424ba926112e192f7a6b7e4ccb450145d25ce5ede9f19dc491e" dependencies = [ "async-trait", "datafusion-common", @@ -1766,9 +1766,9 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "53.0.0" +version = "53.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "12d9e9f16a1692a11c94bcc418191fa15fd2b4d72a0c1a0c607db93c0b84dd81" +checksum = "fa0d133ddf8b9b3b872acac900157f783e7b879fe9a6bccf389abebbfac45ec1" dependencies = [ "arrow", "bigdecimal", @@ -2435,7 +2435,7 @@ dependencies = [ "once_cell", "ordered-float 4.6.0", "parquet", - "rand 0.8.5", + "rand 0.9.4", "reqwest", "roaring", "serde", @@ -3587,7 +3587,7 @@ dependencies = [ "bytes", "getrandom 0.3.4", "lru-slab", - "rand 0.9.2", + "rand 0.9.4", "ring", "rustc-hash", "rustls", @@ -3647,9 +3647,9 @@ dependencies = 
[ [[package]] name = "rand" -version = "0.9.2" +version = "0.9.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" dependencies = [ "rand_chacha 0.9.0", "rand_core 0.9.5", @@ -3968,9 +3968,9 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.103.10" +version = "0.103.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "df33b2b81ac578cabaf06b89b0631153a3f416b0a886e8a7a1707fb51abbd1ef" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" dependencies = [ "ring", "rustls-pki-types", diff --git a/crates/catalog/glue/src/catalog.rs b/crates/catalog/glue/src/catalog.rs index a7e0171337..5b3ccf3b39 100644 --- a/crates/catalog/glue/src/catalog.rs +++ b/crates/catalog/glue/src/catalog.rs @@ -203,7 +203,6 @@ impl GlueCatalog { // Use provided factory or default to OpenDalStorageFactory::S3 let factory = storage_factory.unwrap_or_else(|| { Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, }) }); diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index f19cf7bff4..d0e6486ad8 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -23,7 +23,10 @@ use std::collections::HashMap; use std::sync::Arc; -use iceberg::io::{FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY}; +use iceberg::io::{ + FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION, + S3_SECRET_ACCESS_KEY, +}; use iceberg::{Catalog, CatalogBuilder, Namespace, NamespaceIdent}; use iceberg_catalog_hms::{ HMS_CATALOG_PROP_THRIFT_TRANSPORT, HMS_CATALOG_PROP_URI, HMS_CATALOG_PROP_WAREHOUSE, @@ -56,11 +59,11 @@ async fn get_catalog() -> HmsCatalog { 
(S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), (S3_REGION.to_string(), "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS.to_string(), "true".to_string()), ]); // Wait for bucket to actually exist let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .with_props(props.clone()) @@ -79,7 +82,6 @@ async fn get_catalog() -> HmsCatalog { HmsCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .load("hms", props) diff --git a/crates/catalog/loader/tests/common/mod.rs b/crates/catalog/loader/tests/common/mod.rs index 6524d56339..dfa9535672 100644 --- a/crates/catalog/loader/tests/common/mod.rs +++ b/crates/catalog/loader/tests/common/mod.rs @@ -24,8 +24,8 @@ use std::fmt; use std::sync::Arc; use iceberg::io::{ - FileIOBuilder, LocalFsStorageFactory, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, - S3_SECRET_ACCESS_KEY, + FileIOBuilder, LocalFsStorageFactory, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, + S3_REGION, S3_SECRET_ACCESS_KEY, }; use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; use iceberg::spec::{NestedField, PrimitiveType, Schema, Type}; @@ -242,10 +242,10 @@ async fn glue_catalog() -> GlueCatalog { (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), (S3_REGION.to_string(), "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS.to_string(), "true".to_string()), ]); let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .with_props(props.clone()) @@ -293,10 +293,10 @@ async fn hms_catalog() -> HmsCatalog { (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), 
(S3_REGION.to_string(), "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS.to_string(), "true".to_string()), ]); let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .with_props(props.clone()) @@ -313,7 +313,6 @@ async fn hms_catalog() -> HmsCatalog { HmsCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, })) .load("hms", props) diff --git a/crates/catalog/s3tables/Cargo.toml b/crates/catalog/s3tables/Cargo.toml index 2fe096fec9..dc7be3027f 100644 --- a/crates/catalog/s3tables/Cargo.toml +++ b/crates/catalog/s3tables/Cargo.toml @@ -39,6 +39,11 @@ iceberg-storage-opendal = { workspace = true, features = ["opendal-s3"] } [dev-dependencies] +arrow-array = { workspace = true } +arrow-schema = { workspace = true } +futures = { workspace = true } iceberg_test_utils = { path = "../../test_utils", features = ["tests"] } itertools = { workspace = true } +parquet = { workspace = true } tokio = { workspace = true } +uuid = { workspace = true } diff --git a/crates/catalog/s3tables/src/catalog.rs b/crates/catalog/s3tables/src/catalog.rs index b88bd77d29..cc43446943 100644 --- a/crates/catalog/s3tables/src/catalog.rs +++ b/crates/catalog/s3tables/src/catalog.rs @@ -202,7 +202,6 @@ impl S3TablesCatalog { // Use provided factory or default to OpenDalStorageFactory::S3 let factory = storage_factory.unwrap_or_else(|| { Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3a".to_string(), customized_credential_load: None, }) }); @@ -707,6 +706,7 @@ where T: std::fmt::Debug { #[cfg(test)] mod tests { + use futures::TryStreamExt; use iceberg::spec::{NestedField, PrimitiveType, Schema, Type}; use iceberg::transaction::{ApplyTransactionAction, Transaction}; @@ -1175,4 +1175,108 @@ mod tests { assert_eq!(err.message(), "Catalog name cannot be empty"); } } + + /// Verify that an S3 
Table catalog can create a table, write data, load the same table, and read from it. + #[tokio::test] + async fn test_s3tables_create_table_write_load_table_read() { + use iceberg::writer::base_writer::data_file_writer::DataFileWriterBuilder; + use iceberg::writer::file_writer::ParquetWriterBuilder; + use iceberg::writer::file_writer::location_generator::{ + DefaultFileNameGenerator, DefaultLocationGenerator, + }; + use iceberg::writer::file_writer::rolling_writer::RollingFileWriterBuilder; + use iceberg::writer::{IcebergWriter, IcebergWriterBuilder}; + + let catalog = match load_s3tables_catalog_from_env().await { + Ok(Some(c)) => c, + Ok(None) => return, + Err(e) => panic!("Error loading catalog: {e}"), + }; + + let ns = NamespaceIdent::new(format!("test_rw_{}", uuid::Uuid::new_v4().simple())); + catalog.create_namespace(&ns, HashMap::new()).await.unwrap(); + + let table_name = String::from("table"); + + let schema = Schema::builder() + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(); + let creation = TableCreation::builder() + .name(table_name.clone()) + .schema(schema) + .build(); + + let table = catalog.create_table(&ns, creation).await.unwrap(); + + // Write one row. + let arrow_schema: Arc = Arc::new( + table + .metadata() + .current_schema() + .as_ref() + .try_into() + .unwrap(), + ); + let batch = arrow_array::RecordBatch::try_new(arrow_schema, vec![Arc::new( + arrow_array::Int32Array::from(vec![42]), + )]) + .unwrap(); + + // Locations will be generated based on the table metadata, which will be using `s3://` for Amazon S3 Tables. 
+ let location_generator = DefaultLocationGenerator::new(table.metadata().clone()).unwrap(); + let file_name_generator = DefaultFileNameGenerator::new( + "test".to_string(), + None, + iceberg::spec::DataFileFormat::Parquet, + ); + let parquet_writer_builder = ParquetWriterBuilder::new( + parquet::file::properties::WriterProperties::default(), + table.metadata().current_schema().clone(), + ); + let rw = RollingFileWriterBuilder::new_with_default_file_size( + parquet_writer_builder, + table.file_io().clone(), + location_generator, + file_name_generator, + ); + let mut writer = DataFileWriterBuilder::new(rw).build(None).await.unwrap(); + writer.write(batch.clone()).await.unwrap(); + let data_files = writer.close().await.unwrap(); + + let tx = Transaction::new(&table); + let tx = tx + .fast_append() + .add_data_files(data_files) + .apply(tx) + .unwrap(); + tx.commit(&catalog).await.unwrap(); + + // Reload from catalog and read back. + let table_ident = TableIdent::new(ns.clone(), table_name.clone()); + let reloaded = catalog.load_table(&table_ident).await.unwrap(); + let batches: Vec = reloaded + .scan() + .select_all() + .build() + .expect("scan to be valid (snapshot exists, schema is OK)") + .to_arrow() + .await + .expect("scan tasks should be OK") + .try_collect() + .await + .expect("scan should complete successfully"); + + assert_eq!(batches.len(), 1); + assert_eq!( + batches[0], batch, + "read records should match records written earlier" + ); + + // Clean up. 
+ catalog.purge_table(&table_ident).await.ok(); + catalog.drop_namespace(&ns).await.ok(); + } } diff --git a/crates/iceberg/Cargo.toml b/crates/iceberg/Cargo.toml index 7e91050605..7f5c235c47 100644 --- a/crates/iceberg/Cargo.toml +++ b/crates/iceberg/Cargo.toml @@ -100,6 +100,7 @@ rand = { workspace = true } regex = { workspace = true } tempfile = { workspace = true } minijinja = { workspace = true } +serde_arrow = { version = "0.14", features = ["arrow-58"] } [package.metadata.cargo-machete] # These dependencies are added to ensure minimal dependency version diff --git a/crates/iceberg/src/arrow/caching_delete_file_loader.rs b/crates/iceberg/src/arrow/caching_delete_file_loader.rs index ae97534d83..231971fd54 100644 --- a/crates/iceberg/src/arrow/caching_delete_file_loader.rs +++ b/crates/iceberg/src/arrow/caching_delete_file_loader.rs @@ -25,6 +25,7 @@ use tokio::sync::oneshot::{Receiver, channel}; use super::delete_filter::{DeleteFilter, PosDelLoadAction}; use crate::arrow::delete_file_loader::BasicDeleteFileLoader; +use crate::arrow::scan_metrics::ScanMetrics; use crate::arrow::{arrow_primitive_to_literal, arrow_schema_to_schema}; use crate::delete_vector::DeleteVector; use crate::expr::Predicate::AlwaysTrue; @@ -77,13 +78,22 @@ enum ParsedDeleteFileContext { #[allow(unused_variables)] impl CachingDeleteFileLoader { pub(crate) fn new(file_io: FileIO, concurrency_limit_data_files: usize) -> Self { + let scan_metrics = ScanMetrics::new(); CachingDeleteFileLoader { - basic_delete_file_loader: BasicDeleteFileLoader::new(file_io), + basic_delete_file_loader: BasicDeleteFileLoader::new(file_io, scan_metrics), concurrency_limit_data_files, delete_filter: DeleteFilter::default(), } } + pub(crate) fn with_scan_metrics(mut self, scan_metrics: ScanMetrics) -> Self { + self.basic_delete_file_loader = BasicDeleteFileLoader::new( + self.basic_delete_file_loader.file_io().clone(), + scan_metrics, + ); + self + } + /// Initiates loading of all deletes for all the specified 
tasks /// /// Returned future completes once all positional deletes and delete vectors @@ -612,7 +622,8 @@ mod tests { let eq_delete_file_path = setup_write_equality_delete_file_1(table_location); - let basic_delete_file_loader = BasicDeleteFileLoader::new(file_io.clone()); + let basic_delete_file_loader = + BasicDeleteFileLoader::new(file_io.clone(), ScanMetrics::new()); let record_batch_stream = basic_delete_file_loader .parquet_to_batch_stream( &eq_delete_file_path, @@ -808,7 +819,8 @@ mod tests { }; let file_io = FileIO::new_with_fs(); - let basic_delete_file_loader = BasicDeleteFileLoader::new(file_io.clone()); + let basic_delete_file_loader = + BasicDeleteFileLoader::new(file_io.clone(), ScanMetrics::new()); let batch_stream = basic_delete_file_loader .parquet_to_batch_stream( @@ -994,7 +1006,8 @@ mod tests { writer.write(&record_batch).unwrap(); writer.close().unwrap(); - let basic_delete_file_loader = BasicDeleteFileLoader::new(file_io.clone()); + let basic_delete_file_loader = + BasicDeleteFileLoader::new(file_io.clone(), ScanMetrics::new()); let record_batch_stream = basic_delete_file_loader .parquet_to_batch_stream(&path, std::fs::metadata(&path).unwrap().len()) .await diff --git a/crates/iceberg/src/arrow/delete_file_loader.rs b/crates/iceberg/src/arrow/delete_file_loader.rs index 0be62ad496..134b029613 100644 --- a/crates/iceberg/src/arrow/delete_file_loader.rs +++ b/crates/iceberg/src/arrow/delete_file_loader.rs @@ -23,6 +23,7 @@ use parquet::arrow::ParquetRecordBatchStreamBuilder; use crate::arrow::ArrowReader; use crate::arrow::reader::ParquetReadOptions; use crate::arrow::record_batch_transformer::RecordBatchTransformerBuilder; +use crate::arrow::scan_metrics::ScanMetrics; use crate::io::FileIO; use crate::scan::{ArrowRecordBatchStream, FileScanTaskDeleteFile}; use crate::spec::{Schema, SchemaRef}; @@ -45,13 +46,22 @@ pub trait DeleteFileLoader { #[derive(Clone, Debug)] pub(crate) struct BasicDeleteFileLoader { file_io: FileIO, + scan_metrics: 
ScanMetrics, } #[allow(unused_variables)] impl BasicDeleteFileLoader { - pub fn new(file_io: FileIO) -> Self { - BasicDeleteFileLoader { file_io } + pub fn new(file_io: FileIO, scan_metrics: ScanMetrics) -> Self { + BasicDeleteFileLoader { + file_io, + scan_metrics, + } } + + pub(crate) fn file_io(&self) -> &FileIO { + &self.file_io + } + /// Loads a RecordBatchStream for a given datafile. pub(crate) async fn parquet_to_batch_stream( &self, @@ -69,6 +79,7 @@ impl BasicDeleteFileLoader { &self.file_io, file_size_in_bytes, parquet_read_options, + self.scan_metrics.bytes_read_counter(), ) .await?; @@ -137,7 +148,8 @@ mod tests { let table_location = tmp_dir.path(); let file_io = FileIO::new_with_fs(); - let delete_file_loader = BasicDeleteFileLoader::new(file_io.clone()); + let scan_metrics = ScanMetrics::new(); + let delete_file_loader = BasicDeleteFileLoader::new(file_io.clone(), scan_metrics); let file_scan_tasks = setup(table_location); diff --git a/crates/iceberg/src/arrow/incremental.rs b/crates/iceberg/src/arrow/incremental.rs index 8149c6d677..8d6269071f 100644 --- a/crates/iceberg/src/arrow/incremental.rs +++ b/crates/iceberg/src/arrow/incremental.rs @@ -25,6 +25,7 @@ use futures::stream::select; use futures::{Stream, StreamExt, TryStreamExt}; use crate::arrow::reader::{ParquetReadOptions, process_record_batch_stream}; +use crate::arrow::scan_metrics::ScanMetrics; use crate::arrow::{ArrowReader, StreamsInto}; use crate::delete_vector::DeleteVector; use crate::expr::Bind; @@ -49,18 +50,34 @@ pub enum IncrementalBatchType { Delete, } -/// The stream of incremental Arrow `RecordBatch`es with batch type. -pub type CombinedIncrementalBatchRecordStream = +/// Inner stream type for [`CombinedIncrementalScanResult`]. +pub type CombinedIncrementalBatchStream = Pin> + Send + 'static>>; -/// Stream type for obtaining a separate stream of appended and deleted record batches. 
-pub type UnzippedIncrementalBatchRecordStream = (ArrowRecordBatchStream, ArrowRecordBatchStream); +/// The stream of incremental Arrow `RecordBatch`es with batch type, together with scan metrics. +pub struct CombinedIncrementalScanResult { + /// Combined stream of appended and deleted record batches, each tagged with its type. + pub stream: CombinedIncrementalBatchStream, + /// Metrics collected during the incremental scan (e.g. bytes read from storage). + pub metrics: ScanMetrics, +} + +/// Separate streams for appended and deleted record batches, together with scan metrics. +pub struct UnzippedIncrementalScanResult { + /// Stream of appended record batches. + pub appends: ArrowRecordBatchStream, + /// Stream of deleted record batches. + pub deletes: ArrowRecordBatchStream, + /// Metrics collected during the incremental scan (e.g. bytes read from storage). + pub metrics: ScanMetrics, +} async fn process_incremental_append_task( task: AppendedFileScanTask, batch_size: Option, file_io: FileIO, parquet_read_options: ParquetReadOptions, + scan_metrics: ScanMetrics, ) -> Result { let AppendedFileScanTask { base, @@ -80,6 +97,8 @@ async fn process_incremental_append_task( ArrowReader::build_virtual_columns(&base.project_field_ids), batch_size, None, // name_mapping not yet supported in incremental scan + Some(Arc::clone(scan_metrics.bytes_read_counter())), + Some(&base.schema), ) .await?; @@ -184,6 +203,7 @@ async fn process_equality_delete_task( batch_size: Option, file_io: FileIO, parquet_read_options: ParquetReadOptions, + scan_metrics: ScanMetrics, ) -> Result { let file_path = task.data_file_path().to_string(); @@ -205,6 +225,8 @@ async fn process_equality_delete_task( vec![Arc::clone(row_pos_field())], batch_size, None, // name_mapping not yet supported in incremental scan + Some(Arc::clone(scan_metrics.bytes_read_counter())), + Some(&task.base.schema), ) .await?; @@ -281,28 +303,32 @@ async fn process_equality_delete_task( Ok(Box::pin(stream) as 
ArrowRecordBatchStream) } -impl StreamsInto - for IncrementalFileScanTaskStreams -{ +impl StreamsInto for IncrementalFileScanTaskStreams { /// Takes separate streams of appended and deleted file scan tasks and reads all the files. - /// Returns a combined stream of Arrow `RecordBatch`es containing the data from the files. - fn stream(self, reader: ArrowReader) -> Result { - let (appends, deletes) = - StreamsInto::::stream(self, reader)?; + /// Returns a [`CombinedIncrementalScanResult`] containing a combined stream of Arrow + /// `RecordBatch`es and scan metrics. + fn stream(self, reader: ArrowReader) -> Result { + let UnzippedIncrementalScanResult { + appends, + deletes, + metrics, + } = StreamsInto::::stream(self, reader)?; let left = appends.map(|res| res.map(|batch| (IncrementalBatchType::Append, batch))); let right = deletes.map(|res| res.map(|batch| (IncrementalBatchType::Delete, batch))); - Ok(Box::pin(select(left, right)) as CombinedIncrementalBatchRecordStream) + Ok(CombinedIncrementalScanResult { + stream: Box::pin(select(left, right)), + metrics, + }) } } -impl StreamsInto - for IncrementalFileScanTaskStreams -{ +impl StreamsInto for IncrementalFileScanTaskStreams { /// Takes separate streams of appended and deleted file scan tasks and reads all the files. - /// Returns two separate streams of Arrow `RecordBatch`es containing appended data and deleted records. - fn stream(self, reader: ArrowReader) -> Result { + /// Returns an [`UnzippedIncrementalScanResult`] containing separate streams of appended and + /// deleted record batches together with scan metrics. 
+ fn stream(self, reader: ArrowReader) -> Result { let (appends_tx, appends_rx) = channel::>(reader.concurrency_limit_data_files); let (deletes_tx, deletes_rx) = @@ -310,16 +336,19 @@ impl StreamsInto let batch_size = reader.batch_size; let parquet_read_options = reader.parquet_read_options; + let scan_metrics = ScanMetrics::new(); let (append_stream, delete_stream) = self; // Process append tasks let file_io_append = reader.file_io.clone(); + let scan_metrics_append = scan_metrics.clone(); spawn(async move { let _ = append_stream .try_for_each_concurrent(reader.concurrency_limit_data_files, |append_task| { let file_io = file_io_append.clone(); let appends_tx = appends_tx.clone(); + let scan_metrics = scan_metrics_append.clone(); async move { // Inner spawn: each file's IO runs on its own tokio task for true // parallelism. Awaiting it keeps the concurrency slot occupied until @@ -335,6 +364,7 @@ impl StreamsInto batch_size, file_io, append_read_options, + scan_metrics, ) .await; @@ -355,11 +385,13 @@ impl StreamsInto // Process delete tasks let file_io_delete = reader.file_io.clone(); + let scan_metrics_delete = scan_metrics.clone(); spawn(async move { let _ = delete_stream .try_for_each_concurrent(reader.concurrency_limit_data_files, |delete_task| { let deletes_tx = deletes_tx.clone(); let file_io = file_io_delete.clone(); + let scan_metrics = scan_metrics_delete.clone(); async move { // Inner spawn: same pattern as full-scan reader — spawn for parallelism, // await to keep the concurrency slot occupied until the task completes. 
@@ -406,6 +438,7 @@ impl StreamsInto batch_size, file_io.clone(), eq_read_options, + scan_metrics, ) .await; @@ -426,9 +459,10 @@ impl StreamsInto .await; }); - Ok(( - Box::pin(appends_rx) as ArrowRecordBatchStream, - Box::pin(deletes_rx) as ArrowRecordBatchStream, - )) + Ok(UnzippedIncrementalScanResult { + appends: Box::pin(appends_rx) as ArrowRecordBatchStream, + deletes: Box::pin(deletes_rx) as ArrowRecordBatchStream, + metrics: scan_metrics, + }) } } diff --git a/crates/iceberg/src/arrow/int96.rs b/crates/iceberg/src/arrow/int96.rs new file mode 100644 index 0000000000..63a7a30f1a --- /dev/null +++ b/crates/iceberg/src/arrow/int96.rs @@ -0,0 +1,578 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! INT96 timestamp coercion for Parquet files. + +use std::sync::Arc; + +use arrow_schema::{ + DataType, Field, Fields, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, TimeUnit, +}; +use parquet::arrow::PARQUET_FIELD_ID_META_KEY; + +use crate::arrow::schema::{ArrowSchemaVisitor, DEFAULT_MAP_FIELD_NAME, visit_schema}; +use crate::error::Result; +use crate::spec::{PrimitiveType, Schema, Type}; +use crate::{Error, ErrorKind}; + +/// Coerce Arrow schema types for INT96 columns to match the Iceberg table schema. 
+/// +/// arrow-rs defaults INT96 to `Timestamp(Nanosecond)`, which overflows i64 for dates outside +/// ~1677-2262. We use arrow-rs's schema hint mechanism to read INT96 at the resolution +/// specified by the Iceberg schema (`timestamp` → microsecond, `timestamp_ns` → nanosecond). +/// +/// Iceberg Java handles this differently: it bypasses parquet-mr with a custom column reader +/// (`GenericParquetReaders.TimestampInt96Reader`). We achieve the same result via schema hints. +/// +/// References: +/// - Iceberg spec primitive types: +/// - arrow-rs schema hint support: +pub(crate) fn coerce_int96_timestamps( + arrow_schema: &ArrowSchemaRef, + iceberg_schema: &Schema, +) -> Option> { + let mut visitor = Int96CoercionVisitor::new(iceberg_schema); + let coerced = visit_schema(arrow_schema, &mut visitor).ok()?; + if visitor.changed { + Some(Arc::new(coerced)) + } else { + None + } +} + +/// Visitor that coerces `Timestamp(Nanosecond)` Arrow fields to the resolution +/// indicated by the Iceberg schema. +struct Int96CoercionVisitor<'a> { + iceberg_schema: &'a Schema, + // TODO(#2310): use FieldRef (Arc) once ArrowSchemaVisitor passes FieldRef. + field_stack: Vec, + changed: bool, +} + +impl<'a> Int96CoercionVisitor<'a> { + fn new(iceberg_schema: &'a Schema) -> Self { + Self { + iceberg_schema, + field_stack: Vec::new(), + changed: false, + } + } + + /// Determine the target TimeUnit for a Timestamp(Nanosecond) field based on the + /// Iceberg schema. Falls back to microsecond when field IDs are unavailable, + /// matching Iceberg Java behavior. 
+ fn target_unit(&self, field: &Field) -> Option { + if !matches!( + field.data_type(), + DataType::Timestamp(TimeUnit::Nanosecond, _) + ) { + return None; + } + + let target = field + .metadata() + .get(PARQUET_FIELD_ID_META_KEY) + .and_then(|id_str| id_str.parse::().ok()) + .and_then(|field_id| self.iceberg_schema.field_by_id(field_id)) + .and_then(|f| match &*f.field_type { + Type::Primitive(PrimitiveType::Timestamp | PrimitiveType::Timestamptz) => { + Some(TimeUnit::Microsecond) + } + Type::Primitive(PrimitiveType::TimestampNs | PrimitiveType::TimestamptzNs) => { + Some(TimeUnit::Nanosecond) + } + _ => None, + }) + // Iceberg Java reads INT96 as microseconds by default + .unwrap_or(TimeUnit::Microsecond); + + if target == TimeUnit::Nanosecond { + None + } else { + Some(target) + } + } +} + +impl ArrowSchemaVisitor for Int96CoercionVisitor<'_> { + type T = Field; + type U = ArrowSchema; + + fn before_field(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn after_field(&mut self, _field: &Field) -> Result<()> { + self.field_stack.pop(); + Ok(()) + } + + fn before_list_element(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn after_list_element(&mut self, _field: &Field) -> Result<()> { + self.field_stack.pop(); + Ok(()) + } + + fn before_map_key(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn after_map_key(&mut self, _field: &Field) -> Result<()> { + self.field_stack.pop(); + Ok(()) + } + + fn before_map_value(&mut self, field: &Field) -> Result<()> { + self.field_stack.push(field.as_ref().clone()); + Ok(()) + } + + fn after_map_value(&mut self, _field: &Field) -> Result<()> { + self.field_stack.pop(); + Ok(()) + } + + fn schema(&mut self, schema: &ArrowSchema, values: Vec) -> Result { + Ok(ArrowSchema::new_with_metadata( + values, + schema.metadata().clone(), + )) + } + + fn 
r#struct(&mut self, _fields: &Fields, results: Vec) -> Result { + let field_info = self + .field_stack + .last() + .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in struct"))?; + Ok(Field::new( + field_info.name(), + DataType::Struct(Fields::from(results)), + field_info.is_nullable(), + ) + .with_metadata(field_info.metadata().clone())) + } + + fn list(&mut self, list: &DataType, value: Field) -> Result { + let field_info = self + .field_stack + .last() + .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in list"))?; + let list_type = match list { + DataType::List(_) => DataType::List(Arc::new(value)), + DataType::LargeList(_) => DataType::LargeList(Arc::new(value)), + DataType::FixedSizeList(_, size) => DataType::FixedSizeList(Arc::new(value), *size), + _ => { + return Err(Error::new( + ErrorKind::Unexpected, + format!("Expected list type, got {list}"), + )); + } + }; + Ok( + Field::new(field_info.name(), list_type, field_info.is_nullable()) + .with_metadata(field_info.metadata().clone()), + ) + } + + fn map(&mut self, map: &DataType, key_value: Field, value: Field) -> Result { + let field_info = self + .field_stack + .last() + .ok_or_else(|| Error::new(ErrorKind::Unexpected, "Field stack underflow in map"))?; + let sorted = match map { + DataType::Map(_, sorted) => *sorted, + _ => { + return Err(Error::new( + ErrorKind::Unexpected, + format!("Expected map type, got {map}"), + )); + } + }; + let struct_field = Field::new( + DEFAULT_MAP_FIELD_NAME, + DataType::Struct(Fields::from(vec![key_value, value])), + false, + ); + Ok(Field::new( + field_info.name(), + DataType::Map(Arc::new(struct_field), sorted), + field_info.is_nullable(), + ) + .with_metadata(field_info.metadata().clone())) + } + + fn primitive(&mut self, p: &DataType) -> Result { + let field_info = self.field_stack.last().ok_or_else(|| { + Error::new(ErrorKind::Unexpected, "Field stack underflow in primitive") + })?; + + if let Some(target_unit) = 
self.target_unit(field_info) { + let tz = match field_info.data_type() { + DataType::Timestamp(_, tz) => tz.clone(), + _ => None, + }; + self.changed = true; + Ok(Field::new( + field_info.name(), + DataType::Timestamp(target_unit, tz), + field_info.is_nullable(), + ) + .with_metadata(field_info.metadata().clone())) + } else { + Ok( + Field::new(field_info.name(), p.clone(), field_info.is_nullable()) + .with_metadata(field_info.metadata().clone()), + ) + } + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::sync::Arc; + + use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + use parquet::arrow::PARQUET_FIELD_ID_META_KEY; + + use super::coerce_int96_timestamps; + use crate::spec::{ListType, MapType, NestedField, PrimitiveType, Schema, StructType, Type}; + + fn iceberg_schema_with_timestamp() -> Schema { + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)).into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap() + } + + fn field_id_meta(id: i32) -> HashMap { + HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), id.to_string())]) + } + + #[test] + fn test_coerce_timestamp_ns_to_us() { + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(1)), + Field::new("id", DataType::Int32, false).with_metadata(field_id_meta(2)), + ])); + let iceberg = iceberg_schema_with_timestamp(); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + // Non-timestamp field unchanged + assert_eq!(coerced.field(1).data_type(), &DataType::Int32); + } + + #[test] + fn test_coerce_timestamptz_ns_to_us() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + 
NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamptz)).into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + true, + ) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, Some("UTC".into())) + ); + } + + #[test] + fn test_no_coercion_when_iceberg_is_timestamp_ns() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::TimestampNs)).into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(1)), + ])); + + assert!(coerce_int96_timestamps(&arrow_schema, &iceberg).is_none()); + } + + #[test] + fn test_no_coercion_when_iceberg_is_timestamptz_ns() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::TimestamptzNs)) + .into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, Some("UTC".into())), + true, + ) + .with_metadata(field_id_meta(1)), + ])); + + assert!(coerce_int96_timestamps(&arrow_schema, &iceberg).is_none()); + } + + #[test] + fn test_no_coercion_when_already_microsecond() { + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Microsecond, None), true) + .with_metadata(field_id_meta(1)), + Field::new("id", DataType::Int32, false).with_metadata(field_id_meta(2)), + ])); + let iceberg = iceberg_schema_with_timestamp(); + + assert!(coerce_int96_timestamps(&arrow_schema, &iceberg).is_none()); + } + + // 
Without field IDs, the visitor can't look up the Iceberg type and falls back + // to microsecond to match Iceberg Java behavior. + #[test] + fn test_defaults_to_us_without_field_ids() { + let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "ts", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + )])); + let iceberg = iceberg_schema_with_timestamp(); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + // Field ID exists but points to a non-timestamp Iceberg type. The field_by_id + // lookup succeeds but the match arm returns None, so unwrap_or falls back to + // microsecond. + #[test] + fn test_defaults_to_us_when_iceberg_type_is_not_timestamp() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!( + coerced.field(0).data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + #[test] + fn test_coerce_preserves_field_metadata() { + let mut meta = field_id_meta(1); + meta.insert("custom_key".to_string(), "custom_value".to_string()); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(meta.clone()), + ])); + let iceberg = iceberg_schema_with_timestamp(); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + assert_eq!(coerced.field(0).metadata(), &meta); + } + + #[test] + fn test_coerce_timestamp_in_struct() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + 
NestedField::required( + 1, + "data", + Type::Struct(StructType::new(vec![ + NestedField::optional(2, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "data", + DataType::Struct( + vec![ + Field::new("ts", DataType::Timestamp(TimeUnit::Nanosecond, None), true) + .with_metadata(field_id_meta(2)), + ] + .into(), + ), + false, + ) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + let inner = match coerced.field(0).data_type() { + DataType::Struct(fields) => fields, + other => panic!("Expected Struct, got {other}"), + }; + assert_eq!( + inner[0].data_type(), + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + #[test] + fn test_coerce_timestamp_in_list() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "timestamps", + Type::List(ListType { + element_field: NestedField::optional( + 2, + "element", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(); + + let element_field = Field::new( + "element", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ) + .with_metadata(field_id_meta(2)); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("timestamps", DataType::List(Arc::new(element_field)), true) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + let element_dt = match coerced.field(0).data_type() { + DataType::List(f) => f.data_type(), + other => panic!("Expected List, got {other}"), + }; + assert_eq!( + element_dt, + &DataType::Timestamp(TimeUnit::Microsecond, None) + ); + } + + #[test] + fn test_coerce_timestamp_in_map_value() { + let iceberg = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "ts_map", + 
Type::Map(MapType { + key_field: NestedField::required( + 2, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::optional( + 3, + "value", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(); + + let key_field = Field::new("key", DataType::Utf8, false).with_metadata(field_id_meta(2)); + let value_field = Field::new( + "value", + DataType::Timestamp(TimeUnit::Nanosecond, None), + true, + ) + .with_metadata(field_id_meta(3)); + let entries_field = Field::new( + "key_value", + DataType::Struct(vec![key_field, value_field].into()), + false, + ); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new( + "ts_map", + DataType::Map(Arc::new(entries_field), false), + true, + ) + .with_metadata(field_id_meta(1)), + ])); + + let coerced = coerce_int96_timestamps(&arrow_schema, &iceberg).unwrap(); + let value_dt = match coerced.field(0).data_type() { + DataType::Map(entries, _) => match entries.data_type() { + DataType::Struct(fields) => fields[1].data_type().clone(), + other => panic!("Expected Struct inside Map, got {other}"), + }, + other => panic!("Expected Map, got {other}"), + }; + assert_eq!(value_dt, DataType::Timestamp(TimeUnit::Microsecond, None)); + } +} diff --git a/crates/iceberg/src/arrow/mod.rs b/crates/iceberg/src/arrow/mod.rs index 15b386109d..089d01cad0 100644 --- a/crates/iceberg/src/arrow/mod.rs +++ b/crates/iceberg/src/arrow/mod.rs @@ -27,15 +27,18 @@ pub(crate) mod caching_delete_file_loader; pub mod delete_file_loader; pub(crate) mod delete_filter; +mod int96; mod reader; /// RecordBatch projection utilities pub mod record_batch_projector; pub(crate) mod record_batch_transformer; +mod scan_metrics; mod value; mod incremental; pub use incremental::*; pub use reader::*; +pub use scan_metrics::{ScanMetrics, ScanResult}; pub use value::*; // Re-export delete file constants for convenience diff --git a/crates/iceberg/src/arrow/reader.rs 
b/crates/iceberg/src/arrow/reader.rs deleted file mode 100644 index 3696e88584..0000000000 --- a/crates/iceberg/src/arrow/reader.rs +++ /dev/null @@ -1,5037 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! Parquet file data reader - -use std::collections::{HashMap, HashSet}; -use std::ops::Range; -use std::str::FromStr; -use std::sync::{Arc, Mutex}; - -use arrow_arith::boolean::{and, and_kleene, is_not_null, is_null, not, or, or_kleene}; -use arrow_array::{Array, ArrayRef, BooleanArray, Datum as ArrowDatum, RecordBatch, Scalar}; -use arrow_cast::cast::cast; -use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq}; -use arrow_schema::{ - ArrowError, DataType, FieldRef, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef, -}; -use arrow_string::like::starts_with; -use bytes::Bytes; -use fnv::FnvHashSet; -use futures::channel::mpsc::channel; -use futures::future::BoxFuture; -use futures::{FutureExt, SinkExt, Stream, StreamExt, TryFutureExt, TryStreamExt}; -use parquet::arrow::arrow_reader::{ - ArrowPredicateFn, ArrowReaderMetadata, ArrowReaderOptions, RowFilter, RowSelection, RowSelector, -}; -use parquet::arrow::async_reader::AsyncFileReader; -use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, 
ParquetRecordBatchStreamBuilder, ProjectionMask}; -use parquet::file::metadata::{ - PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData, -}; -use parquet::schema::types::{SchemaDescriptor, Type as ParquetType}; -use typed_builder::TypedBuilder; - -use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; -use crate::arrow::record_batch_transformer::{ - RecordBatchTransformer, RecordBatchTransformerBuilder, -}; -use crate::arrow::{arrow_schema_to_schema, get_arrow_datum}; -use crate::delete_vector::DeleteVector; -use crate::error::Result; -use crate::expr::visitors::bound_predicate_visitor::{BoundPredicateVisitor, visit}; -use crate::expr::visitors::page_index_evaluator::PageIndexEvaluator; -use crate::expr::visitors::row_group_metrics_evaluator::RowGroupMetricsEvaluator; -use crate::expr::{BoundPredicate, BoundReference}; -use crate::io::{FileIO, FileMetadata, FileRead}; -use crate::metadata_columns::{ - RESERVED_FIELD_ID_FILE, RESERVED_FIELD_ID_POS, is_metadata_field, row_pos_field, -}; -use crate::runtime::spawn; -use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream}; -use crate::spec::{ - Datum, NameMapping, NestedField, PartitionSpec, PrimitiveType, Schema, SchemaRef, Struct, Type, -}; -use crate::util::available_parallelism; -use crate::{Error, ErrorKind}; - -/// Default gap between byte ranges below which they are coalesced into a -/// single request. Matches object_store's `OBJECT_STORE_COALESCE_DEFAULT`. -const DEFAULT_RANGE_COALESCE_BYTES: u64 = 1024 * 1024; - -/// Default maximum number of coalesced byte ranges fetched concurrently. -/// Matches object_store's `OBJECT_STORE_COALESCE_PARALLEL`. -const DEFAULT_RANGE_FETCH_CONCURRENCY: usize = 10; - -/// Default number of bytes to prefetch when parsing Parquet footer metadata. -/// Matches DataFusion's default `ParquetOptions::metadata_size_hint`. -const DEFAULT_METADATA_SIZE_HINT: usize = 512 * 1024; - -/// Options for tuning Parquet file I/O. 
-#[derive(Clone, Copy, Debug, TypedBuilder)] -#[builder(field_defaults(setter(prefix = "with_")))] -pub(crate) struct ParquetReadOptions { - /// Number of bytes to prefetch for parsing the Parquet metadata. - /// - /// This hint can help reduce the number of fetch requests. For more details see the - /// [ParquetMetaDataReader documentation](https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html#method.with_prefetch_hint). - /// - /// Defaults to 512 KiB, matching DataFusion's default `ParquetOptions::metadata_size_hint`. - #[builder(default = Some(DEFAULT_METADATA_SIZE_HINT))] - pub(crate) metadata_size_hint: Option, - /// Gap threshold for merging nearby byte ranges into a single request. - /// Ranges with gaps smaller than this value will be coalesced. - /// - /// Defaults to 1 MiB, matching object_store's `OBJECT_STORE_COALESCE_DEFAULT`. - #[builder(default = DEFAULT_RANGE_COALESCE_BYTES)] - pub(crate) range_coalesce_bytes: u64, - /// Maximum number of merged byte ranges to fetch concurrently. - /// - /// Defaults to 10, matching object_store's `OBJECT_STORE_COALESCE_PARALLEL`. - #[builder(default = DEFAULT_RANGE_FETCH_CONCURRENCY)] - pub(crate) range_fetch_concurrency: usize, - /// Whether to preload the column index when reading Parquet metadata. - #[builder(default = true)] - pub(crate) preload_column_index: bool, - /// Whether to preload the offset index when reading Parquet metadata. - #[builder(default = true)] - pub(crate) preload_offset_index: bool, - /// Whether to preload the page index when reading Parquet metadata. 
- #[builder(default = false)] - pub(crate) preload_page_index: bool, -} - -impl ParquetReadOptions { - pub(crate) fn metadata_size_hint(&self) -> Option { - self.metadata_size_hint - } - - pub(crate) fn range_coalesce_bytes(&self) -> u64 { - self.range_coalesce_bytes - } - - pub(crate) fn range_fetch_concurrency(&self) -> usize { - self.range_fetch_concurrency - } - - pub(crate) fn preload_column_index(&self) -> bool { - self.preload_column_index - } - - pub(crate) fn preload_offset_index(&self) -> bool { - self.preload_offset_index - } - - pub(crate) fn preload_page_index(&self) -> bool { - self.preload_page_index - } -} - -/// Builder to create ArrowReader -pub struct ArrowReaderBuilder { - batch_size: Option, - file_io: FileIO, - concurrency_limit_data_files: usize, - row_group_filtering_enabled: bool, - row_selection_enabled: bool, - parquet_read_options: ParquetReadOptions, -} - -impl ArrowReaderBuilder { - /// Create a new ArrowReaderBuilder - pub fn new(file_io: FileIO) -> Self { - let num_cpus = available_parallelism().get(); - - ArrowReaderBuilder { - batch_size: None, - file_io, - concurrency_limit_data_files: num_cpus, - row_group_filtering_enabled: true, - row_selection_enabled: false, - parquet_read_options: ParquetReadOptions::builder().build(), - } - } - - /// Sets the max number of in flight data files that are being fetched - pub fn with_data_file_concurrency_limit(mut self, val: usize) -> Self { - self.concurrency_limit_data_files = val; - self - } - - /// Sets the desired size of batches in the response - /// to something other than the default - pub fn with_batch_size(mut self, batch_size: usize) -> Self { - self.batch_size = Some(batch_size); - self - } - - /// Determines whether to enable row group filtering. - pub fn with_row_group_filtering_enabled(mut self, row_group_filtering_enabled: bool) -> Self { - self.row_group_filtering_enabled = row_group_filtering_enabled; - self - } - - /// Determines whether to enable row selection. 
- pub fn with_row_selection_enabled(mut self, row_selection_enabled: bool) -> Self { - self.row_selection_enabled = row_selection_enabled; - self - } - - /// Provide a hint as to the number of bytes to prefetch for parsing the Parquet metadata - /// - /// This hint can help reduce the number of fetch requests. For more details see the - /// [ParquetMetaDataReader documentation](https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html#method.with_prefetch_hint). - pub fn with_metadata_size_hint(mut self, metadata_size_hint: usize) -> Self { - self.parquet_read_options.metadata_size_hint = Some(metadata_size_hint); - self - } - - /// Sets the gap threshold for merging nearby byte ranges into a single request. - /// Ranges with gaps smaller than this value will be coalesced. - /// - /// Defaults to 1 MiB, matching object_store's OBJECT_STORE_COALESCE_DEFAULT. - pub fn with_range_coalesce_bytes(mut self, range_coalesce_bytes: u64) -> Self { - self.parquet_read_options.range_coalesce_bytes = range_coalesce_bytes; - self - } - - /// Sets the maximum number of merged byte ranges to fetch concurrently. - /// - /// Defaults to 10, matching object_store's OBJECT_STORE_COALESCE_PARALLEL. - pub fn with_range_fetch_concurrency(mut self, range_fetch_concurrency: usize) -> Self { - self.parquet_read_options.range_fetch_concurrency = range_fetch_concurrency; - self - } - - /// Build the ArrowReader. 
- pub fn build(self) -> ArrowReader { - ArrowReader { - batch_size: self.batch_size, - file_io: self.file_io.clone(), - delete_file_loader: CachingDeleteFileLoader::new( - self.file_io.clone(), - self.concurrency_limit_data_files, - ), - concurrency_limit_data_files: self.concurrency_limit_data_files, - row_group_filtering_enabled: self.row_group_filtering_enabled, - row_selection_enabled: self.row_selection_enabled, - parquet_read_options: self.parquet_read_options, - } - } -} - -/// Reads data from Parquet files -#[derive(Clone)] -pub struct ArrowReader { - pub(crate) batch_size: Option, - pub(crate) file_io: FileIO, - delete_file_loader: CachingDeleteFileLoader, - - /// the maximum number of data files that can be fetched at the same time - pub(crate) concurrency_limit_data_files: usize, - - pub(crate) row_group_filtering_enabled: bool, - pub(crate) row_selection_enabled: bool, - pub(crate) parquet_read_options: ParquetReadOptions, -} - -/// Trait indicating that the implementing type streams into a stream of type `S` using -/// a reader of type `R`. -pub trait StreamsInto { - /// Stream from the reader and produce a stream of type `S`. - fn stream(self, reader: R) -> Result; -} - -/// Helper function to process a stream of record batches and send through a channel. -/// Handles the Result pattern, so callers don't need to match on the stream result. -/// This pattern is used in both reader.rs and incremental.rs. 
-pub(crate) async fn process_record_batch_stream( - record_batch_stream: Result, - mut tx: T, - error_context: &str, -) where - E: std::error::Error + Send + Sync + 'static, - S: Stream> + Send + Unpin + 'static, - T: SinkExt> + Unpin + Send + 'static, -{ - match record_batch_stream { - Ok(mut stream) => { - while let Some(batch_result) = stream.next().await { - let batch = batch_result - .map_err(|e| Error::new(ErrorKind::Unexpected, error_context).with_source(e)); - let _ = tx.send(batch).await; - } - } - Err(e) => { - let _ = tx.send(Err(e)).await; - } - } -} - -impl ArrowReader { - /// Take a stream of FileScanTasks and reads all the files. - /// Returns a stream of Arrow RecordBatches containing the data from the files. - /// - /// This implementation provides both file-level and batch-level parallelism: - /// - Multiple files are processed in parallel (IO-heavy operations) - /// - Multiple batches are processed in parallel across all files (CPU-heavy operations) - pub fn read(self, tasks: FileScanTaskStream) -> Result { - let file_io = self.file_io; - let batch_size = self.batch_size; - let concurrency_limit_data_files = self.concurrency_limit_data_files; - let row_group_filtering_enabled = self.row_group_filtering_enabled; - let row_selection_enabled = self.row_selection_enabled; - let parquet_read_options = self.parquet_read_options; - - // Fast-path for single concurrency to avoid overhead of try_flatten_unordered - let stream: ArrowRecordBatchStream = if concurrency_limit_data_files == 1 { - Box::pin( - tasks - .and_then(move |task| { - let file_io = file_io.clone(); - - Self::process_file_scan_task( - task, - batch_size, - file_io, - self.delete_file_loader.clone(), - row_group_filtering_enabled, - row_selection_enabled, - parquet_read_options, - ) - }) - .map_err(|err| { - Error::new(ErrorKind::Unexpected, "file scan task generate failed") - .with_source(err) - }) - .try_flatten(), - ) - } else { - // Multi-concurrency path: spawn each file's IO-heavy 
processing as an independent - // tokio task for true parallelism, streaming results through a channel. - let (tx, rx) = channel::>(concurrency_limit_data_files); - let delete_file_loader = self.delete_file_loader; - - // Outer spawn: runs the task coordination loop without blocking the caller. - spawn(async move { - let _ = tasks - .try_for_each_concurrent(concurrency_limit_data_files, |task| { - let file_io = file_io.clone(); - let delete_file_loader = delete_file_loader.clone(); - let tx = tx.clone(); - - async move { - // Inner spawn: each file's IO operations run on their own tokio task. - spawn(async move { - let record_batch_stream = Self::process_file_scan_task( - task, - batch_size, - file_io, - delete_file_loader, - row_group_filtering_enabled, - row_selection_enabled, - parquet_read_options, - ) - .await; - - process_record_batch_stream( - record_batch_stream, - tx, - "failed to read record batch", - ) - .await; - }) - .await; - - Ok(()) - } - }) - .await; - }); - - Box::pin(rx) as ArrowRecordBatchStream - }; - - Ok(stream) - } - - async fn process_file_scan_task( - task: FileScanTask, - batch_size: Option, - file_io: FileIO, - delete_file_loader: CachingDeleteFileLoader, - row_group_filtering_enabled: bool, - row_selection_enabled: bool, - parquet_read_options: ParquetReadOptions, - ) -> Result { - let should_load_page_index = - (row_selection_enabled && task.predicate.is_some()) || !task.deletes.is_empty(); - let mut parquet_read_options = parquet_read_options; - parquet_read_options.preload_page_index = should_load_page_index; - - // Open the Parquet file and load delete files concurrently. 
- let delete_filter_rx = - delete_file_loader.load_deletes(&task.deletes, Arc::clone(&task.schema)); - - let (parquet_result, delete_filter) = futures::join!( - Self::open_parquet_stream_builder( - &task.data_file_path, - task.file_size_in_bytes, - file_io, - parquet_read_options, - Self::build_virtual_columns(task.project_field_ids()), - batch_size, - task.name_mapping.as_deref(), - ), - async { delete_filter_rx.await.unwrap() }, - ); - let (builder, has_missing_field_ids) = parquet_result?; - let delete_filter = delete_filter?; - let delete_predicate = delete_filter.build_equality_delete_predicate(&task).await?; - - // In addition to the optional predicate supplied in the `FileScanTask`, - // we also have an optional predicate resulting from equality delete files. - // If both are present, we logical-AND them together to form a single filter - // predicate that we can pass to the `RecordBatchStreamBuilder`. - let final_predicate = match (&task.predicate, delete_predicate) { - (None, None) => None, - (Some(predicate), None) => Some(predicate.clone()), - (None, Some(ref predicate)) => Some(predicate.clone()), - (Some(filter_predicate), Some(delete_predicate)) => { - Some(filter_predicate.clone().and(delete_predicate)) - } - }; - - let positional_delete_indexes = delete_filter.get_delete_vector(&task); - - let builder = Self::apply_parquet_filters( - builder, - task.start, - task.length, - &task.schema, - final_predicate.as_ref(), - positional_delete_indexes.as_deref(), - row_group_filtering_enabled, - row_selection_enabled, - false, // use_predicate_projection: projection is handled by build_projected_record_batch_stream - has_missing_field_ids, - )?; - - Self::build_projected_record_batch_stream( - builder, - task.project_field_ids(), - task.schema_ref(), - has_missing_field_ids, - &task.data_file_path, - task.partition_spec.clone(), - task.partition.clone(), - ) - } - - /// Opens a Parquet file and loads its metadata, returning both the reader and metadata. 
- /// The reader can be reused to build a `ParquetRecordBatchStreamBuilder` without - /// reopening the file. - pub(crate) async fn open_parquet_file( - data_file_path: &str, - file_io: &FileIO, - file_size_in_bytes: u64, - parquet_read_options: ParquetReadOptions, - ) -> Result<(ArrowFileReader, ArrowReaderMetadata)> { - let parquet_file = file_io.new_input(data_file_path)?; - let parquet_reader = parquet_file.reader().await?; - let mut reader = ArrowFileReader::new( - FileMetadata { - size: file_size_in_bytes, - }, - parquet_reader, - ) - .with_parquet_read_options(parquet_read_options); - - let arrow_metadata = ArrowReaderMetadata::load_async(&mut reader, Default::default()) - .await - .map_err(|e| { - Error::new(ErrorKind::Unexpected, "Failed to load Parquet metadata").with_source(e) - })?; - - Ok((reader, arrow_metadata)) - } - - /// Opens a Parquet file, resolves its schema (name-mapping / field-ID fallback), and - /// applies the batch size. Returns `(builder, has_missing_field_ids)`. - /// - /// This is the async phase shared by every reading path. Callers that have background - /// work to overlap (e.g. delete-file loading) can run this concurrently with that work - /// using [`futures::join!`], then pass the result to [`Self::apply_parquet_filters`]. 
- /// - /// Implements the three-branch schema resolution strategy matching Java's `ReadConf` constructor: - /// - Branch 1: file has embedded field IDs → trust them, use as-is - /// - Branch 2: name_mapping present → apply name mapping to assign correct Iceberg field IDs - /// - Branch 3: no name mapping → assign fallback position-based IDs - #[allow(clippy::too_many_arguments)] - pub(crate) async fn open_parquet_stream_builder( - data_file_path: &str, - file_size_in_bytes: u64, - file_io: FileIO, - parquet_read_options: ParquetReadOptions, - virtual_columns: Vec>, - batch_size: Option, - name_mapping: Option<&NameMapping>, - ) -> Result<(ParquetRecordBatchStreamBuilder, bool)> { - let (file_reader, arrow_metadata) = Self::open_parquet_file( - data_file_path, - &file_io, - file_size_in_bytes, - parquet_read_options, - ) - .await?; - - // Check if Parquet file has embedded field IDs. - // Corresponds to Java's ParquetSchemaUtil.hasIds() - let has_missing_field_ids = arrow_metadata - .schema() - .fields() - .iter() - .next() - .is_some_and(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()); - - // Three-branch schema resolution strategy matching Java's ReadConf constructor. - // - // When Parquet files lack field IDs (e.g., Hive/Spark migrations via add_files), - // we must assign field IDs BEFORE reading data to enable correct column projection. - let arrow_metadata = if has_missing_field_ids { - // Parquet file lacks field IDs - must assign them before reading. - let arrow_schema = if let Some(nm) = name_mapping { - // Branch 2: Apply name mapping to assign correct Iceberg field IDs. - // Corresponds to Java's ParquetSchemaUtil.applyNameMapping() - apply_name_mapping_to_arrow_schema(Arc::clone(arrow_metadata.schema()), nm)? - } else { - // Branch 3: No name mapping - use position-based fallback IDs. 
- // Corresponds to Java's ParquetSchemaUtil.addFallbackIds() - add_fallback_field_ids_to_arrow_schema(arrow_metadata.schema()) - }; - let mut options = ArrowReaderOptions::new().with_schema(arrow_schema); - if !virtual_columns.is_empty() { - options = options.with_virtual_columns(virtual_columns)?; - } - ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options).map_err( - |e| { - Error::new( - ErrorKind::Unexpected, - "Failed to create ArrowReaderMetadata with field ID schema", - ) - .with_source(e) - }, - )? - } else { - // Branch 1: File has embedded field IDs - trust them. - if !virtual_columns.is_empty() { - let options = ArrowReaderOptions::new().with_virtual_columns(virtual_columns)?; - ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options) - .map_err(|e| { - Error::new( - ErrorKind::Unexpected, - "Failed to create ArrowReaderMetadata with virtual columns", - ) - .with_source(e) - })? - } else { - arrow_metadata - } - }; - - let mut builder = - ParquetRecordBatchStreamBuilder::new_with_metadata(file_reader, arrow_metadata); - - if let Some(batch_size) = batch_size { - builder = builder.with_batch_size(batch_size); - } - - Ok((builder, has_missing_field_ids)) - } - - /// Applies all row-level and row-group-level filters to a builder returned by - /// [`Self::open_parquet_stream_builder`]. - /// - /// Handles byte-range row group pruning, predicate row filtering (with optional - /// projection), and positional-delete row selection. 
- #[allow(clippy::too_many_arguments)] - pub(crate) fn apply_parquet_filters( - mut builder: ParquetRecordBatchStreamBuilder, - start: u64, - length: u64, - schema: &Schema, - bound_predicate: Option<&BoundPredicate>, - positional_deletes: Option<&Mutex>, - row_group_filtering_enabled: bool, - row_selection_enabled: bool, - use_predicate_projection: bool, - has_missing_field_ids: bool, - ) -> Result> { - // There are three possible sources for potential lists of selected RowGroup indices, - // and two for `RowSelection`s. - // Selected RowGroup index lists can come from three sources: - // * When task.start and task.length specify a byte range (file splitting); - // * When there are equality delete files that are applicable; - // * When there is a scan predicate and row_group_filtering_enabled = true. - // `RowSelection`s can be created in either or both of the following cases: - // * When there are positional delete files that are applicable; - // * When there is a scan predicate and row_selection_enabled = true - // Note that row group filtering from predicates only happens when - // there is a scan predicate AND row_group_filtering_enabled = true, - // but we perform row selection filtering if there are applicable - // equality delete files OR (there is a scan predicate AND row_selection_enabled), - // since the only implemented method of applying positional deletes is - // by using a `RowSelection`. 
- let mut selected_row_group_indices = None; - let mut row_selection = None; - - if start != 0 || length != 0 { - selected_row_group_indices = Some(Self::filter_row_groups_by_byte_range( - builder.metadata(), - start, - length, - )?); - } - - if let Some(predicate) = bound_predicate { - let (iceberg_field_ids, field_id_map) = - Self::build_field_id_set_and_map(builder.parquet_schema(), predicate)?; - - if use_predicate_projection { - let predicate_field_ids: Vec = iceberg_field_ids.iter().copied().collect(); - builder = Self::apply_projection( - builder, - &predicate_field_ids, - schema, - has_missing_field_ids, - )?; - } - - let row_filter = Self::get_row_filter( - predicate, - builder.parquet_schema(), - &iceberg_field_ids, - &field_id_map, - )?; - builder = builder.with_row_filter(row_filter); - - if row_group_filtering_enabled { - let predicate_filtered = Self::get_selected_row_group_indices( - predicate, - builder.metadata(), - &field_id_map, - schema, - )?; - selected_row_group_indices = Some(match selected_row_group_indices.take() { - Some(existing) => existing - .into_iter() - .filter(|idx| predicate_filtered.contains(idx)) - .collect(), - None => predicate_filtered, - }); - } - - if row_selection_enabled { - row_selection = Some(Self::get_row_selection_for_filter_predicate( - predicate, - builder.metadata(), - &selected_row_group_indices, - &field_id_map, - schema, - )?); - } - } - - if let Some(positional_delete_indexes) = positional_deletes { - let delete_row_selection = { - let guard = positional_delete_indexes.lock().unwrap(); - Self::build_deletes_row_selection( - builder.metadata().row_groups(), - &selected_row_group_indices, - &guard, - ) - }?; - row_selection = Some(match row_selection.take() { - None => delete_row_selection, - Some(prev) => prev.intersection(&delete_row_selection), - }); - } - - if let Some(sel) = row_selection { - builder = builder.with_row_selection(sel); - } - if let Some(groups) = selected_row_group_indices { - builder = 
builder.with_row_groups(groups); - } - - Ok(builder) - } - - /// computes a `RowSelection` from positional delete indices. - /// - /// Using the Parquet page index, we build a `RowSelection` that rejects rows that are indicated - /// as having been deleted by a positional delete, taking into account any row groups that have - /// been skipped entirely by the filter predicate - fn build_deletes_row_selection( - row_group_metadata_list: &[RowGroupMetaData], - selected_row_groups: &Option>, - positional_deletes: &DeleteVector, - ) -> Result { - let mut results: Vec = Vec::new(); - let mut selected_row_groups_idx = 0; - let mut current_row_group_base_idx: u64 = 0; - let mut delete_vector_iter = positional_deletes.iter(); - let mut next_deleted_row_idx_opt = delete_vector_iter.next(); - - for (idx, row_group_metadata) in row_group_metadata_list.iter().enumerate() { - let row_group_num_rows = row_group_metadata.num_rows() as u64; - let next_row_group_base_idx = current_row_group_base_idx + row_group_num_rows; - - // if row group selection is enabled, - if let Some(selected_row_groups) = selected_row_groups { - // if we've consumed all the selected row groups, we're done - if selected_row_groups_idx == selected_row_groups.len() { - break; - } - - if idx == selected_row_groups[selected_row_groups_idx] { - // we're in a selected row group. Increment selected_row_groups_idx - // so that next time around the for loop we're looking for the next - // selected row group - selected_row_groups_idx += 1; - } else { - // Advance iterator past all deletes in the skipped row group. - // advance_to() positions the iterator to the first delete >= next_row_group_base_idx. - // However, if our cached next_deleted_row_idx_opt is in the skipped range, - // we need to call next() to update the cache with the newly positioned value. 
- delete_vector_iter.advance_to(next_row_group_base_idx); - // Only update the cache if the cached value is stale (in the skipped range) - if let Some(cached_idx) = next_deleted_row_idx_opt - && cached_idx < next_row_group_base_idx - { - next_deleted_row_idx_opt = delete_vector_iter.next(); - } - - // still increment the current page base index but then skip to the next row group - // in the file - current_row_group_base_idx += row_group_num_rows; - continue; - } - } - - let mut next_deleted_row_idx = match next_deleted_row_idx_opt { - Some(next_deleted_row_idx) => { - // if the index of the next deleted row is beyond this row group, add a selection for - // the remainder of this row group and skip to the next row group - if next_deleted_row_idx >= next_row_group_base_idx { - results.push(RowSelector::select(row_group_num_rows as usize)); - current_row_group_base_idx += row_group_num_rows; - continue; - } - - next_deleted_row_idx - } - - // If there are no more pos deletes, add a selector for the entirety of this row group. 
- _ => { - results.push(RowSelector::select(row_group_num_rows as usize)); - current_row_group_base_idx += row_group_num_rows; - continue; - } - }; - - let mut current_idx = current_row_group_base_idx; - 'chunks: while next_deleted_row_idx < next_row_group_base_idx { - // `select` all rows that precede the next delete index - if current_idx < next_deleted_row_idx { - let run_length = next_deleted_row_idx - current_idx; - results.push(RowSelector::select(run_length as usize)); - current_idx += run_length; - } - - // `skip` all consecutive deleted rows in the current row group - let mut run_length = 0; - while next_deleted_row_idx == current_idx - && next_deleted_row_idx < next_row_group_base_idx - { - run_length += 1; - current_idx += 1; - - next_deleted_row_idx_opt = delete_vector_iter.next(); - next_deleted_row_idx = match next_deleted_row_idx_opt { - Some(next_deleted_row_idx) => next_deleted_row_idx, - _ => { - // We've processed the final positional delete. - // Conclude the skip and then break so that we select the remaining - // rows in the row group and move on to the next row group - results.push(RowSelector::skip(run_length)); - break 'chunks; - } - }; - } - if run_length > 0 { - results.push(RowSelector::skip(run_length)); - } - } - - if current_idx < next_row_group_base_idx { - results.push(RowSelector::select( - (next_row_group_base_idx - current_idx) as usize, - )); - } - - current_row_group_base_idx += row_group_num_rows; - } - - Ok(results.into()) - } - - fn build_field_id_set_and_map( - parquet_schema: &SchemaDescriptor, - predicate: &BoundPredicate, - ) -> Result<(HashSet, HashMap)> { - // Collects all Iceberg field IDs referenced in the filter predicate - let mut collector = CollectFieldIdVisitor { - field_ids: HashSet::default(), - }; - visit(&mut collector, predicate)?; - - let iceberg_field_ids = collector.field_ids(); - - // Without embedded field IDs, we fall back to position-based mapping for compatibility - let field_id_map = match 
build_field_id_map(parquet_schema)? { - Some(map) => map, - None => build_fallback_field_id_map(parquet_schema), - }; - - Ok((iceberg_field_ids, field_id_map)) - } - - /// Recursively extract leaf field IDs because Parquet projection works at the leaf column level. - /// Nested types (struct/list/map) are flattened in Parquet's columnar format. - fn include_leaf_field_id(field: &NestedField, field_ids: &mut Vec) { - match field.field_type.as_ref() { - Type::Primitive(_) => { - field_ids.push(field.id); - } - Type::Struct(struct_type) => { - for nested_field in struct_type.fields() { - Self::include_leaf_field_id(nested_field, field_ids); - } - } - Type::List(list_type) => { - Self::include_leaf_field_id(&list_type.element_field, field_ids); - } - Type::Map(map_type) => { - Self::include_leaf_field_id(&map_type.key_field, field_ids); - Self::include_leaf_field_id(&map_type.value_field, field_ids); - } - } - } - - fn get_arrow_projection_mask( - field_ids: &[i32], - iceberg_schema_of_task: &Schema, - parquet_schema: &SchemaDescriptor, - arrow_schema: &ArrowSchemaRef, - use_fallback: bool, // Whether file lacks embedded field IDs (e.g., migrated from Hive/Spark) - ) -> Result { - fn type_promotion_is_valid( - file_type: Option<&PrimitiveType>, - projected_type: Option<&PrimitiveType>, - ) -> bool { - match (file_type, projected_type) { - (Some(lhs), Some(rhs)) if lhs == rhs => true, - (Some(PrimitiveType::Int), Some(PrimitiveType::Long)) => true, - (Some(PrimitiveType::Float), Some(PrimitiveType::Double)) => true, - ( - Some(PrimitiveType::Decimal { - precision: file_precision, - scale: file_scale, - }), - Some(PrimitiveType::Decimal { - precision: requested_precision, - scale: requested_scale, - }), - ) if requested_precision >= file_precision && file_scale == requested_scale => true, - // Uuid will be store as Fixed(16) in parquet file, so the read back type will be Fixed(16). 
- (Some(PrimitiveType::Fixed(16)), Some(PrimitiveType::Uuid)) => true, - // Some Parquet writers (e.g. Snowflake) store FIXED_LEN_BYTE_ARRAY as - // Arrow Binary rather than FixedSizeBinary. Allow Binary -> Fixed(N) - // since the underlying bytes are the same. - (Some(PrimitiveType::Binary), Some(PrimitiveType::Fixed(_))) => true, - _ => false, - } - } - - if field_ids.is_empty() { - return Ok(ProjectionMask::all()); - } - - if use_fallback { - // Position-based projection necessary because file lacks embedded field IDs - Self::get_arrow_projection_mask_fallback(field_ids, parquet_schema) - } else { - // Field-ID-based projection using embedded field IDs from Parquet metadata - - // Parquet's columnar format requires leaf-level (not top-level struct/list/map) projection - let mut leaf_field_ids = vec![]; - for field_id in field_ids { - let field = iceberg_schema_of_task.field_by_id(*field_id); - if let Some(field) = field { - Self::include_leaf_field_id(field, &mut leaf_field_ids); - } - } - - Self::get_arrow_projection_mask_with_field_ids( - &leaf_field_ids, - iceberg_schema_of_task, - parquet_schema, - arrow_schema, - type_promotion_is_valid, - ) - } - } - - /// Standard projection using embedded field IDs from Parquet metadata. - /// For iceberg-java compatibility with ParquetSchemaUtil.pruneColumns(). - fn get_arrow_projection_mask_with_field_ids( - leaf_field_ids: &[i32], - iceberg_schema_of_task: &Schema, - parquet_schema: &SchemaDescriptor, - arrow_schema: &ArrowSchemaRef, - type_promotion_is_valid: fn(Option<&PrimitiveType>, Option<&PrimitiveType>) -> bool, - ) -> Result { - let mut column_map = HashMap::new(); - let fields = arrow_schema.fields(); - - // Pre-project only the fields that have been selected, possibly avoiding converting - // some Arrow types that are not yet supported. 
- let mut projected_fields: HashMap = HashMap::new(); - let projected_arrow_schema = ArrowSchema::new_with_metadata( - fields.filter_leaves(|_, f| { - f.metadata() - .get(PARQUET_FIELD_ID_META_KEY) - .and_then(|field_id| i32::from_str(field_id).ok()) - .is_some_and(|field_id| { - projected_fields.insert((*f).clone(), field_id); - leaf_field_ids.contains(&field_id) - }) - }), - arrow_schema.metadata().clone(), - ); - let iceberg_schema = arrow_schema_to_schema(&projected_arrow_schema)?; - - fields.filter_leaves(|idx, field| { - let Some(field_id) = projected_fields.get(field).cloned() else { - return false; - }; - - let iceberg_field = iceberg_schema_of_task.field_by_id(field_id); - let parquet_iceberg_field = iceberg_schema.field_by_id(field_id); - - if iceberg_field.is_none() || parquet_iceberg_field.is_none() { - return false; - } - - if !type_promotion_is_valid( - parquet_iceberg_field - .unwrap() - .field_type - .as_primitive_type(), - iceberg_field.unwrap().field_type.as_primitive_type(), - ) { - return false; - } - - column_map.insert(field_id, idx); - true - }); - - // Schema evolution: New columns may not exist in old Parquet files. - // We only project existing columns; RecordBatchTransformer adds default/NULL values. - let mut indices = vec![]; - for field_id in leaf_field_ids { - if let Some(col_idx) = column_map.get(field_id) { - indices.push(*col_idx); - } - } - - if indices.is_empty() { - // Edge case: All requested columns are new (don't exist in file). - // Project all columns so RecordBatchTransformer has a batch to transform. - Ok(ProjectionMask::all()) - } else { - Ok(ProjectionMask::leaves(parquet_schema, indices)) - } - } - - /// Fallback projection for Parquet files without field IDs. - /// Uses position-based matching: field ID N → column position N-1. - /// Projects entire top-level columns (including nested content) for iceberg-java compatibility. 
- fn get_arrow_projection_mask_fallback( - field_ids: &[i32], - parquet_schema: &SchemaDescriptor, - ) -> Result { - // Position-based: field_id N → column N-1 (field IDs are 1-indexed) - let parquet_root_fields = parquet_schema.root_schema().get_fields(); - let mut root_indices = vec![]; - - for field_id in field_ids.iter() { - let parquet_pos = (*field_id - 1) as usize; - - if parquet_pos < parquet_root_fields.len() { - root_indices.push(parquet_pos); - } - // RecordBatchTransformer adds missing columns with NULL values - } - - if root_indices.is_empty() { - Ok(ProjectionMask::all()) - } else { - Ok(ProjectionMask::roots(parquet_schema, root_indices)) - } - } - - fn get_row_filter( - predicates: &BoundPredicate, - parquet_schema: &SchemaDescriptor, - iceberg_field_ids: &HashSet, - field_id_map: &HashMap, - ) -> Result { - // Collect Parquet column indices from field ids. - // If the field id is not found in Parquet schema, it will be ignored due to schema evolution. - let mut column_indices = iceberg_field_ids - .iter() - .filter_map(|field_id| field_id_map.get(field_id).cloned()) - .collect::>(); - column_indices.sort(); - - // The converter that converts `BoundPredicates` to `ArrowPredicates` - let mut converter = PredicateConverter { - parquet_schema, - column_map: field_id_map, - column_indices: &column_indices, - }; - - // After collecting required leaf column indices used in the predicate, - // creates the projection mask for the Arrow predicates. 
- let projection_mask = ProjectionMask::leaves(parquet_schema, column_indices.clone()); - let predicate_func = visit(&mut converter, predicates)?; - let arrow_predicate = ArrowPredicateFn::new(projection_mask, predicate_func); - Ok(RowFilter::new(vec![Box::new(arrow_predicate)])) - } - - fn get_selected_row_group_indices( - predicate: &BoundPredicate, - parquet_metadata: &Arc, - field_id_map: &HashMap, - snapshot_schema: &Schema, - ) -> Result> { - let row_groups_metadata = parquet_metadata.row_groups(); - let mut results = Vec::with_capacity(row_groups_metadata.len()); - - for (idx, row_group_metadata) in row_groups_metadata.iter().enumerate() { - if RowGroupMetricsEvaluator::eval( - predicate, - row_group_metadata, - field_id_map, - snapshot_schema, - )? { - results.push(idx); - } - } - - Ok(results) - } - - /// Applies a projection mask derived from `field_ids` to a builder. - /// - /// Wraps `get_arrow_projection_mask` + `with_projection` into a single call. - fn apply_projection( - builder: ParquetRecordBatchStreamBuilder, - field_ids: &[i32], - schema: &Schema, - has_missing_field_ids: bool, - ) -> Result> { - // Metadata fields (e.g. _file, _pos) are virtual — they don't exist as Parquet columns. - // Filter them out so get_arrow_projection_mask only sees real schema field IDs. - let project_field_ids_without_metadata: Vec = field_ids - .iter() - .filter(|&&id| !is_metadata_field(id)) - .copied() - .collect(); - let mask = Self::get_arrow_projection_mask( - &project_field_ids_without_metadata, - schema, - builder.parquet_schema(), - builder.schema(), - has_missing_field_ids, - )?; - Ok(builder.with_projection(mask)) - } - - /// Builds a [`RecordBatchTransformer`] for a data file scan task. - /// - /// Handles the three optional transformations that are common to both the full - /// Returns the list of virtual columns to request from the Parquet reader for the - /// given projection. 
Currently, only `_pos` is a virtual column (produced by the - /// Parquet reader itself rather than read from file data). - pub(crate) fn build_virtual_columns( - project_field_ids: &[i32], - ) -> Vec> { - let mut virtual_columns = Vec::new(); - if project_field_ids.contains(&RESERVED_FIELD_ID_POS) { - virtual_columns.push(Arc::clone(row_pos_field())); - } - virtual_columns - } - - /// scan (`process_file_scan_task`) and the incremental append scan - /// (`process_incremental_append_task`): - /// - `_file` constant column (only when `RESERVED_FIELD_ID_FILE` is projected) - /// - `_pos` virtual column (only when `RESERVED_FIELD_ID_POS` is projected) - /// - identity-transform partition columns (only when partition metadata is present) - fn build_record_batch_transformer( - schema: SchemaRef, - project_field_ids: &[i32], - data_file_path: &str, - partition_spec: Option>, - partition: Option, - ) -> Result { - let mut builder = RecordBatchTransformerBuilder::new(schema, project_field_ids); - - if project_field_ids.contains(&RESERVED_FIELD_ID_FILE) { - builder = builder.with_constant(RESERVED_FIELD_ID_FILE, Datum::string(data_file_path)); - } - - if project_field_ids.contains(&RESERVED_FIELD_ID_POS) { - builder = builder.with_virtual_field(Arc::clone(row_pos_field()))?; - } - - if let (Some(spec), Some(data)) = (partition_spec, partition) { - builder = builder.with_partition(spec, data)?; - } - - Ok(builder.build()) - } - - fn get_row_selection_for_filter_predicate( - predicate: &BoundPredicate, - parquet_metadata: &Arc, - selected_row_groups: &Option>, - field_id_map: &HashMap, - snapshot_schema: &Schema, - ) -> Result { - let Some(column_index) = parquet_metadata.column_index() else { - return Err(Error::new( - ErrorKind::Unexpected, - "Parquet file metadata does not contain a column index", - )); - }; - - let Some(offset_index) = parquet_metadata.offset_index() else { - return Err(Error::new( - ErrorKind::Unexpected, - "Parquet file metadata does not contain an 
offset index", - )); - }; - - // If all row groups were filtered out, return an empty RowSelection (select no rows) - if let Some(selected_row_groups) = selected_row_groups - && selected_row_groups.is_empty() - { - return Ok(RowSelection::from(Vec::new())); - } - - let mut selected_row_groups_idx = 0; - - let page_index = column_index - .iter() - .enumerate() - .zip(offset_index) - .zip(parquet_metadata.row_groups()); - - let mut results = Vec::new(); - for (((idx, column_index), offset_index), row_group_metadata) in page_index { - if let Some(selected_row_groups) = selected_row_groups { - // skip row groups that aren't present in selected_row_groups - if idx == selected_row_groups[selected_row_groups_idx] { - selected_row_groups_idx += 1; - } else { - continue; - } - } - - let selections_for_page = PageIndexEvaluator::eval( - predicate, - column_index, - offset_index, - row_group_metadata, - field_id_map, - snapshot_schema, - )?; - - results.push(selections_for_page); - - if let Some(selected_row_groups) = selected_row_groups - && selected_row_groups_idx == selected_row_groups.len() - { - break; - } - } - - Ok(results.into_iter().flatten().collect::>().into()) - } - - /// Filters row groups by byte range to support Iceberg's file splitting. - /// - /// Applies an optional row group list and optional `RowSelection` to a builder. - /// - /// Centralises the final "commit" step shared by all Parquet reading paths. - /// Applies projection to `builder`, constructs a `RecordBatchTransformer`, builds the - /// Parquet stream, and wraps it so every batch is passed through the transformer. - /// - /// This is the shared finalization step used by every data-file reading path. 
- pub(crate) fn build_projected_record_batch_stream( - builder: ParquetRecordBatchStreamBuilder, - project_field_ids: &[i32], - schema: SchemaRef, - has_missing_field_ids: bool, - data_file_path: &str, - partition_spec: Option>, - partition: Option, - ) -> Result { - let builder = - Self::apply_projection(builder, project_field_ids, &schema, has_missing_field_ids)?; - - let mut record_batch_transformer = Self::build_record_batch_transformer( - schema, - project_field_ids, - data_file_path, - partition_spec, - partition, - )?; - - let record_batch_stream = builder.build()?.map(move |batch| match batch { - Ok(batch) => record_batch_transformer.process_record_batch(batch), - Err(err) => Err(err.into()), - }); - - Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream) - } - - fn filter_row_groups_by_byte_range( - parquet_metadata: &Arc, - start: u64, - length: u64, - ) -> Result> { - let row_groups = parquet_metadata.row_groups(); - let mut selected = Vec::new(); - let end = start + length; - - // Row groups are stored sequentially after the 4-byte magic header. - let mut current_byte_offset = 4u64; - - for (idx, row_group) in row_groups.iter().enumerate() { - let row_group_size = row_group.compressed_size() as u64; - let row_group_end = current_byte_offset + row_group_size; - - if current_byte_offset < end && start < row_group_end { - selected.push(idx); - } - - current_byte_offset = row_group_end; - } - - Ok(selected) - } -} - -/// Build the map of parquet field id to Parquet column index in the schema. -/// Returns None if the Parquet file doesn't have field IDs embedded (e.g., migrated tables). -fn build_field_id_map(parquet_schema: &SchemaDescriptor) -> Result>> { - let mut column_map = HashMap::new(); - - for (idx, field) in parquet_schema.columns().iter().enumerate() { - let field_type = field.self_type(); - match field_type { - ParquetType::PrimitiveType { basic_info, .. 
} => { - if !basic_info.has_id() { - return Ok(None); - } - column_map.insert(basic_info.id(), idx); - } - ParquetType::GroupType { .. } => { - return Err(Error::new( - ErrorKind::DataInvalid, - format!( - "Leave column in schema should be primitive type but got {field_type:?}" - ), - )); - } - }; - } - - Ok(Some(column_map)) -} - -/// Build a fallback field ID map for Parquet files without embedded field IDs. -/// Position-based (1, 2, 3, ...) for compatibility with iceberg-java migrations. -fn build_fallback_field_id_map(parquet_schema: &SchemaDescriptor) -> HashMap { - let mut column_map = HashMap::new(); - - // 1-indexed to match iceberg-java's convention - for (idx, _field) in parquet_schema.columns().iter().enumerate() { - let field_id = (idx + 1) as i32; - column_map.insert(field_id, idx); - } - - column_map -} - -/// Apply name mapping to Arrow schema for Parquet files lacking field IDs. -/// -/// Assigns Iceberg field IDs based on column names using the name mapping, -/// enabling correct projection on migrated files (e.g., from Hive/Spark via add_files). -/// -/// Per Iceberg spec Column Projection rule #2: -/// "Use schema.name-mapping.default metadata to map field id to columns without field id" -/// https://iceberg.apache.org/spec/#column-projection -/// -/// Corresponds to Java's ParquetSchemaUtil.applyNameMapping() and ApplyNameMapping visitor. -/// The key difference is Java operates on Parquet MessageType, while we operate on Arrow Schema. 
-/// -/// # Arguments -/// * `arrow_schema` - Arrow schema from Parquet file (without field IDs) -/// * `name_mapping` - Name mapping from table metadata (TableProperties.DEFAULT_NAME_MAPPING) -/// -/// # Returns -/// Arrow schema with field IDs assigned based on name mapping -fn apply_name_mapping_to_arrow_schema( - arrow_schema: ArrowSchemaRef, - name_mapping: &NameMapping, -) -> Result> { - debug_assert!( - arrow_schema - .fields() - .iter() - .next() - .is_none_or(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()), - "Schema already has field IDs - name mapping should not be applied" - ); - - use arrow_schema::Field; - - let fields_with_mapped_ids: Vec<_> = arrow_schema - .fields() - .iter() - .map(|field| { - // Look up this column name in name mapping to get the Iceberg field ID. - // Corresponds to Java's ApplyNameMapping visitor which calls - // nameMapping.find(currentPath()) and returns field.withId() if found. - // - // If the field isn't in the mapping, leave it WITHOUT assigning an ID - // (matching Java's behavior of returning the field unchanged). - // Later, during projection, fields without IDs are filtered out. - let mapped_field_opt = name_mapping - .fields() - .iter() - .find(|f| f.names().contains(&field.name().to_string())); - - let mut metadata = field.metadata().clone(); - - if let Some(mapped_field) = mapped_field_opt - && let Some(field_id) = mapped_field.field_id() - { - // Field found in mapping with a field_id → assign it - metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), field_id.to_string()); - } - // If field_id is None, leave the field without an ID (will be filtered by projection) - - Field::new(field.name(), field.data_type().clone(), field.is_nullable()) - .with_metadata(metadata) - }) - .collect(); - - Ok(Arc::new(ArrowSchema::new_with_metadata( - fields_with_mapped_ids, - arrow_schema.metadata().clone(), - ))) -} - -/// Add position-based fallback field IDs to Arrow schema for Parquet files lacking them. 
-/// Enables projection on migrated files (e.g., from Hive/Spark). -/// -/// Why at schema level (not per-batch): Efficiency - avoids repeated schema modification. -/// Why only top-level: Nested projection uses leaf column indices, not parent struct IDs. -/// Why 1-indexed: Compatibility with iceberg-java's ParquetSchemaUtil.addFallbackIds(). -fn add_fallback_field_ids_to_arrow_schema(arrow_schema: &ArrowSchemaRef) -> Arc { - debug_assert!( - arrow_schema - .fields() - .iter() - .next() - .is_none_or(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()), - "Schema already has field IDs" - ); - - use arrow_schema::Field; - - let fields_with_fallback_ids: Vec<_> = arrow_schema - .fields() - .iter() - .enumerate() - .map(|(pos, field)| { - let mut metadata = field.metadata().clone(); - let field_id = (pos + 1) as i32; // 1-indexed for Java compatibility - metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), field_id.to_string()); - - Field::new(field.name(), field.data_type().clone(), field.is_nullable()) - .with_metadata(metadata) - }) - .collect(); - - Arc::new(ArrowSchema::new_with_metadata( - fields_with_fallback_ids, - arrow_schema.metadata().clone(), - )) -} - -/// A visitor to collect field ids from bound predicates. 
-struct CollectFieldIdVisitor { - field_ids: HashSet, -} - -impl CollectFieldIdVisitor { - fn field_ids(self) -> HashSet { - self.field_ids - } -} - -impl BoundPredicateVisitor for CollectFieldIdVisitor { - type T = (); - - fn always_true(&mut self) -> Result<()> { - Ok(()) - } - - fn always_false(&mut self) -> Result<()> { - Ok(()) - } - - fn and(&mut self, _lhs: (), _rhs: ()) -> Result<()> { - Ok(()) - } - - fn or(&mut self, _lhs: (), _rhs: ()) -> Result<()> { - Ok(()) - } - - fn not(&mut self, _inner: ()) -> Result<()> { - Ok(()) - } - - fn is_null(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_null(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn is_nan(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_nan(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn less_than( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn less_than_or_eq( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn greater_than( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn greater_than_or_eq( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn eq( - &mut self, - 
reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_eq( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn starts_with( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_starts_with( - &mut self, - reference: &BoundReference, - _literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn r#in( - &mut self, - reference: &BoundReference, - _literals: &FnvHashSet, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } - - fn not_in( - &mut self, - reference: &BoundReference, - _literals: &FnvHashSet, - _predicate: &BoundPredicate, - ) -> Result<()> { - self.field_ids.insert(reference.field().id); - Ok(()) - } -} - -/// A visitor to convert Iceberg bound predicates to Arrow predicates. -struct PredicateConverter<'a> { - /// The Parquet schema descriptor. - pub parquet_schema: &'a SchemaDescriptor, - /// The map between field id and leaf column index in Parquet schema. - pub column_map: &'a HashMap, - /// The required column indices in Parquet schema for the predicates. - pub column_indices: &'a Vec, -} - -impl PredicateConverter<'_> { - /// When visiting a bound reference, we return index of the leaf column in the - /// required column indices which is used to project the column in the record batch. - /// Return None if the field id is not found in the column map, which is possible - /// due to schema evolution. - fn bound_reference(&mut self, reference: &BoundReference) -> Result> { - // The leaf column's index in Parquet schema. 
- if let Some(column_idx) = self.column_map.get(&reference.field().id) { - if self.parquet_schema.get_column_root(*column_idx).is_group() { - return Err(Error::new( - ErrorKind::DataInvalid, - format!( - "Leave column `{}` in predicates isn't a root column in Parquet schema.", - reference.field().name - ), - )); - } - - // The leaf column's index in the required column indices. - let index = self - .column_indices - .iter() - .position(|&idx| idx == *column_idx) - .ok_or(Error::new( - ErrorKind::DataInvalid, - format!( - "Leave column `{}` in predicates cannot be found in the required column indices.", - reference.field().name - ), - ))?; - - Ok(Some(index)) - } else { - Ok(None) - } - } - - /// Build an Arrow predicate that always returns true. - fn build_always_true(&self) -> Result> { - Ok(Box::new(|batch| { - Ok(BooleanArray::from(vec![true; batch.num_rows()])) - })) - } - - /// Build an Arrow predicate that always returns false. - fn build_always_false(&self) -> Result> { - Ok(Box::new(|batch| { - Ok(BooleanArray::from(vec![false; batch.num_rows()])) - })) - } -} - -/// Gets the leaf column from the record batch for the required column index. Only -/// supports top-level columns for now. 
-fn project_column( - batch: &RecordBatch, - column_idx: usize, -) -> std::result::Result { - let column = batch.column(column_idx); - - match column.data_type() { - DataType::Struct(_) => Err(ArrowError::SchemaError( - "Does not support struct column yet.".to_string(), - )), - _ => Ok(column.clone()), - } -} - -type PredicateResult = - dyn FnMut(RecordBatch) -> std::result::Result + Send + 'static; - -impl BoundPredicateVisitor for PredicateConverter<'_> { - type T = Box; - - fn always_true(&mut self) -> Result> { - self.build_always_true() - } - - fn always_false(&mut self) -> Result> { - self.build_always_false() - } - - fn and( - &mut self, - mut lhs: Box, - mut rhs: Box, - ) -> Result> { - Ok(Box::new(move |batch| { - let left = lhs(batch.clone())?; - let right = rhs(batch)?; - and_kleene(&left, &right) - })) - } - - fn or( - &mut self, - mut lhs: Box, - mut rhs: Box, - ) -> Result> { - Ok(Box::new(move |batch| { - let left = lhs(batch.clone())?; - let right = rhs(batch)?; - or_kleene(&left, &right) - })) - } - - fn not(&mut self, mut inner: Box) -> Result> { - Ok(Box::new(move |batch| { - let pred_ret = inner(batch)?; - not(&pred_ret) - })) - } - - fn is_null( - &mut self, - reference: &BoundReference, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - Ok(Box::new(move |batch| { - let column = project_column(&batch, idx)?; - is_null(&column) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn not_null( - &mut self, - reference: &BoundReference, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - Ok(Box::new(move |batch| { - let column = project_column(&batch, idx)?; - is_not_null(&column) - })) - } else { - // A missing column, treating it as null. 
- self.build_always_false() - } - } - - fn is_nan( - &mut self, - reference: &BoundReference, - _predicate: &BoundPredicate, - ) -> Result> { - if self.bound_reference(reference)?.is_some() { - self.build_always_true() - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn not_nan( - &mut self, - reference: &BoundReference, - _predicate: &BoundPredicate, - ) -> Result> { - if self.bound_reference(reference)?.is_some() { - self.build_always_false() - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn less_than( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - lt(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn less_than_or_eq( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - lt_eq(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn greater_than( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? 
{ - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - gt(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn greater_than_or_eq( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - gt_eq(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn eq( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - eq(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn not_eq( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - neq(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn starts_with( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? 
{ - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - starts_with(&left, literal.as_ref()) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn not_starts_with( - &mut self, - reference: &BoundReference, - literal: &Datum, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literal = get_arrow_datum(literal)?; - - Ok(Box::new(move |batch| { - let left = project_column(&batch, idx)?; - let literal = try_cast_literal(&literal, left.data_type())?; - // update here if arrow ever adds a native not_starts_with - not(&starts_with(&left, literal.as_ref())?) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } - - fn r#in( - &mut self, - reference: &BoundReference, - literals: &FnvHashSet, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? { - let literals: Vec<_> = literals - .iter() - .map(|lit| get_arrow_datum(lit).unwrap()) - .collect(); - - Ok(Box::new(move |batch| { - // update this if arrow ever adds a native is_in kernel - let left = project_column(&batch, idx)?; - - let mut acc = BooleanArray::from(vec![false; batch.num_rows()]); - for literal in &literals { - let literal = try_cast_literal(literal, left.data_type())?; - acc = or(&acc, &eq(&left, literal.as_ref())?)? - } - - Ok(acc) - })) - } else { - // A missing column, treating it as null. - self.build_always_false() - } - } - - fn not_in( - &mut self, - reference: &BoundReference, - literals: &FnvHashSet, - _predicate: &BoundPredicate, - ) -> Result> { - if let Some(idx) = self.bound_reference(reference)? 
{ - let literals: Vec<_> = literals - .iter() - .map(|lit| get_arrow_datum(lit).unwrap()) - .collect(); - - Ok(Box::new(move |batch| { - // update this if arrow ever adds a native not_in kernel - let left = project_column(&batch, idx)?; - let mut acc = BooleanArray::from(vec![true; batch.num_rows()]); - for literal in &literals { - let literal = try_cast_literal(literal, left.data_type())?; - acc = and(&acc, &neq(&left, literal.as_ref())?)? - } - - Ok(acc) - })) - } else { - // A missing column, treating it as null. - self.build_always_true() - } - } -} - -/// ArrowFileReader is a wrapper around a FileRead that impls parquets AsyncFileReader. -pub struct ArrowFileReader { - meta: FileMetadata, - parquet_read_options: ParquetReadOptions, - r: Box, -} - -impl ArrowFileReader { - /// Create a new ArrowFileReader - pub fn new(meta: FileMetadata, r: Box) -> Self { - Self { - meta, - parquet_read_options: ParquetReadOptions::builder().build(), - r, - } - } - - /// Configure all Parquet read options. - pub(crate) fn with_parquet_read_options(mut self, options: ParquetReadOptions) -> Self { - self.parquet_read_options = options; - self - } -} - -impl AsyncFileReader for ArrowFileReader { - fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, parquet::errors::Result> { - Box::pin( - self.r - .read(range.start..range.end) - .map_err(|err| parquet::errors::ParquetError::External(Box::new(err))), - ) - } - - /// Override the default `get_byte_ranges` which calls `get_bytes` sequentially. - /// The parquet reader calls this to fetch column chunks for a row group, so - /// without this override each column chunk is a serial round-trip to object storage. - /// Adapted from object_store's `coalesce_ranges` in `util.rs`. 
- fn get_byte_ranges( - &mut self, - ranges: Vec>, - ) -> BoxFuture<'_, parquet::errors::Result>> { - let coalesce_bytes = self.parquet_read_options.range_coalesce_bytes(); - let concurrency = self.parquet_read_options.range_fetch_concurrency().max(1); - - async move { - // Merge nearby ranges to reduce the number of object store requests. - let fetch_ranges = merge_ranges(&ranges, coalesce_bytes); - let r = &self.r; - - // Fetch merged ranges concurrently. - let fetched: Vec = futures::stream::iter(fetch_ranges.iter().cloned()) - .map(|range| async move { - r.read(range) - .await - .map_err(|e| parquet::errors::ParquetError::External(Box::new(e))) - }) - .buffered(concurrency) - .try_collect() - .await?; - - // Slice the fetched data back into the originally requested ranges. - Ok(ranges - .iter() - .map(|range| { - let idx = fetch_ranges.partition_point(|v| v.start <= range.start) - 1; - let fetch_range = &fetch_ranges[idx]; - let fetch_bytes = &fetched[idx]; - let start = (range.start - fetch_range.start) as usize; - let end = (range.end - fetch_range.start) as usize; - fetch_bytes.slice(start..end.min(fetch_bytes.len())) - }) - .collect()) - } - .boxed() - } - - // TODO: currently we don't respect `ArrowReaderOptions` cause it don't expose any method to access the option field - // we will fix it after `v55.1.0` is released in https://github.com/apache/arrow-rs/issues/7393 - fn get_metadata( - &mut self, - _options: Option<&'_ ArrowReaderOptions>, - ) -> BoxFuture<'_, parquet::errors::Result>> { - async move { - fn page_index_policy(enabled: bool) -> PageIndexPolicy { - if enabled { - PageIndexPolicy::Optional - } else { - PageIndexPolicy::Skip - } - } - - let reader = ParquetMetaDataReader::new() - .with_prefetch_hint(self.parquet_read_options.metadata_size_hint()) - // Set the page policy first because it updates both column and offset policies. 
- .with_page_index_policy(page_index_policy( - self.parquet_read_options.preload_page_index(), - )) - .with_column_index_policy(page_index_policy( - self.parquet_read_options.preload_column_index(), - )) - .with_offset_index_policy(page_index_policy( - self.parquet_read_options.preload_offset_index(), - )); - let size = self.meta.size; - let meta = reader.load_and_finish(self, size).await?; - - Ok(Arc::new(meta)) - } - .boxed() - } -} - -/// Merge overlapping or nearby byte ranges, combining ranges with gaps <= `coalesce` bytes. -/// Adapted from object_store's `merge_ranges` in `util.rs`. -fn merge_ranges(ranges: &[Range], coalesce: u64) -> Vec> { - if ranges.is_empty() { - return vec![]; - } - - let mut ranges = ranges.to_vec(); - ranges.sort_unstable_by_key(|r| r.start); - - let mut merged = Vec::with_capacity(ranges.len()); - let mut start_idx = 0; - let mut end_idx = 1; - - while start_idx != ranges.len() { - let mut range_end = ranges[start_idx].end; - - while end_idx != ranges.len() - && ranges[end_idx] - .start - .checked_sub(range_end) - .map(|delta| delta <= coalesce) - .unwrap_or(true) - { - range_end = range_end.max(ranges[end_idx].end); - end_idx += 1; - } - - merged.push(ranges[start_idx].start..range_end); - start_idx = end_idx; - end_idx += 1; - } - - merged -} - -/// The Arrow type of an array that the Parquet reader reads may not match the exact Arrow type -/// that Iceberg uses for literals - but they are effectively the same logical type, -/// i.e. LargeUtf8 and Utf8 or Utf8View and Utf8 or Utf8View and LargeUtf8. -/// -/// The Arrow compute kernels that we use must match the type exactly, so first cast the literal -/// into the type of the batch we read from Parquet before sending it to the compute kernel. 
-fn try_cast_literal( - literal: &Arc, - column_type: &DataType, -) -> std::result::Result, ArrowError> { - let literal_array = literal.get().0; - - // No cast required - if literal_array.data_type() == column_type { - return Ok(Arc::clone(literal)); - } - - let literal_array = cast(literal_array, column_type)?; - Ok(Arc::new(Scalar::new(literal_array))) -} - -#[cfg(test)] -mod tests { - use std::collections::{HashMap, HashSet}; - use std::fs::File; - use std::ops::Range; - use std::sync::Arc; - - use arrow_array::cast::AsArray; - use arrow_array::{ - Array, ArrayRef, BinaryArray, FixedSizeBinaryArray, Int32Array, LargeStringArray, - RecordBatch, StringArray, - }; - use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; - use futures::TryStreamExt; - use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; - use parquet::arrow::{ArrowWriter, ProjectionMask}; - use parquet::basic::Compression; - use parquet::file::metadata::{ColumnChunkMetaData, RowGroupMetaData}; - use parquet::file::properties::WriterProperties; - use parquet::schema::parser::parse_message_type; - use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor}; - use roaring::RoaringTreemap; - use tempfile::TempDir; - - use crate::ErrorKind; - use crate::arrow::reader::{CollectFieldIdVisitor, PARQUET_FIELD_ID_META_KEY}; - use crate::arrow::{ArrowReader, ArrowReaderBuilder}; - use crate::delete_vector::DeleteVector; - use crate::expr::visitors::bound_predicate_visitor::visit; - use crate::expr::{Bind, Predicate, Reference}; - use crate::io::FileIO; - use crate::scan::{FileScanTask, FileScanTaskDeleteFile, FileScanTaskStream}; - use crate::spec::{ - DataContentType, DataFileFormat, Datum, NestedField, PrimitiveType, Schema, SchemaRef, Type, - }; - - fn table_schema_simple() -> SchemaRef { - Arc::new( - Schema::builder() - .with_schema_id(1) - .with_identifier_field_ids(vec![2]) - .with_fields(vec![ - NestedField::optional(1, "foo", Type::Primitive(PrimitiveType::String)).into(), - 
NestedField::required(2, "bar", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(3, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), - NestedField::optional(4, "qux", Type::Primitive(PrimitiveType::Float)).into(), - ]) - .build() - .unwrap(), - ) - } - - #[test] - fn test_collect_field_id() { - let schema = table_schema_simple(); - let expr = Reference::new("qux").is_null(); - let bound_expr = expr.bind(schema, true).unwrap(); - - let mut visitor = CollectFieldIdVisitor { - field_ids: HashSet::default(), - }; - visit(&mut visitor, &bound_expr).unwrap(); - - let mut expected = HashSet::default(); - expected.insert(4_i32); - - assert_eq!(visitor.field_ids, expected); - } - - #[test] - fn test_collect_field_id_with_and() { - let schema = table_schema_simple(); - let expr = Reference::new("qux") - .is_null() - .and(Reference::new("baz").is_null()); - let bound_expr = expr.bind(schema, true).unwrap(); - - let mut visitor = CollectFieldIdVisitor { - field_ids: HashSet::default(), - }; - visit(&mut visitor, &bound_expr).unwrap(); - - let mut expected = HashSet::default(); - expected.insert(4_i32); - expected.insert(3); - - assert_eq!(visitor.field_ids, expected); - } - - #[test] - fn test_collect_field_id_with_or() { - let schema = table_schema_simple(); - let expr = Reference::new("qux") - .is_null() - .or(Reference::new("baz").is_null()); - let bound_expr = expr.bind(schema, true).unwrap(); - - let mut visitor = CollectFieldIdVisitor { - field_ids: HashSet::default(), - }; - visit(&mut visitor, &bound_expr).unwrap(); - - let mut expected = HashSet::default(); - expected.insert(4_i32); - expected.insert(3); - - assert_eq!(visitor.field_ids, expected); - } - - #[test] - fn test_arrow_projection_mask() { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_identifier_field_ids(vec![1]) - .with_fields(vec![ - NestedField::required(1, "c1", Type::Primitive(PrimitiveType::String)).into(), - NestedField::optional(2, "c2", 
Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional( - 3, - "c3", - Type::Primitive(PrimitiveType::Decimal { - precision: 38, - scale: 3, - }), - ) - .into(), - ]) - .build() - .unwrap(), - ); - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("c1", DataType::Utf8, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - // Type not supported - Field::new("c2", DataType::Duration(TimeUnit::Microsecond), true).with_metadata( - HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())]), - ), - // Precision is beyond the supported range - Field::new("c3", DataType::Decimal128(39, 3), true).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "3".to_string(), - )])), - ])); - - let message_type = " -message schema { - required binary c1 (STRING) = 1; - optional int32 c2 (INTEGER(8,true)) = 2; - optional fixed_len_byte_array(17) c3 (DECIMAL(39,3)) = 3; -} - "; - let parquet_type = parse_message_type(message_type).expect("should parse schema"); - let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_type)); - - // Try projecting the fields c2 and c3 with the unsupported data types - let err = ArrowReader::get_arrow_projection_mask( - &[1, 2, 3], - &schema, - &parquet_schema, - &arrow_schema, - false, - ) - .unwrap_err(); - - assert_eq!(err.kind(), ErrorKind::DataInvalid); - assert_eq!( - err.to_string(), - "DataInvalid => Unsupported Arrow data type: Duration(µs)".to_string() - ); - - // Omitting field c2, we still get an error due to c3 being selected - let err = ArrowReader::get_arrow_projection_mask( - &[1, 3], - &schema, - &parquet_schema, - &arrow_schema, - false, - ) - .unwrap_err(); - - assert_eq!(err.kind(), ErrorKind::DataInvalid); - assert_eq!( - err.to_string(), - "DataInvalid => Failed to create decimal type, source: DataInvalid => Decimals with precision larger than 38 are not supported: 39".to_string() - ); - - // Finally avoid 
selecting fields with unsupported data types - let mask = ArrowReader::get_arrow_projection_mask( - &[1], - &schema, - &parquet_schema, - &arrow_schema, - false, - ) - .expect("Some ProjectionMask"); - assert_eq!(mask, ProjectionMask::leaves(&parquet_schema, vec![0])); - } - - #[tokio::test] - async fn test_kleene_logic_or_behaviour() { - // a IS NULL OR a = 'foo' - let predicate = Reference::new("a") - .is_null() - .or(Reference::new("a").equal_to(Datum::string("foo"))); - - // Table data: [NULL, "foo", "bar"] - let data_for_col_a = vec![None, Some("foo".to_string()), Some("bar".to_string())]; - - // Expected: [NULL, "foo"]. - let expected = vec![None, Some("foo".to_string())]; - - let (file_io, schema, table_location, _temp_dir) = - setup_kleene_logic(data_for_col_a, DataType::Utf8); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let result_data = test_perform_read(predicate, schema, table_location, reader).await; - - assert_eq!(result_data, expected); - } - - #[tokio::test] - async fn test_kleene_logic_and_behaviour() { - // a IS NOT NULL AND a != 'foo' - let predicate = Reference::new("a") - .is_not_null() - .and(Reference::new("a").not_equal_to(Datum::string("foo"))); - - // Table data: [NULL, "foo", "bar"] - let data_for_col_a = vec![None, Some("foo".to_string()), Some("bar".to_string())]; - - // Expected: ["bar"]. 
- let expected = vec![Some("bar".to_string())]; - - let (file_io, schema, table_location, _temp_dir) = - setup_kleene_logic(data_for_col_a, DataType::Utf8); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let result_data = test_perform_read(predicate, schema, table_location, reader).await; - - assert_eq!(result_data, expected); - } - - #[tokio::test] - async fn test_predicate_cast_literal() { - let predicates = vec![ - // a == 'foo' - (Reference::new("a").equal_to(Datum::string("foo")), vec![ - Some("foo".to_string()), - ]), - // a != 'foo' - ( - Reference::new("a").not_equal_to(Datum::string("foo")), - vec![Some("bar".to_string())], - ), - // STARTS_WITH(a, 'foo') - (Reference::new("a").starts_with(Datum::string("f")), vec![ - Some("foo".to_string()), - ]), - // NOT STARTS_WITH(a, 'foo') - ( - Reference::new("a").not_starts_with(Datum::string("f")), - vec![Some("bar".to_string())], - ), - // a < 'foo' - (Reference::new("a").less_than(Datum::string("foo")), vec![ - Some("bar".to_string()), - ]), - // a <= 'foo' - ( - Reference::new("a").less_than_or_equal_to(Datum::string("foo")), - vec![Some("foo".to_string()), Some("bar".to_string())], - ), - // a > 'foo' - ( - Reference::new("a").greater_than(Datum::string("bar")), - vec![Some("foo".to_string())], - ), - // a >= 'foo' - ( - Reference::new("a").greater_than_or_equal_to(Datum::string("foo")), - vec![Some("foo".to_string())], - ), - // a IN ('foo', 'bar') - ( - Reference::new("a").is_in([Datum::string("foo"), Datum::string("baz")]), - vec![Some("foo".to_string())], - ), - // a NOT IN ('foo', 'bar') - ( - Reference::new("a").is_not_in([Datum::string("foo"), Datum::string("baz")]), - vec![Some("bar".to_string())], - ), - ]; - - // Table data: ["foo", "bar"] - let data_for_col_a = vec![Some("foo".to_string()), Some("bar".to_string())]; - - let (file_io, schema, table_location, _temp_dir) = - setup_kleene_logic(data_for_col_a, DataType::LargeUtf8); - let reader = ArrowReaderBuilder::new(file_io).build(); - 
- for (predicate, expected) in predicates { - println!("testing predicate {predicate}"); - let result_data = test_perform_read( - predicate.clone(), - schema.clone(), - table_location.clone(), - reader.clone(), - ) - .await; - - assert_eq!(result_data, expected, "predicate={predicate}"); - } - } - - async fn test_perform_read( - predicate: Predicate, - schema: SchemaRef, - table_location: String, - reader: ArrowReader, - ) -> Vec> { - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1], - predicate: Some(predicate.bind(schema, true).unwrap()), - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - result[0].columns()[0] - .as_string_opt::() - .unwrap() - .iter() - .map(|v| v.map(ToOwned::to_owned)) - .collect::>() - } - - fn setup_kleene_logic( - data_for_col_a: Vec>, - col_a_type: DataType, - ) -> (FileIO, SchemaRef, String, TempDir) { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional(1, "a", Type::Primitive(PrimitiveType::String)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("a", col_a_type.clone(), true).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - - let file_io = FileIO::new_with_fs(); - - let col = match col_a_type { - DataType::Utf8 => 
Arc::new(StringArray::from(data_for_col_a)) as ArrayRef, - DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data_for_col_a)) as ArrayRef, - _ => panic!("unexpected col_a_type"), - }; - - let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![col]).unwrap(); - - // Write the Parquet files - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = - ArrowWriter::try_new(file, to_write.schema(), Some(props.clone())).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - - // writer must be closed to write footer - writer.close().unwrap(); - - (file_io, schema, table_location, tmp_dir) - } - - #[test] - fn test_build_deletes_row_selection() { - let schema_descr = get_test_schema_descr(); - - let mut columns = vec![]; - for ptr in schema_descr.columns() { - let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap(); - columns.push(column); - } - - let row_groups_metadata = vec![ - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 1000, 0), - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 1), - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 2), - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 1000, 3), - build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 4), - ]; - - let selected_row_groups = Some(vec![1, 3]); - - /* cases to cover: - * {skip|select} {first|intermediate|last} {one row|multiple rows} in - {first|intermediate|last} {skipped|selected} row group - * row group selection disabled - */ - - let positional_deletes = RoaringTreemap::from_iter(&[ - 1, // in skipped rg 0, should be ignored - 3, // run of three consecutive items in skipped rg0 - 4, 5, 998, // two consecutive items at end of skipped rg0 - 999, 1000, // solitary row at start of selected rg1 (1, 9) - 1010, // run of 3 rows in selected rg1 - 
1011, 1012, // (3, 485) - 1498, // run of two items at end of selected rg1 - 1499, 1500, // run of two items at start of skipped rg2 - 1501, 1600, // should ignore, in skipped rg2 - 1999, // single row at end of skipped rg2 - 2000, // run of two items at start of selected rg3 - 2001, // (4, 98) - 2100, // single row in selected row group 3 (1, 99) - 2200, // run of 3 consecutive rows in selected row group 3 - 2201, 2202, // (3, 796) - 2999, // single item at end of selected rg3 (1) - 3000, // single item at start of skipped rg4 - ]); - - let positional_deletes = DeleteVector::new(positional_deletes); - - // using selected row groups 1 and 3 - let result = ArrowReader::build_deletes_row_selection( - &row_groups_metadata, - &selected_row_groups, - &positional_deletes, - ) - .unwrap(); - - let expected = RowSelection::from(vec![ - RowSelector::skip(1), - RowSelector::select(9), - RowSelector::skip(3), - RowSelector::select(485), - RowSelector::skip(4), - RowSelector::select(98), - RowSelector::skip(1), - RowSelector::select(99), - RowSelector::skip(3), - RowSelector::select(796), - RowSelector::skip(1), - ]); - - assert_eq!(result, expected); - - // selecting all row groups - let result = ArrowReader::build_deletes_row_selection( - &row_groups_metadata, - &None, - &positional_deletes, - ) - .unwrap(); - - let expected = RowSelection::from(vec![ - RowSelector::select(1), - RowSelector::skip(1), - RowSelector::select(1), - RowSelector::skip(3), - RowSelector::select(992), - RowSelector::skip(3), - RowSelector::select(9), - RowSelector::skip(3), - RowSelector::select(485), - RowSelector::skip(4), - RowSelector::select(98), - RowSelector::skip(1), - RowSelector::select(398), - RowSelector::skip(3), - RowSelector::select(98), - RowSelector::skip(1), - RowSelector::select(99), - RowSelector::skip(3), - RowSelector::select(796), - RowSelector::skip(2), - RowSelector::select(499), - ]); - - assert_eq!(result, expected); - } - - fn build_test_row_group_meta( - schema_descr: 
SchemaDescPtr, - columns: Vec, - num_rows: i64, - ordinal: i16, - ) -> RowGroupMetaData { - RowGroupMetaData::builder(schema_descr.clone()) - .set_num_rows(num_rows) - .set_total_byte_size(2000) - .set_column_metadata(columns) - .set_ordinal(ordinal) - .build() - .unwrap() - } - - fn get_test_schema_descr() -> SchemaDescPtr { - use parquet::schema::types::Type as SchemaType; - - let schema = SchemaType::group_type_builder("schema") - .with_fields(vec![ - Arc::new( - SchemaType::primitive_type_builder("a", parquet::basic::Type::INT32) - .build() - .unwrap(), - ), - Arc::new( - SchemaType::primitive_type_builder("b", parquet::basic::Type::INT32) - .build() - .unwrap(), - ), - ]) - .build() - .unwrap(); - - Arc::new(SchemaDescriptor::new(Arc::new(schema))) - } - - /// Verifies that file splits respect byte ranges and only read specific row groups. - #[tokio::test] - async fn test_file_splits_respect_byte_ranges() { - use arrow_array::Int32Array; - use parquet::file::reader::{FileReader, SerializedFileReader}; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_path = format!("{table_location}/multi_row_group.parquet"); - - // Force each batch into its own row group for testing byte range filtering. 
- let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( - (0..100).collect::>(), - ))]) - .unwrap(); - let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( - (100..200).collect::>(), - ))]) - .unwrap(); - let batch3 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( - (200..300).collect::>(), - ))]) - .unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(100)) - .build(); - - let file = File::create(&file_path).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); - writer.write(&batch1).expect("Writing batch 1"); - writer.write(&batch2).expect("Writing batch 2"); - writer.write(&batch3).expect("Writing batch 3"); - writer.close().unwrap(); - - // Read the file metadata to get row group byte positions - let file = File::open(&file_path).unwrap(); - let reader = SerializedFileReader::new(file).unwrap(); - let metadata = reader.metadata(); - - println!("File has {} row groups", metadata.num_row_groups()); - assert_eq!(metadata.num_row_groups(), 3, "Expected 3 row groups"); - - // Get byte positions for each row group - let row_group_0 = metadata.row_group(0); - let row_group_1 = metadata.row_group(1); - let row_group_2 = metadata.row_group(2); - - let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1" - let rg1_start = rg0_start + row_group_0.compressed_size() as u64; - let rg2_start = rg1_start + row_group_1.compressed_size() as u64; - let file_end = rg2_start + row_group_2.compressed_size() as u64; - - println!( - "Row group 0: {} rows, starts at byte {}, {} bytes compressed", - row_group_0.num_rows(), - rg0_start, - row_group_0.compressed_size() - ); - println!( - "Row group 1: {} rows, starts at byte {}, {} bytes compressed", - row_group_1.num_rows(), - rg1_start, - row_group_1.compressed_size() - ); - println!( - "Row group 2: {} 
rows, starts at byte {}, {} bytes compressed", - row_group_2.num_rows(), - rg2_start, - row_group_2.compressed_size() - ); - - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - // Task 1: read only the first row group - let task1 = FileScanTask { - file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), - start: rg0_start, - length: row_group_0.compressed_size() as u64, - record_count: Some(100), - data_file_path: file_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - // Task 2: read the second and third row groups - let task2 = FileScanTask { - file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), - start: rg1_start, - length: file_end - rg1_start, - record_count: Some(200), - data_file_path: file_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks1 = Box::pin(futures::stream::iter(vec![Ok(task1)])) as FileScanTaskStream; - let result1 = reader - .clone() - .read(tasks1) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - let total_rows_task1: usize = result1.iter().map(|b| b.num_rows()).sum(); - println!( - "Task 1 (bytes {}-{}) returned {} rows", - rg0_start, - rg0_start + row_group_0.compressed_size() as u64, - total_rows_task1 - ); - - let tasks2 = Box::pin(futures::stream::iter(vec![Ok(task2)])) as FileScanTaskStream; - let result2 = reader - .read(tasks2) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - let total_rows_task2: usize = result2.iter().map(|b| b.num_rows()).sum(); - println!("Task 2 (bytes {rg1_start}-{file_end}) returned {total_rows_task2} rows"); - - 
assert_eq!( - total_rows_task1, 100, - "Task 1 should read only the first row group (100 rows), but got {total_rows_task1} rows" - ); - - assert_eq!( - total_rows_task2, 200, - "Task 2 should read only the second+third row groups (200 rows), but got {total_rows_task2} rows" - ); - - // Verify the actual data values are correct (not just the row count) - if total_rows_task1 > 0 { - let first_batch = &result1[0]; - let id_col = first_batch - .column(0) - .as_primitive::(); - let first_val = id_col.value(0); - let last_val = id_col.value(id_col.len() - 1); - println!("Task 1 data range: {first_val} to {last_val}"); - - assert_eq!(first_val, 0, "Task 1 should start with id=0"); - assert_eq!(last_val, 99, "Task 1 should end with id=99"); - } - - if total_rows_task2 > 0 { - let first_batch = &result2[0]; - let id_col = first_batch - .column(0) - .as_primitive::(); - let first_val = id_col.value(0); - println!("Task 2 first value: {first_val}"); - - assert_eq!(first_val, 100, "Task 2 should start with id=100, not id=0"); - } - } - - /// Test schema evolution: reading old Parquet file (with only column 'a') - /// using a newer table schema (with columns 'a' and 'b'). - /// This tests that: - /// 1. get_arrow_projection_mask allows missing columns - /// 2. 
RecordBatchTransformer adds missing column 'b' with NULL values - #[tokio::test] - async fn test_schema_evolution_add_column() { - use arrow_array::{Array, Int32Array}; - - // New table schema: columns 'a' and 'b' (b was added later, file only has 'a') - let new_schema = Arc::new( - Schema::builder() - .with_schema_id(2) - .with_fields(vec![ - NestedField::required(1, "a", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(2, "b", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - // Create Arrow schema for old Parquet file (only has column 'a') - let arrow_schema_old = Arc::new(ArrowSchema::new(vec![ - Field::new("a", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - // Write old Parquet file with only column 'a' - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let data_a = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef; - let to_write = RecordBatch::try_new(arrow_schema_old.clone(), vec![data_a]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - let file = File::create(format!("{table_location}/old_file.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - // Read the old Parquet file using the NEW schema (with column 'b') - let reader = ArrowReaderBuilder::new(file_io).build(); - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/old_file.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/old_file.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: 
new_schema.clone(), - project_field_ids: vec![1, 2], // Request both columns 'a' and 'b' - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Verify we got the correct data - assert_eq!(result.len(), 1); - let batch = &result[0]; - - // Should have 2 columns now - assert_eq!(batch.num_columns(), 2); - assert_eq!(batch.num_rows(), 3); - - // Column 'a' should have the original data - let col_a = batch - .column(0) - .as_primitive::(); - assert_eq!(col_a.values(), &[1, 2, 3]); - - // Column 'b' should be all NULLs (it didn't exist in the old file) - let col_b = batch - .column(1) - .as_primitive::(); - assert_eq!(col_b.null_count(), 3); - assert!(col_b.is_null(0)); - assert!(col_b.is_null(1)); - assert!(col_b.is_null(2)); - } - - /// Test for bug where position deletes in later row groups are not applied correctly. - /// - /// When a file has multiple row groups and a position delete targets a row in a later - /// row group, the `build_deletes_row_selection` function had a bug where it would - /// fail to increment `current_row_group_base_idx` when skipping row groups. - /// - /// This test creates: - /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) - /// - A position delete file that deletes row 199 (last row in second row group) - /// - /// Expected behavior: Should return 199 rows (with id=200 deleted) - /// Bug behavior: Returns 200 rows (delete is not applied) - /// - /// This bug was discovered while running Apache Spark + Apache Iceberg integration tests - /// through DataFusion Comet. 
The following Iceberg Java tests failed due to this bug: - /// - `org.apache.iceberg.spark.extensions.TestMergeOnReadDelete::testDeleteWithMultipleRowGroupsParquet` - /// - `org.apache.iceberg.spark.extensions.TestMergeOnReadUpdate::testUpdateWithMultipleRowGroupsParquet` - #[tokio::test] - async fn test_position_delete_across_multiple_row_groups() { - use arrow_array::{Int32Array, Int64Array}; - use parquet::file::reader::{FileReader, SerializedFileReader}; - - // Field IDs for positional delete schema - const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; - const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - - // Create table schema with a single 'id' column - let table_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - // Step 1: Create data file with 200 rows in 2 row groups - // Row group 0: rows 0-99 (ids 1-100) - // Row group 1: rows 100-199 (ids 101-200) - let data_file_path = format!("{table_location}/data.parquet"); - - let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(1..=100), - )]) - .unwrap(); - - let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(101..=200), - )]) - .unwrap(); - - // Force each batch into its own row group - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(100)) - .build(); - - let file = File::create(&data_file_path).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), 
Some(props)).unwrap(); - writer.write(&batch1).expect("Writing batch 1"); - writer.write(&batch2).expect("Writing batch 2"); - writer.close().unwrap(); - - // Verify we created 2 row groups - let verify_file = File::open(&data_file_path).unwrap(); - let verify_reader = SerializedFileReader::new(verify_file).unwrap(); - assert_eq!( - verify_reader.metadata().num_row_groups(), - 2, - "Should have 2 row groups" - ); - - // Step 2: Create position delete file that deletes row 199 (id=200, last row in row group 1) - let delete_file_path = format!("{table_location}/deletes.parquet"); - - let delete_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), - )])), - Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_POS.to_string(), - )])), - ])); - - // Delete row at position 199 (0-indexed, so it's the last row: id=200) - let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![ - Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])), - Arc::new(Int64Array::from_iter_values(vec![199i64])), - ]) - .unwrap(); - - let delete_props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let delete_file = File::create(&delete_file_path).unwrap(); - let mut delete_writer = - ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap(); - delete_writer.write(&delete_batch).unwrap(); - delete_writer.close().unwrap(); - - // Step 3: Read the data file with the delete applied - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - let task = FileScanTask { - file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(), - start: 0, - length: 0, - record_count: Some(200), - data_file_path: data_file_path.clone(), 
- data_file_format: DataFileFormat::Parquet, - schema: table_schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![FileScanTaskDeleteFile { - file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(), - file_path: delete_file_path, - file_type: DataContentType::PositionDeletes, - partition_spec_id: 0, - equality_ids: None, - }], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Step 4: Verify we got 199 rows (not 200) - let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); - - println!("Total rows read: {total_rows}"); - println!("Expected: 199 rows (deleted row 199 which had id=200)"); - - // This assertion will FAIL before the fix and PASS after the fix - assert_eq!( - total_rows, 199, - "Expected 199 rows after deleting row 199, but got {total_rows} rows. \ - The bug causes position deletes in later row groups to be ignored." - ); - - // Verify the deleted row (id=200) is not present - let all_ids: Vec = result - .iter() - .flat_map(|batch| { - batch - .column(0) - .as_primitive::() - .values() - .iter() - .copied() - }) - .collect(); - - assert!( - !all_ids.contains(&200), - "Row with id=200 should be deleted but was found in results" - ); - - // Verify we have all other ids (1-199) - let expected_ids: Vec = (1..=199).collect(); - assert_eq!( - all_ids, expected_ids, - "Should have ids 1-199 but got different values" - ); - } - - /// Test for bug where position deletes are lost when skipping unselected row groups. - /// - /// This is a variant of `test_position_delete_across_multiple_row_groups` that exercises - /// the row group selection code path (`selected_row_groups: Some([...])`). 
- /// - /// When a file has multiple row groups and only some are selected for reading, - /// the `build_deletes_row_selection` function must correctly skip over deletes in - /// unselected row groups WITHOUT consuming deletes that belong to selected row groups. - /// - /// This test creates: - /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) - /// - A position delete file that deletes row 199 (last row in second row group) - /// - Row group selection that reads ONLY row group 1 (rows 100-199) - /// - /// Expected behavior: Should return 99 rows (with row 199 deleted) - /// Bug behavior: Returns 100 rows (delete is lost when skipping row group 0) - /// - /// The bug occurs when processing row group 0 (unselected): - /// ```rust - /// delete_vector_iter.advance_to(next_row_group_base_idx); // Position at first delete >= 100 - /// next_deleted_row_idx_opt = delete_vector_iter.next(); // BUG: Consumes delete at 199! - /// ``` - /// - /// The fix is to NOT call `next()` after `advance_to()` when skipping unselected row groups, - /// because `advance_to()` already positions the iterator correctly without consuming elements. 
- #[tokio::test] - async fn test_position_delete_with_row_group_selection() { - use arrow_array::{Int32Array, Int64Array}; - use parquet::file::reader::{FileReader, SerializedFileReader}; - - // Field IDs for positional delete schema - const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; - const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - - // Create table schema with a single 'id' column - let table_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - // Step 1: Create data file with 200 rows in 2 row groups - // Row group 0: rows 0-99 (ids 1-100) - // Row group 1: rows 100-199 (ids 101-200) - let data_file_path = format!("{table_location}/data.parquet"); - - let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(1..=100), - )]) - .unwrap(); - - let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(101..=200), - )]) - .unwrap(); - - // Force each batch into its own row group - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(100)) - .build(); - - let file = File::create(&data_file_path).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); - writer.write(&batch1).expect("Writing batch 1"); - writer.write(&batch2).expect("Writing batch 2"); - writer.close().unwrap(); - - // Verify we created 2 row groups - let verify_file = File::open(&data_file_path).unwrap(); - let verify_reader = 
SerializedFileReader::new(verify_file).unwrap(); - assert_eq!( - verify_reader.metadata().num_row_groups(), - 2, - "Should have 2 row groups" - ); - - // Step 2: Create position delete file that deletes row 199 (id=200, last row in row group 1) - let delete_file_path = format!("{table_location}/deletes.parquet"); - - let delete_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), - )])), - Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_POS.to_string(), - )])), - ])); - - // Delete row at position 199 (0-indexed, so it's the last row: id=200) - let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![ - Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])), - Arc::new(Int64Array::from_iter_values(vec![199i64])), - ]) - .unwrap(); - - let delete_props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let delete_file = File::create(&delete_file_path).unwrap(); - let mut delete_writer = - ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap(); - delete_writer.write(&delete_batch).unwrap(); - delete_writer.close().unwrap(); - - // Step 3: Get byte ranges to read ONLY row group 1 (rows 100-199) - // This exercises the row group selection code path where row group 0 is skipped - let metadata_file = File::open(&data_file_path).unwrap(); - let metadata_reader = SerializedFileReader::new(metadata_file).unwrap(); - let metadata = metadata_reader.metadata(); - - let row_group_0 = metadata.row_group(0); - let row_group_1 = metadata.row_group(1); - - let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1" - let rg1_start = rg0_start + row_group_0.compressed_size() as u64; - let rg1_length = row_group_1.compressed_size() as u64; 
- - println!( - "Row group 0: starts at byte {}, {} bytes compressed", - rg0_start, - row_group_0.compressed_size() - ); - println!( - "Row group 1: starts at byte {}, {} bytes compressed", - rg1_start, - row_group_1.compressed_size() - ); - - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - // Create FileScanTask that reads ONLY row group 1 via byte range filtering - let task = FileScanTask { - file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(), - start: rg1_start, - length: rg1_length, - record_count: Some(100), // Row group 1 has 100 rows - data_file_path: data_file_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: table_schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![FileScanTaskDeleteFile { - file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(), - file_path: delete_file_path, - file_type: DataContentType::PositionDeletes, - partition_spec_id: 0, - equality_ids: None, - }], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Step 4: Verify we got 99 rows (not 100) - // Row group 1 has 100 rows (ids 101-200), minus 1 delete (id=200) = 99 rows - let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); - - println!("Total rows read from row group 1: {total_rows}"); - println!("Expected: 99 rows (row group 1 has 100 rows, 1 delete at position 199)"); - - // This assertion will FAIL before the fix and PASS after the fix - assert_eq!( - total_rows, 99, - "Expected 99 rows from row group 1 after deleting position 199, but got {total_rows} rows. \ - The bug causes position deletes to be lost when advance_to() is followed by next() \ - when skipping unselected row groups." 
- ); - - // Verify the deleted row (id=200) is not present - let all_ids: Vec = result - .iter() - .flat_map(|batch| { - batch - .column(0) - .as_primitive::() - .values() - .iter() - .copied() - }) - .collect(); - - assert!( - !all_ids.contains(&200), - "Row with id=200 should be deleted but was found in results" - ); - - // Verify we have ids 101-199 (not 101-200) - let expected_ids: Vec = (101..=199).collect(); - assert_eq!( - all_ids, expected_ids, - "Should have ids 101-199 but got different values" - ); - } - /// Test for bug where stale cached delete causes infinite loop when skipping row groups. - /// - /// This test exposes the inverse scenario of `test_position_delete_with_row_group_selection`: - /// - Position delete targets a row in the SKIPPED row group (not the selected one) - /// - After calling advance_to(), the cached delete index is stale - /// - Without updating the cache, the code enters an infinite loop - /// - /// This test creates: - /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) - /// - A position delete file that deletes row 0 (first row in SKIPPED row group 0) - /// - Row group selection that reads ONLY row group 1 (rows 100-199) - /// - /// The bug occurs when skipping row group 0: - /// ```rust - /// let mut next_deleted_row_idx_opt = delete_vector_iter.next(); // Some(0) - /// // ... skip to row group 1 ... - /// delete_vector_iter.advance_to(100); // Iterator advances past delete at 0 - /// // BUG: next_deleted_row_idx_opt is still Some(0) - STALE! - /// // When processing row group 1: - /// // current_idx = 100, next_deleted_row_idx = 0, next_row_group_base_idx = 200 - /// // Loop condition: 0 < 200 (true) - /// // But: current_idx (100) > next_deleted_row_idx (0) - /// // And: current_idx (100) != next_deleted_row_idx (0) - /// // Neither branch executes -> INFINITE LOOP! 
- /// ``` - /// - /// Expected behavior: Should return 100 rows (delete at 0 doesn't affect row group 1) - /// Bug behavior: Infinite loop in build_deletes_row_selection - #[tokio::test] - async fn test_position_delete_in_skipped_row_group() { - use arrow_array::{Int32Array, Int64Array}; - use parquet::file::reader::{FileReader, SerializedFileReader}; - - // Field IDs for positional delete schema - const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; - const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - - // Create table schema with a single 'id' column - let table_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - ])); - - // Step 1: Create data file with 200 rows in 2 row groups - // Row group 0: rows 0-99 (ids 1-100) - // Row group 1: rows 100-199 (ids 101-200) - let data_file_path = format!("{table_location}/data.parquet"); - - let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(1..=100), - )]) - .unwrap(); - - let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( - Int32Array::from_iter_values(101..=200), - )]) - .unwrap(); - - // Force each batch into its own row group - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_max_row_group_row_count(Some(100)) - .build(); - - let file = File::create(&data_file_path).unwrap(); - let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); - writer.write(&batch1).expect("Writing batch 1"); - writer.write(&batch2).expect("Writing 
batch 2"); - writer.close().unwrap(); - - // Verify we created 2 row groups - let verify_file = File::open(&data_file_path).unwrap(); - let verify_reader = SerializedFileReader::new(verify_file).unwrap(); - assert_eq!( - verify_reader.metadata().num_row_groups(), - 2, - "Should have 2 row groups" - ); - - // Step 2: Create position delete file that deletes row 0 (id=1, first row in row group 0) - let delete_file_path = format!("{table_location}/deletes.parquet"); - - let delete_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), - )])), - Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - FIELD_ID_POSITIONAL_DELETE_POS.to_string(), - )])), - ])); - - // Delete row at position 0 (0-indexed, so it's the first row: id=1) - let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![ - Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])), - Arc::new(Int64Array::from_iter_values(vec![0i64])), - ]) - .unwrap(); - - let delete_props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let delete_file = File::create(&delete_file_path).unwrap(); - let mut delete_writer = - ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap(); - delete_writer.write(&delete_batch).unwrap(); - delete_writer.close().unwrap(); - - // Step 3: Get byte ranges to read ONLY row group 1 (rows 100-199) - // This exercises the row group selection code path where row group 0 is skipped - let metadata_file = File::open(&data_file_path).unwrap(); - let metadata_reader = SerializedFileReader::new(metadata_file).unwrap(); - let metadata = metadata_reader.metadata(); - - let row_group_0 = metadata.row_group(0); - let row_group_1 = metadata.row_group(1); - - let rg0_start = 4u64; // Parquet files start 
with 4-byte magic "PAR1" - let rg1_start = rg0_start + row_group_0.compressed_size() as u64; - let rg1_length = row_group_1.compressed_size() as u64; - - let file_io = FileIO::new_with_fs(); - let reader = ArrowReaderBuilder::new(file_io).build(); - - // Create FileScanTask that reads ONLY row group 1 via byte range filtering - let task = FileScanTask { - file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(), - start: rg1_start, - length: rg1_length, - record_count: Some(100), // Row group 1 has 100 rows - data_file_path: data_file_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: table_schema.clone(), - project_field_ids: vec![1], - predicate: None, - deletes: vec![FileScanTaskDeleteFile { - file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(), - file_path: delete_file_path, - file_type: DataContentType::PositionDeletes, - partition_spec_id: 0, - equality_ids: None, - }], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }; - - let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Step 4: Verify we got 100 rows (all of row group 1) - // The delete at position 0 is in row group 0, which is skipped, so it doesn't affect us - let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); - - assert_eq!( - total_rows, 100, - "Expected 100 rows from row group 1 (delete at position 0 is in skipped row group 0). \ - If this hangs or fails, it indicates the cached delete index was not updated after advance_to()." 
- ); - - // Verify we have all ids from row group 1 (101-200) - let all_ids: Vec = result - .iter() - .flat_map(|batch| { - batch - .column(0) - .as_primitive::() - .values() - .iter() - .copied() - }) - .collect(); - - let expected_ids: Vec = (101..=200).collect(); - assert_eq!( - all_ids, expected_ids, - "Should have ids 101-200 (all of row group 1)" - ); - } - - /// Test reading Parquet files without field ID metadata (e.g., migrated tables). - /// This exercises the position-based fallback path. - /// - /// Corresponds to Java's ParquetSchemaUtil.addFallbackIds() + pruneColumnsFallback() - /// in /parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java - #[tokio::test] - async fn test_read_parquet_file_without_field_ids() { - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "age", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - // Parquet file from a migrated table - no field ID metadata - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("age", DataType::Int32, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let name_data = vec!["Alice", "Bob", "Charlie"]; - let age_data = vec![30, 25, 35]; - - use arrow_array::Int32Array; - let name_col = Arc::new(StringArray::from(name_data.clone())) as ArrayRef; - let age_col = Arc::new(Int32Array::from(age_data.clone())) as ArrayRef; - - let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![name_col, age_col]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, 
to_write.schema(), Some(props)).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 3); - assert_eq!(batch.num_columns(), 2); - - // Verify position-based mapping: field_id 1 → position 0, field_id 2 → position 1 - let name_array = batch.column(0).as_string::(); - assert_eq!(name_array.value(0), "Alice"); - assert_eq!(name_array.value(1), "Bob"); - assert_eq!(name_array.value(2), "Charlie"); - - let age_array = batch - .column(1) - .as_primitive::(); - assert_eq!(age_array.value(0), 30); - assert_eq!(age_array.value(1), 25); - assert_eq!(age_array.value(2), 35); - } - - /// Test reading Parquet files without field IDs with partial projection. - /// Only a subset of columns are requested, verifying position-based fallback - /// handles column selection correctly. 
- #[tokio::test] - async fn test_read_parquet_without_field_ids_partial_projection() { - use arrow_array::Int32Array; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "col1", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "col2", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required(3, "col3", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(4, "col4", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("col1", DataType::Utf8, false), - Field::new("col2", DataType::Int32, false), - Field::new("col3", DataType::Utf8, false), - Field::new("col4", DataType::Int32, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let col1_data = Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef; - let col2_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; - let col3_data = Arc::new(StringArray::from(vec!["c", "d"])) as ArrayRef; - let col4_data = Arc::new(Int32Array::from(vec![30, 40])) as ArrayRef; - - let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![ - col1_data, col2_data, col3_data, col4_data, - ]) - .unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - 
record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 3], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 2); - assert_eq!(batch.num_columns(), 2); - - let col1_array = batch.column(0).as_string::(); - assert_eq!(col1_array.value(0), "a"); - assert_eq!(col1_array.value(1), "b"); - - let col3_array = batch.column(1).as_string::(); - assert_eq!(col3_array.value(0), "c"); - assert_eq!(col3_array.value(1), "d"); - } - - /// Test reading Parquet files without field IDs with schema evolution. - /// The Iceberg schema has more fields than the Parquet file, testing that - /// missing columns are filled with NULLs. 
- #[tokio::test] - async fn test_read_parquet_without_field_ids_schema_evolution() { - use arrow_array::{Array, Int32Array}; - - // Schema with field 3 added after the file was written - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "age", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(3, "city", Type::Primitive(PrimitiveType::String)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("age", DataType::Int32, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; - let age_data = Arc::new(Int32Array::from(vec![30, 25])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![name_data, age_data]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2, 3], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - 
case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 2); - assert_eq!(batch.num_columns(), 3); - - let name_array = batch.column(0).as_string::(); - assert_eq!(name_array.value(0), "Alice"); - assert_eq!(name_array.value(1), "Bob"); - - let age_array = batch - .column(1) - .as_primitive::(); - assert_eq!(age_array.value(0), 30); - assert_eq!(age_array.value(1), 25); - - // Verify missing column filled with NULLs - let city_array = batch.column(2).as_string::(); - assert_eq!(city_array.null_count(), 2); - assert!(city_array.is_null(0)); - assert!(city_array.is_null(1)); - } - - /// Test reading Parquet files without field IDs that have multiple row groups. - /// This ensures the position-based fallback works correctly across row group boundaries. - #[tokio::test] - async fn test_read_parquet_without_field_ids_multiple_row_groups() { - use arrow_array::Int32Array; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(2, "value", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("value", DataType::Int32, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - // Small row group size to create multiple row groups - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .set_write_batch_size(2) - .set_max_row_group_row_count(Some(2)) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = 
ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); - - // Write 6 rows in 3 batches (will create 3 row groups) - for batch_num in 0..3 { - let name_data = Arc::new(StringArray::from(vec![ - format!("name_{}", batch_num * 2), - format!("name_{}", batch_num * 2 + 1), - ])) as ArrayRef; - let value_data = - Arc::new(Int32Array::from(vec![batch_num * 2, batch_num * 2 + 1])) as ArrayRef; - - let batch = - RecordBatch::try_new(arrow_schema.clone(), vec![name_data, value_data]).unwrap(); - writer.write(&batch).expect("Writing batch"); - } - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - assert!(!result.is_empty()); - - let mut all_names = Vec::new(); - let mut all_values = Vec::new(); - - for batch in &result { - let name_array = batch.column(0).as_string::(); - let value_array = batch - .column(1) - .as_primitive::(); - - for i in 0..batch.num_rows() { - all_names.push(name_array.value(i).to_string()); - all_values.push(value_array.value(i)); - } - } - - assert_eq!(all_names.len(), 6); - assert_eq!(all_values.len(), 6); - - for i in 0..6 { - assert_eq!(all_names[i], format!("name_{i}")); - assert_eq!(all_values[i], i as i32); - } - } - - /// Test reading Parquet files without field IDs with nested types (struct). 
- /// Java's pruneColumnsFallback() projects entire top-level columns including nested content. - /// This test verifies that a top-level struct field is projected correctly with all its nested fields. - #[tokio::test] - async fn test_read_parquet_without_field_ids_with_struct() { - use arrow_array::{Int32Array, StructArray}; - use arrow_schema::Fields; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required( - 2, - "person", - Type::Struct(crate::spec::StructType::new(vec![ - NestedField::required( - 3, - "name", - Type::Primitive(PrimitiveType::String), - ) - .into(), - NestedField::required(4, "age", Type::Primitive(PrimitiveType::Int)) - .into(), - ])), - ) - .into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new( - "person", - DataType::Struct(Fields::from(vec![ - Field::new("name", DataType::Utf8, false), - Field::new("age", DataType::Int32, false), - ])), - false, - ), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let id_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; - let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; - let age_data = Arc::new(Int32Array::from(vec![30, 25])) as ArrayRef; - let person_data = Arc::new(StructArray::from(vec![ - ( - Arc::new(Field::new("name", DataType::Utf8, false)), - name_data, - ), - ( - Arc::new(Field::new("age", DataType::Int32, false)), - age_data, - ), - ])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![id_data, person_data]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = 
File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 2); - assert_eq!(batch.num_columns(), 2); - - let id_array = batch - .column(0) - .as_primitive::(); - assert_eq!(id_array.value(0), 1); - assert_eq!(id_array.value(1), 2); - - let person_array = batch.column(1).as_struct(); - assert_eq!(person_array.num_columns(), 2); - - let name_array = person_array.column(0).as_string::(); - assert_eq!(name_array.value(0), "Alice"); - assert_eq!(name_array.value(1), "Bob"); - - let age_array = person_array - .column(1) - .as_primitive::(); - assert_eq!(age_array.value(0), 30); - assert_eq!(age_array.value(1), 25); - } - - /// Test reading Parquet files without field IDs with schema evolution - column added in the middle. - /// When a new column is inserted between existing columns in the schema order, - /// the fallback projection must correctly map field IDs to output positions. 
- #[tokio::test] - async fn test_read_parquet_without_field_ids_schema_evolution_add_column_in_middle() { - use arrow_array::{Array, Int32Array}; - - let arrow_schema_old = Arc::new(ArrowSchema::new(vec![ - Field::new("col0", DataType::Int32, true), - Field::new("col1", DataType::Int32, true), - ])); - - // New column added between existing columns: col0 (id=1), newCol (id=5), col1 (id=2) - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::optional(1, "col0", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(5, "newCol", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(2, "col1", Type::Primitive(PrimitiveType::Int)).into(), - ]) - .build() - .unwrap(), - ); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let col0_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; - let col1_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema_old.clone(), vec![col0_data, col1_data]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - let reader = ArrowReaderBuilder::new(file_io).build(); - - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 5, 2], - predicate: None, - deletes: vec![], - partition: None, - 
partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - assert_eq!(result.len(), 1); - let batch = &result[0]; - assert_eq!(batch.num_rows(), 2); - assert_eq!(batch.num_columns(), 3); - - let result_col0 = batch - .column(0) - .as_primitive::(); - assert_eq!(result_col0.value(0), 1); - assert_eq!(result_col0.value(1), 2); - - // New column should be NULL (doesn't exist in old file) - let result_newcol = batch - .column(1) - .as_primitive::(); - assert_eq!(result_newcol.null_count(), 2); - assert!(result_newcol.is_null(0)); - assert!(result_newcol.is_null(1)); - - let result_col1 = batch - .column(2) - .as_primitive::(); - assert_eq!(result_col1.value(0), 10); - assert_eq!(result_col1.value(1), 20); - } - - /// Test reading Parquet files without field IDs with a filter that eliminates all row groups. - /// During development of field ID mapping, we saw a panic when row_selection_enabled=true and - /// all row groups are filtered out. 
- #[tokio::test] - async fn test_read_parquet_without_field_ids_filter_eliminates_all_rows() { - use arrow_array::{Float64Array, Int32Array}; - - // Schema with fields that will use fallback IDs 1, 2, 3 - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), - NestedField::required(3, "value", Type::Primitive(PrimitiveType::Double)) - .into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, false), - Field::new("value", DataType::Float64, false), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - // Write data where all ids are >= 10 - let id_data = Arc::new(Int32Array::from(vec![10, 11, 12])) as ArrayRef; - let name_data = Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef; - let value_data = Arc::new(Float64Array::from(vec![100.0, 200.0, 300.0])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![id_data, name_data, value_data]) - .unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - - let file = File::create(format!("{table_location}/1.parquet")).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - // Filter that eliminates all row groups: id < 5 - let predicate = Reference::new("id").less_than(Datum::int(5)); - - // Enable both row_group_filtering and row_selection - triggered the panic - let reader = ArrowReaderBuilder::new(file_io) - .with_row_group_filtering_enabled(true) - .with_row_selection_enabled(true) - .build(); - - 
let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2, 3], - predicate: Some(predicate.bind(schema, true).unwrap()), - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - // Should no longer panic - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Should return empty results - assert!(result.is_empty() || result.iter().all(|batch| batch.num_rows() == 0)); - } - - /// Test that concurrency=1 reads all files correctly and in deterministic order. - /// This verifies the fast-path optimization for single concurrency. - #[tokio::test] - async fn test_read_with_concurrency_one() { - use arrow_array::Int32Array; - - let schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::required(2, "file_num", Type::Primitive(PrimitiveType::Int)) - .into(), - ]) - .build() - .unwrap(), - ); - - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - Field::new("file_num", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "2".to_string(), - )])), - ])); - - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - // Create 3 parquet files with different data - let props = WriterProperties::builder() - 
.set_compression(Compression::SNAPPY) - .build(); - - for file_num in 0..3 { - let id_data = Arc::new(Int32Array::from_iter_values( - file_num * 10..(file_num + 1) * 10, - )) as ArrayRef; - let file_num_data = Arc::new(Int32Array::from(vec![file_num; 10])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![id_data, file_num_data]).unwrap(); - - let file = File::create(format!("{table_location}/file_{file_num}.parquet")).unwrap(); - let mut writer = - ArrowWriter::try_new(file, to_write.schema(), Some(props.clone())).unwrap(); - writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - } - - // Read with concurrency=1 (fast-path) - let reader = ArrowReaderBuilder::new(file_io) - .with_data_file_concurrency_limit(1) - .build(); - - // Create tasks in a specific order: file_0, file_1, file_2 - let tasks = vec![ - Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_0.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/file_0.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }), - Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_1.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/file_1.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }), - Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_2.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - 
data_file_path: format!("{table_location}/file_2.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - }), - ]; - - let tasks_stream = Box::pin(futures::stream::iter(tasks)) as FileScanTaskStream; - - let result = reader - .read(tasks_stream) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Verify we got all 30 rows (10 from each file) - let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); - assert_eq!(total_rows, 30, "Should have 30 total rows"); - - // Collect all ids and file_nums to verify data - let mut all_ids = Vec::new(); - let mut all_file_nums = Vec::new(); - - for batch in &result { - let id_col = batch - .column(0) - .as_primitive::(); - let file_num_col = batch - .column(1) - .as_primitive::(); - - for i in 0..batch.num_rows() { - all_ids.push(id_col.value(i)); - all_file_nums.push(file_num_col.value(i)); - } - } - - assert_eq!(all_ids.len(), 30); - assert_eq!(all_file_nums.len(), 30); - - // With concurrency=1 and sequential processing, files should be processed in order - // file_0: ids 0-9, file_num=0 - // file_1: ids 10-19, file_num=1 - // file_2: ids 20-29, file_num=2 - for i in 0..10 { - assert_eq!(all_file_nums[i], 0, "First 10 rows should be from file_0"); - assert_eq!(all_ids[i], i as i32, "IDs should be 0-9"); - } - for i in 10..20 { - assert_eq!(all_file_nums[i], 1, "Next 10 rows should be from file_1"); - assert_eq!(all_ids[i], i as i32, "IDs should be 10-19"); - } - for i in 20..30 { - assert_eq!(all_file_nums[i], 2, "Last 10 rows should be from file_2"); - assert_eq!(all_ids[i], i as i32, "IDs should be 20-29"); - } - } - - /// Test bucket partitioning reads source column from data file (not partition metadata). - /// - /// This is an integration test verifying the complete ArrowReader pipeline with bucket partitioning. 
- /// It corresponds to TestRuntimeFiltering tests in Iceberg Java (e.g., testRenamedSourceColumnTable). - /// - /// # Iceberg Spec Requirements - /// - /// Per the Iceberg spec "Column Projection" section: - /// > "Return the value from partition metadata if an **Identity Transform** exists for the field" - /// - /// This means: - /// - Identity transforms (e.g., `identity(dept)`) use constants from partition metadata - /// - Non-identity transforms (e.g., `bucket(4, id)`) must read source columns from data files - /// - Partition metadata for bucket transforms stores bucket numbers (0-3), NOT source values - /// - /// Java's PartitionUtil.constantsMap() implements this via: - /// ```java - /// if (field.transform().isIdentity()) { - /// idToConstant.put(field.sourceId(), converted); - /// } - /// ``` - /// - /// # What This Test Verifies - /// - /// This test ensures the full ArrowReader → RecordBatchTransformer pipeline correctly handles - /// bucket partitioning when FileScanTask provides partition_spec and partition_data: - /// - /// - Parquet file has field_id=1 named "id" with actual data [1, 5, 9, 13] - /// - FileScanTask specifies partition_spec with bucket(4, id) and partition_data with bucket=1 - /// - RecordBatchTransformer.constants_map() excludes bucket-partitioned field from constants - /// - ArrowReader correctly reads [1, 5, 9, 13] from the data file - /// - Values are NOT replaced with constant 1 from partition metadata - /// - /// # Why This Matters - /// - /// Without correct handling: - /// - Runtime filtering would break (e.g., `WHERE id = 5` would fail) - /// - Query results would be incorrect (all rows would have id=1) - /// - Bucket partitioning would be unusable for query optimization - /// - /// # References - /// - Iceberg spec: format/spec.md "Column Projection" + "Partition Transforms" - /// - Java test: spark/src/test/java/.../TestRuntimeFiltering.java - /// - Java impl: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java - 
#[tokio::test] - async fn test_bucket_partitioning_reads_source_column_from_file() { - use arrow_array::Int32Array; - - use crate::spec::{Literal, PartitionSpec, Struct, Transform}; - - // Iceberg schema with id and name columns - let schema = Arc::new( - Schema::builder() - .with_schema_id(0) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(2, "name", Type::Primitive(PrimitiveType::String)).into(), - ]) - .build() - .unwrap(), - ); - - // Partition spec: bucket(4, id) - let partition_spec = Arc::new( - PartitionSpec::builder(schema.clone()) - .with_spec_id(0) - .add_partition_field("id", "id_bucket", Transform::Bucket(4)) - .unwrap() - .build() - .unwrap(), - ); - - // Partition data: bucket value is 1 - let partition_data = Struct::from_iter(vec![Some(Literal::int(1))]); - - // Create Arrow schema with field IDs for Parquet file - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - Field::new("name", DataType::Utf8, true).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "2".to_string(), - )])), - ])); - - // Write Parquet file with data - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::new_with_fs(); - - let id_data = Arc::new(Int32Array::from(vec![1, 5, 9, 13])) as ArrayRef; - let name_data = - Arc::new(StringArray::from(vec!["Alice", "Bob", "Charlie", "Dave"])) as ArrayRef; - - let to_write = - RecordBatch::try_new(arrow_schema.clone(), vec![id_data, name_data]).unwrap(); - - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - let file = File::create(format!("{}/data.parquet", &table_location)).unwrap(); - let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); - 
writer.write(&to_write).expect("Writing batch"); - writer.close().unwrap(); - - // Read the Parquet file with partition spec and data - let reader = ArrowReaderBuilder::new(file_io).build(); - let tasks = Box::pin(futures::stream::iter( - vec![Ok(FileScanTask { - file_size_in_bytes: std::fs::metadata(format!("{table_location}/data.parquet")) - .unwrap() - .len(), - start: 0, - length: 0, - record_count: None, - data_file_path: format!("{table_location}/data.parquet"), - data_file_format: DataFileFormat::Parquet, - schema: schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: Some(partition_data), - partition_spec: Some(partition_spec), - name_mapping: None, - case_sensitive: false, - })] - .into_iter(), - )) as FileScanTaskStream; - - let result = reader - .read(tasks) - .unwrap() - .try_collect::>() - .await - .unwrap(); - - // Verify we got the correct data - assert_eq!(result.len(), 1); - let batch = &result[0]; - - assert_eq!(batch.num_columns(), 2); - assert_eq!(batch.num_rows(), 4); - - // The id column MUST contain actual values from the Parquet file [1, 5, 9, 13], - // NOT the constant partition value 1 - let id_col = batch - .column(0) - .as_primitive::(); - assert_eq!(id_col.value(0), 1); - assert_eq!(id_col.value(1), 5); - assert_eq!(id_col.value(2), 9); - assert_eq!(id_col.value(3), 13); - - let name_col = batch.column(1).as_string::(); - assert_eq!(name_col.value(0), "Alice"); - assert_eq!(name_col.value(1), "Bob"); - assert_eq!(name_col.value(2), "Charlie"); - assert_eq!(name_col.value(3), "Dave"); - } - - #[test] - fn test_merge_ranges_empty() { - assert_eq!(super::merge_ranges(&[], 1024), Vec::>::new()); - } - - #[test] - fn test_merge_ranges_no_coalesce() { - // Ranges far apart should not be merged - let ranges = vec![0..100, 1_000_000..1_000_100]; - let merged = super::merge_ranges(&ranges, 1024); - assert_eq!(merged, vec![0..100, 1_000_000..1_000_100]); - } - - #[test] - fn 
test_merge_ranges_coalesce() { - // Ranges within the gap threshold should be merged - let ranges = vec![0..100, 200..300, 500..600]; - let merged = super::merge_ranges(&ranges, 1024); - assert_eq!(merged, vec![0..600]); - } - - #[test] - fn test_merge_ranges_overlapping() { - let ranges = vec![0..200, 100..300]; - let merged = super::merge_ranges(&ranges, 0); - assert_eq!(merged, vec![0..300]); - } - - #[test] - fn test_merge_ranges_unsorted() { - let ranges = vec![500..600, 0..100, 200..300]; - let merged = super::merge_ranges(&ranges, 1024); - assert_eq!(merged, vec![0..600]); - } - - /// Mock FileRead backed by a flat byte buffer. - struct MockFileRead { - data: bytes::Bytes, - } - - impl MockFileRead { - fn new(size: usize) -> Self { - // Fill with sequential byte values so slices are verifiable. - let data: Vec = (0..size).map(|i| (i % 256) as u8).collect(); - Self { - data: bytes::Bytes::from(data), - } - } - } - - #[async_trait::async_trait] - impl crate::io::FileRead for MockFileRead { - async fn read(&self, range: Range) -> crate::Result { - Ok(self.data.slice(range.start as usize..range.end as usize)) - } - } - - #[tokio::test] - async fn test_get_byte_ranges_no_coalesce() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(2048); - let expected_0 = mock.data.slice(0..100); - let expected_1 = mock.data.slice(1500..1600); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 2048 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_coalesce_bytes(0) - .build(), - ); - - let result = reader - .get_byte_ranges(vec![0..100, 1500..1600]) - .await - .unwrap(); - - assert_eq!(result.len(), 2); - assert_eq!(result[0], expected_0); - assert_eq!(result[1], expected_1); - } - - #[tokio::test] - async fn test_get_byte_ranges_with_coalesce() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(1024); - let expected_0 = 
mock.data.slice(0..100); - let expected_1 = mock.data.slice(200..300); - let expected_2 = mock.data.slice(500..600); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 1024 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_coalesce_bytes(1024) - .build(), - ); - - // All ranges within coalesce threshold — should merge into one fetch. - let result = reader - .get_byte_ranges(vec![0..100, 200..300, 500..600]) - .await - .unwrap(); - - assert_eq!(result.len(), 3); - assert_eq!(result[0], expected_0); - assert_eq!(result[1], expected_1); - assert_eq!(result[2], expected_2); - } - - #[tokio::test] - async fn test_get_byte_ranges_empty() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(1024); - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 1024 }, Box::new(mock)); - - let result = reader.get_byte_ranges(vec![]).await.unwrap(); - assert!(result.is_empty()); - } - - #[tokio::test] - async fn test_get_byte_ranges_coalesce_max() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(2048); - let expected_0 = mock.data.slice(0..100); - let expected_1 = mock.data.slice(1500..1600); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 2048 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_coalesce_bytes(u64::MAX) - .build(), - ); - - // u64::MAX coalesce — all ranges merge into a single fetch. - let result = reader - .get_byte_ranges(vec![0..100, 1500..1600]) - .await - .unwrap(); - - assert_eq!(result.len(), 2); - assert_eq!(result[0], expected_0); - assert_eq!(result[1], expected_1); - } - - #[tokio::test] - async fn test_get_byte_ranges_concurrency_zero() { - use parquet::arrow::async_reader::AsyncFileReader; - - // concurrency=0 is clamped to 1, so this should not hang. 
- let mock = MockFileRead::new(1024); - let expected = mock.data.slice(0..100); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 1024 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_fetch_concurrency(0) - .build(), - ); - - let result = reader - .get_byte_ranges(vec![0..100, 200..300]) - .await - .unwrap(); - assert_eq!(result.len(), 2); - assert_eq!(result[0], expected); - } - - #[tokio::test] - async fn test_get_byte_ranges_concurrency_one() { - use parquet::arrow::async_reader::AsyncFileReader; - - let mock = MockFileRead::new(2048); - let expected_0 = mock.data.slice(0..100); - let expected_1 = mock.data.slice(500..600); - let expected_2 = mock.data.slice(1500..1600); - - let mut reader = - super::ArrowFileReader::new(crate::io::FileMetadata { size: 2048 }, Box::new(mock)) - .with_parquet_read_options( - super::ParquetReadOptions::builder() - .with_range_coalesce_bytes(0) - .with_range_fetch_concurrency(1) - .build(), - ); - - // concurrency=1 with no coalescing — sequential fetches. - let result = reader - .get_byte_ranges(vec![0..100, 500..600, 1500..1600]) - .await - .unwrap(); - - assert_eq!(result.len(), 3); - assert_eq!(result[0], expected_0); - assert_eq!(result[1], expected_1); - assert_eq!(result[2], expected_2); - } - - /// Test that a Parquet file written with Arrow Binary type can be read when the - /// Iceberg schema declares the column as Fixed(N). - /// - /// This reproduces a real-world issue where Snowflake writes `FIXED_LEN_BYTE_ARRAY` - /// columns that the Arrow Parquet reader decodes as `Binary` rather than - /// `FixedSizeBinary(N)`. Without the `(Binary, Fixed(_))` arm in - /// `type_promotion_is_valid`, the column is silently excluded from projection and - /// filled with nulls. 
- #[tokio::test] - async fn test_binary_to_fixed_type_promotion() { - // UUID-like 16-byte values - let uuid_bytes: Vec<[u8; 16]> = vec![ - [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], - [ - 0xA1, 0xB2, 0xC3, 0xD4, 0xE5, 0xF6, 0x07, 0x18, 0x29, 0x3A, 0x4B, 0x5C, 0x6D, 0x7E, - 0x8F, 0x90, - ], - [0xFF; 16], - ]; - let int_data = vec![1i32, 2, 3]; - - // Iceberg schema: field 1 = Int, field 2 = Fixed(16) - let iceberg_schema = Arc::new( - Schema::builder() - .with_schema_id(1) - .with_fields(vec![ - NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), - NestedField::optional(2, "uuid_col", Type::Primitive(PrimitiveType::Fixed(16))) - .into(), - ]) - .build() - .unwrap(), - ); - - // Arrow schema: write uuid_col as Binary (not FixedSizeBinary), simulating - // what the Arrow Parquet reader produces for some writers (e.g. Snowflake). - let arrow_schema = Arc::new(ArrowSchema::new(vec![ - Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "1".to_string(), - )])), - Field::new("uuid_col", DataType::Binary, true).with_metadata(HashMap::from([( - PARQUET_FIELD_ID_META_KEY.to_string(), - "2".to_string(), - )])), - ])); - - let id_col = Arc::new(Int32Array::from(int_data.clone())) as ArrayRef; - let uuid_col = Arc::new(BinaryArray::from_vec( - uuid_bytes.iter().map(|b| b.as_slice()).collect(), - )) as ArrayRef; - - let batch = RecordBatch::try_new(arrow_schema.clone(), vec![id_col, uuid_col]).unwrap(); - - // Write Parquet file - let tmp_dir = TempDir::new().unwrap(); - let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let parquet_path = format!("{table_location}/1.parquet"); - let file = File::create(&parquet_path).unwrap(); - let props = WriterProperties::builder() - .set_compression(Compression::SNAPPY) - .build(); - let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(props)).unwrap(); - writer.write(&batch).unwrap(); - 
writer.close().unwrap(); - - let file_io = FileIO::new_with_fs(); - let file_size = std::fs::metadata(&parquet_path).unwrap().len(); - let reader = ArrowReaderBuilder::new(file_io.clone()).build(); - - // --- Test 1: Full scan (all columns projected) --- - // This is the case that previously failed. - { - let tasks = Box::pin(futures::stream::iter(vec![Ok(FileScanTask { - file_size_in_bytes: file_size, - start: 0, - length: 0, - record_count: None, - data_file_path: parquet_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: iceberg_schema.clone(), - project_field_ids: vec![1, 2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })])) as FileScanTaskStream; - - let batches: Vec = - reader.read(tasks).unwrap().try_collect().await.unwrap(); - - assert_eq!(batches.len(), 1); - let result = &batches[0]; - assert_eq!(result.num_rows(), 3); - assert_eq!(result.num_columns(), 2); - - // Verify id column - let id_arr = result - .column_by_name("id") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(id_arr.values(), &int_data); - - // Verify uuid_col: data must come through as Binary, preserving every byte - let uuid_arr = result.column_by_name("uuid_col").unwrap(); - assert_eq!(uuid_arr.null_count(), 0, "uuid_col should have no nulls"); - // The transformer may cast Binary -> FixedSizeBinary to match the target schema - let uuid_values: Vec<&[u8]> = - if let Some(bin) = uuid_arr.as_any().downcast_ref::() { - (0..bin.len()).map(|i| bin.value(i)).collect() - } else if let Some(fsb) = uuid_arr.as_any().downcast_ref::() { - (0..fsb.len()).map(|i| fsb.value(i)).collect() - } else { - panic!("uuid_col has unexpected type: {}", uuid_arr.data_type()) - }; - for (i, expected) in uuid_bytes.iter().enumerate() { - assert_eq!( - uuid_values[i], - expected.as_slice(), - "uuid_col row {i} bytes mismatch" - ); - } - } - - // --- Test 2: Projected scan (only uuid_col) --- 
- { - let reader2 = ArrowReaderBuilder::new(file_io).build(); - let tasks = Box::pin(futures::stream::iter(vec![Ok(FileScanTask { - file_size_in_bytes: file_size, - start: 0, - length: 0, - record_count: None, - data_file_path: parquet_path.clone(), - data_file_format: DataFileFormat::Parquet, - schema: iceberg_schema.clone(), - project_field_ids: vec![2], - predicate: None, - deletes: vec![], - partition: None, - partition_spec: None, - name_mapping: None, - case_sensitive: false, - })])) as FileScanTaskStream; - - let batches: Vec = - reader2.read(tasks).unwrap().try_collect().await.unwrap(); - - assert_eq!(batches.len(), 1); - let result = &batches[0]; - assert_eq!(result.num_rows(), 3); - assert_eq!(result.num_columns(), 1); - - let uuid_arr = result.column(0); - assert_eq!(uuid_arr.null_count(), 0, "uuid_col should have no nulls"); - let uuid_values: Vec<&[u8]> = - if let Some(bin) = uuid_arr.as_any().downcast_ref::() { - (0..bin.len()).map(|i| bin.value(i)).collect() - } else if let Some(fsb) = uuid_arr.as_any().downcast_ref::() { - (0..fsb.len()).map(|i| fsb.value(i)).collect() - } else { - panic!("uuid_col has unexpected type: {}", uuid_arr.data_type()) - }; - for (i, expected) in uuid_bytes.iter().enumerate() { - assert_eq!( - uuid_values[i], - expected.as_slice(), - "uuid_col row {i} bytes mismatch in projected scan" - ); - } - } - } -} diff --git a/crates/iceberg/src/arrow/reader/file_reader.rs b/crates/iceberg/src/arrow/reader/file_reader.rs new file mode 100644 index 0000000000..fb0482caf5 --- /dev/null +++ b/crates/iceberg/src/arrow/reader/file_reader.rs @@ -0,0 +1,376 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Async Parquet file reader that adapts an Iceberg `FileRead` to parquet's `AsyncFileReader`. + +use std::ops::Range; +use std::sync::Arc; + +use bytes::Bytes; +use futures::future::BoxFuture; +use futures::{FutureExt, StreamExt, TryFutureExt, TryStreamExt}; +use parquet::arrow::arrow_reader::ArrowReaderOptions; +use parquet::arrow::async_reader::AsyncFileReader; +use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader}; + +use super::ParquetReadOptions; +use crate::io::{FileMetadata, FileRead}; + +/// ArrowFileReader is a wrapper around a FileRead that impls parquets AsyncFileReader. +pub struct ArrowFileReader { + meta: FileMetadata, + parquet_read_options: ParquetReadOptions, + r: Box, +} + +impl ArrowFileReader { + /// Create a new ArrowFileReader + pub fn new(meta: FileMetadata, r: Box) -> Self { + Self { + meta, + parquet_read_options: ParquetReadOptions::builder().build(), + r, + } + } + + /// Configure all Parquet read options. + pub(crate) fn with_parquet_read_options(mut self, options: ParquetReadOptions) -> Self { + self.parquet_read_options = options; + self + } +} + +impl AsyncFileReader for ArrowFileReader { + fn get_bytes(&mut self, range: Range) -> BoxFuture<'_, parquet::errors::Result> { + Box::pin( + self.r + .read(range.start..range.end) + .map_err(|err| parquet::errors::ParquetError::External(Box::new(err))), + ) + } + + /// Override the default `get_byte_ranges` which calls `get_bytes` sequentially. 
+ /// The parquet reader calls this to fetch column chunks for a row group, so + /// without this override each column chunk is a serial round-trip to object storage. + /// Adapted from object_store's `coalesce_ranges` in `util.rs`. + fn get_byte_ranges( + &mut self, + ranges: Vec>, + ) -> BoxFuture<'_, parquet::errors::Result>> { + let coalesce_bytes = self.parquet_read_options.range_coalesce_bytes(); + let concurrency = self.parquet_read_options.range_fetch_concurrency().max(1); + + async move { + // Merge nearby ranges to reduce the number of object store requests. + let fetch_ranges = merge_ranges(&ranges, coalesce_bytes); + let r = &self.r; + + // Fetch merged ranges concurrently. + let fetched: Vec = futures::stream::iter(fetch_ranges.iter().cloned()) + .map(|range| async move { + r.read(range) + .await + .map_err(|e| parquet::errors::ParquetError::External(Box::new(e))) + }) + .buffered(concurrency) + .try_collect() + .await?; + + // Slice the fetched data back into the originally requested ranges. 
+ Ok(ranges + .iter() + .map(|range| { + let idx = fetch_ranges.partition_point(|v| v.start <= range.start) - 1; + let fetch_range = &fetch_ranges[idx]; + let fetch_bytes = &fetched[idx]; + let start = (range.start - fetch_range.start) as usize; + let end = (range.end - fetch_range.start) as usize; + fetch_bytes.slice(start..end.min(fetch_bytes.len())) + }) + .collect()) + } + .boxed() + } + + // TODO: currently we don't respect `ArrowReaderOptions` cause it don't expose any method to access the option field + // we will fix it after `v55.1.0` is released in https://github.com/apache/arrow-rs/issues/7393 + fn get_metadata( + &mut self, + _options: Option<&'_ ArrowReaderOptions>, + ) -> BoxFuture<'_, parquet::errors::Result>> { + async move { + fn page_index_policy(enabled: bool) -> PageIndexPolicy { + if enabled { + PageIndexPolicy::Optional + } else { + PageIndexPolicy::Skip + } + } + + let reader = ParquetMetaDataReader::new() + .with_prefetch_hint(self.parquet_read_options.metadata_size_hint()) + // Set the page policy first because it updates both column and offset policies. + .with_page_index_policy(page_index_policy( + self.parquet_read_options.preload_page_index(), + )) + .with_column_index_policy(page_index_policy( + self.parquet_read_options.preload_column_index(), + )) + .with_offset_index_policy(page_index_policy( + self.parquet_read_options.preload_offset_index(), + )); + let size = self.meta.size; + let meta = reader.load_and_finish(self, size).await?; + + Ok(Arc::new(meta)) + } + .boxed() + } +} + +/// Merge overlapping or nearby byte ranges, combining ranges with gaps <= `coalesce` bytes. +/// Adapted from object_store's `merge_ranges` in `util.rs`. 
+fn merge_ranges(ranges: &[Range], coalesce: u64) -> Vec> { + if ranges.is_empty() { + return vec![]; + } + + let mut ranges = ranges.to_vec(); + ranges.sort_unstable_by_key(|r| r.start); + + let mut merged = Vec::with_capacity(ranges.len()); + let mut start_idx = 0; + let mut end_idx = 1; + + while start_idx != ranges.len() { + let mut range_end = ranges[start_idx].end; + + while end_idx != ranges.len() + && ranges[end_idx] + .start + .checked_sub(range_end) + .map(|delta| delta <= coalesce) + .unwrap_or(true) + { + range_end = range_end.max(ranges[end_idx].end); + end_idx += 1; + } + + merged.push(ranges[start_idx].start..range_end); + start_idx = end_idx; + end_idx += 1; + } + + merged +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use parquet::arrow::async_reader::AsyncFileReader; + + use super::{ArrowFileReader, ParquetReadOptions, merge_ranges}; + use crate::io::{FileMetadata, FileRead}; + + #[test] + fn test_merge_ranges_empty() { + assert_eq!(merge_ranges(&[], 1024), Vec::>::new()); + } + + #[test] + fn test_merge_ranges_no_coalesce() { + // Ranges far apart should not be merged + let ranges = vec![0..100, 1_000_000..1_000_100]; + let merged = merge_ranges(&ranges, 1024); + assert_eq!(merged, vec![0..100, 1_000_000..1_000_100]); + } + + #[test] + fn test_merge_ranges_coalesce() { + // Ranges within the gap threshold should be merged + let ranges = vec![0..100, 200..300, 500..600]; + let merged = merge_ranges(&ranges, 1024); + assert_eq!(merged, vec![0..600]); + } + + #[test] + fn test_merge_ranges_overlapping() { + let ranges = vec![0..200, 100..300]; + let merged = merge_ranges(&ranges, 0); + assert_eq!(merged, vec![0..300]); + } + + #[test] + fn test_merge_ranges_unsorted() { + let ranges = vec![500..600, 0..100, 200..300]; + let merged = merge_ranges(&ranges, 1024); + assert_eq!(merged, vec![0..600]); + } + + /// Mock FileRead backed by a flat byte buffer. 
+ struct MockFileRead { + data: bytes::Bytes, + } + + impl MockFileRead { + fn new(size: usize) -> Self { + // Fill with sequential byte values so slices are verifiable. + let data: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + Self { + data: bytes::Bytes::from(data), + } + } + } + + #[async_trait::async_trait] + impl FileRead for MockFileRead { + async fn read(&self, range: Range) -> crate::Result { + Ok(self.data.slice(range.start as usize..range.end as usize)) + } + } + + #[tokio::test] + async fn test_get_byte_ranges_no_coalesce() { + let mock = MockFileRead::new(2048); + let expected_0 = mock.data.slice(0..100); + let expected_1 = mock.data.slice(1500..1600); + + let mut reader = ArrowFileReader::new(FileMetadata { size: 2048 }, Box::new(mock)) + .with_parquet_read_options( + ParquetReadOptions::builder() + .with_range_coalesce_bytes(0) + .build(), + ); + + let result = reader + .get_byte_ranges(vec![0..100, 1500..1600]) + .await + .unwrap(); + + assert_eq!(result.len(), 2); + assert_eq!(result[0], expected_0); + assert_eq!(result[1], expected_1); + } + + #[tokio::test] + async fn test_get_byte_ranges_with_coalesce() { + let mock = MockFileRead::new(1024); + let expected_0 = mock.data.slice(0..100); + let expected_1 = mock.data.slice(200..300); + let expected_2 = mock.data.slice(500..600); + + let mut reader = ArrowFileReader::new(FileMetadata { size: 1024 }, Box::new(mock)) + .with_parquet_read_options( + ParquetReadOptions::builder() + .with_range_coalesce_bytes(1024) + .build(), + ); + + // All ranges within coalesce threshold — should merge into one fetch. 
+ let result = reader + .get_byte_ranges(vec![0..100, 200..300, 500..600]) + .await + .unwrap(); + + assert_eq!(result.len(), 3); + assert_eq!(result[0], expected_0); + assert_eq!(result[1], expected_1); + assert_eq!(result[2], expected_2); + } + + #[tokio::test] + async fn test_get_byte_ranges_empty() { + let mock = MockFileRead::new(1024); + let mut reader = ArrowFileReader::new(FileMetadata { size: 1024 }, Box::new(mock)); + + let result = reader.get_byte_ranges(vec![]).await.unwrap(); + assert!(result.is_empty()); + } + + #[tokio::test] + async fn test_get_byte_ranges_coalesce_max() { + let mock = MockFileRead::new(2048); + let expected_0 = mock.data.slice(0..100); + let expected_1 = mock.data.slice(1500..1600); + + let mut reader = ArrowFileReader::new(FileMetadata { size: 2048 }, Box::new(mock)) + .with_parquet_read_options( + ParquetReadOptions::builder() + .with_range_coalesce_bytes(u64::MAX) + .build(), + ); + + // u64::MAX coalesce — all ranges merge into a single fetch. + let result = reader + .get_byte_ranges(vec![0..100, 1500..1600]) + .await + .unwrap(); + + assert_eq!(result.len(), 2); + assert_eq!(result[0], expected_0); + assert_eq!(result[1], expected_1); + } + + #[tokio::test] + async fn test_get_byte_ranges_concurrency_zero() { + // concurrency=0 is clamped to 1, so this should not hang. 
+ let mock = MockFileRead::new(1024); + let expected = mock.data.slice(0..100); + + let mut reader = ArrowFileReader::new(FileMetadata { size: 1024 }, Box::new(mock)) + .with_parquet_read_options( + ParquetReadOptions::builder() + .with_range_fetch_concurrency(0) + .build(), + ); + + let result = reader + .get_byte_ranges(vec![0..100, 200..300]) + .await + .unwrap(); + assert_eq!(result.len(), 2); + assert_eq!(result[0], expected); + } + + #[tokio::test] + async fn test_get_byte_ranges_concurrency_one() { + let mock = MockFileRead::new(2048); + let expected_0 = mock.data.slice(0..100); + let expected_1 = mock.data.slice(500..600); + let expected_2 = mock.data.slice(1500..1600); + + let mut reader = ArrowFileReader::new(FileMetadata { size: 2048 }, Box::new(mock)) + .with_parquet_read_options( + ParquetReadOptions::builder() + .with_range_coalesce_bytes(0) + .with_range_fetch_concurrency(1) + .build(), + ); + + // concurrency=1 with no coalescing — sequential fetches. + let result = reader + .get_byte_ranges(vec![0..100, 500..600, 1500..1600]) + .await + .unwrap(); + + assert_eq!(result.len(), 3); + assert_eq!(result[0], expected_0); + assert_eq!(result[1], expected_1); + assert_eq!(result[2], expected_2); + } +} diff --git a/crates/iceberg/src/arrow/reader/mod.rs b/crates/iceberg/src/arrow/reader/mod.rs new file mode 100644 index 0000000000..bc465e9973 --- /dev/null +++ b/crates/iceberg/src/arrow/reader/mod.rs @@ -0,0 +1,193 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Parquet file data reader + +use arrow_array::RecordBatch; +use futures::{SinkExt, Stream, StreamExt}; + +use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; +use crate::error::Result; +use crate::io::FileIO; +use crate::scan::ArrowRecordBatchStream; +use crate::util::available_parallelism; +use crate::{Error, ErrorKind}; + +/// Default gap between byte ranges below which they are coalesced into a +/// single request. Matches object_store's `OBJECT_STORE_COALESCE_DEFAULT`. +const DEFAULT_RANGE_COALESCE_BYTES: u64 = 1024 * 1024; + +/// Default maximum number of coalesced byte ranges fetched concurrently. +/// Matches object_store's `OBJECT_STORE_COALESCE_PARALLEL`. +const DEFAULT_RANGE_FETCH_CONCURRENCY: usize = 10; + +/// Default number of bytes to prefetch when parsing Parquet footer metadata. +/// Matches DataFusion's default `ParquetOptions::metadata_size_hint`. 
+const DEFAULT_METADATA_SIZE_HINT: usize = 512 * 1024; + +mod file_reader; +mod options; +mod pipeline; +mod positional_deletes; +mod predicate_visitor; +mod projection; +mod row_filter; +pub use file_reader::ArrowFileReader; +pub(crate) use options::ParquetReadOptions; +use predicate_visitor::{CollectFieldIdVisitor, PredicateConverter}; +use projection::{add_fallback_field_ids_to_arrow_schema, apply_name_mapping_to_arrow_schema}; + +/// Builder to create ArrowReader +pub struct ArrowReaderBuilder { + batch_size: Option, + file_io: FileIO, + concurrency_limit_data_files: usize, + row_group_filtering_enabled: bool, + row_selection_enabled: bool, + parquet_read_options: ParquetReadOptions, +} + +impl ArrowReaderBuilder { + /// Create a new ArrowReaderBuilder + pub fn new(file_io: FileIO) -> Self { + let num_cpus = available_parallelism().get(); + + ArrowReaderBuilder { + batch_size: None, + file_io, + concurrency_limit_data_files: num_cpus, + row_group_filtering_enabled: true, + row_selection_enabled: false, + parquet_read_options: ParquetReadOptions::builder().build(), + } + } + + /// Sets the max number of in flight data files that are being fetched + pub fn with_data_file_concurrency_limit(mut self, val: usize) -> Self { + self.concurrency_limit_data_files = val; + self + } + + /// Sets the desired size of batches in the response + /// to something other than the default + pub fn with_batch_size(mut self, batch_size: usize) -> Self { + self.batch_size = Some(batch_size); + self + } + + /// Determines whether to enable row group filtering. + pub fn with_row_group_filtering_enabled(mut self, row_group_filtering_enabled: bool) -> Self { + self.row_group_filtering_enabled = row_group_filtering_enabled; + self + } + + /// Determines whether to enable row selection. 
+ pub fn with_row_selection_enabled(mut self, row_selection_enabled: bool) -> Self { + self.row_selection_enabled = row_selection_enabled; + self + } + + /// Provide a hint as to the number of bytes to prefetch for parsing the Parquet metadata + /// + /// This hint can help reduce the number of fetch requests. For more details see the + /// [ParquetMetaDataReader documentation](https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html#method.with_prefetch_hint). + pub fn with_metadata_size_hint(mut self, metadata_size_hint: usize) -> Self { + self.parquet_read_options.metadata_size_hint = Some(metadata_size_hint); + self + } + + /// Sets the gap threshold for merging nearby byte ranges into a single request. + /// Ranges with gaps smaller than this value will be coalesced. + /// + /// Defaults to 1 MiB, matching object_store's OBJECT_STORE_COALESCE_DEFAULT. + pub fn with_range_coalesce_bytes(mut self, range_coalesce_bytes: u64) -> Self { + self.parquet_read_options.range_coalesce_bytes = range_coalesce_bytes; + self + } + + /// Sets the maximum number of merged byte ranges to fetch concurrently. + /// + /// Defaults to 10, matching object_store's OBJECT_STORE_COALESCE_PARALLEL. + pub fn with_range_fetch_concurrency(mut self, range_fetch_concurrency: usize) -> Self { + self.parquet_read_options.range_fetch_concurrency = range_fetch_concurrency; + self + } + + /// Build the ArrowReader. 
+ pub fn build(self) -> ArrowReader { + ArrowReader { + batch_size: self.batch_size, + file_io: self.file_io.clone(), + delete_file_loader: CachingDeleteFileLoader::new( + self.file_io.clone(), + self.concurrency_limit_data_files, + ), + concurrency_limit_data_files: self.concurrency_limit_data_files, + row_group_filtering_enabled: self.row_group_filtering_enabled, + row_selection_enabled: self.row_selection_enabled, + parquet_read_options: self.parquet_read_options, + } + } +} + +/// Reads data from Parquet files +#[derive(Clone)] +pub struct ArrowReader { + pub(crate) batch_size: Option, + pub(crate) file_io: FileIO, + delete_file_loader: CachingDeleteFileLoader, + + /// the maximum number of data files that can be fetched at the same time + pub(crate) concurrency_limit_data_files: usize, + + pub(crate) row_group_filtering_enabled: bool, + pub(crate) row_selection_enabled: bool, + pub(crate) parquet_read_options: ParquetReadOptions, +} + +/// Trait indicating that the implementing type streams into a stream of type `S` using +/// a reader of type `R`. +pub trait StreamsInto { + /// Stream from the reader and produce a stream of type `S`. + fn stream(self, reader: R) -> Result; +} + +/// Helper function to process a stream of record batches and send through a channel. +/// Handles the Result pattern, so callers don't need to match on the stream result. +/// This pattern is used in both reader.rs and incremental.rs. 
+pub(crate) async fn process_record_batch_stream( + record_batch_stream: Result, + mut tx: T, + error_context: &str, +) where + E: std::error::Error + Send + Sync + 'static, + S: Stream> + Send + Unpin + 'static, + T: SinkExt> + Unpin + Send + 'static, +{ + match record_batch_stream { + Ok(mut stream) => { + while let Some(batch_result) = stream.next().await { + let batch = batch_result + .map_err(|e| Error::new(ErrorKind::Unexpected, error_context).with_source(e)); + let _ = tx.send(batch).await; + } + } + Err(e) => { + let _ = tx.send(Err(e)).await; + } + } +} diff --git a/crates/iceberg/src/arrow/reader/options.rs b/crates/iceberg/src/arrow/reader/options.rs new file mode 100644 index 0000000000..ae6a3ed18e --- /dev/null +++ b/crates/iceberg/src/arrow/reader/options.rs @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Tunables for Parquet file I/O used by `ArrowReader`. + +use typed_builder::TypedBuilder; + +use super::{ + DEFAULT_METADATA_SIZE_HINT, DEFAULT_RANGE_COALESCE_BYTES, DEFAULT_RANGE_FETCH_CONCURRENCY, +}; + +/// Options for tuning Parquet file I/O. 
+#[derive(Clone, Copy, Debug, TypedBuilder)] +#[builder(field_defaults(setter(prefix = "with_")))] +pub(crate) struct ParquetReadOptions { + /// Number of bytes to prefetch for parsing the Parquet metadata. + /// + /// This hint can help reduce the number of fetch requests. For more details see the + /// [ParquetMetaDataReader documentation](https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html#method.with_prefetch_hint). + /// + /// Defaults to 512 KiB, matching DataFusion's default `ParquetOptions::metadata_size_hint`. + #[builder(default = Some(DEFAULT_METADATA_SIZE_HINT))] + pub(crate) metadata_size_hint: Option, + /// Gap threshold for merging nearby byte ranges into a single request. + /// Ranges with gaps smaller than this value will be coalesced. + /// + /// Defaults to 1 MiB, matching object_store's `OBJECT_STORE_COALESCE_DEFAULT`. + #[builder(default = DEFAULT_RANGE_COALESCE_BYTES)] + pub(crate) range_coalesce_bytes: u64, + /// Maximum number of merged byte ranges to fetch concurrently. + /// + /// Defaults to 10, matching object_store's `OBJECT_STORE_COALESCE_PARALLEL`. + #[builder(default = DEFAULT_RANGE_FETCH_CONCURRENCY)] + pub(crate) range_fetch_concurrency: usize, + /// Whether to preload the column index when reading Parquet metadata. + #[builder(default = true)] + pub(crate) preload_column_index: bool, + /// Whether to preload the offset index when reading Parquet metadata. + #[builder(default = true)] + pub(crate) preload_offset_index: bool, + /// Whether to preload the page index when reading Parquet metadata. 
+ #[builder(default = false)] + pub(crate) preload_page_index: bool, +} + +impl ParquetReadOptions { + pub(crate) fn metadata_size_hint(&self) -> Option { + self.metadata_size_hint + } + + pub(crate) fn range_coalesce_bytes(&self) -> u64 { + self.range_coalesce_bytes + } + + pub(crate) fn range_fetch_concurrency(&self) -> usize { + self.range_fetch_concurrency + } + + pub(crate) fn preload_column_index(&self) -> bool { + self.preload_column_index + } + + pub(crate) fn preload_offset_index(&self) -> bool { + self.preload_offset_index + } + + pub(crate) fn preload_page_index(&self) -> bool { + self.preload_page_index + } +} diff --git a/crates/iceberg/src/arrow/reader/pipeline.rs b/crates/iceberg/src/arrow/reader/pipeline.rs new file mode 100644 index 0000000000..7bb7feb9d3 --- /dev/null +++ b/crates/iceberg/src/arrow/reader/pipeline.rs @@ -0,0 +1,1330 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! The main `ArrowReader` pipeline: reading a stream of `FileScanTask`s, +//! opening Parquet files and resolving schemas, then wiring projection, +//! predicates, row-group / row selection, and delete handling into a stream +//! of transformed Arrow `RecordBatch`es. 
+ +use std::sync::atomic::AtomicU64; +use std::sync::{Arc, Mutex}; + +use arrow_array::RecordBatch; +use futures::channel::mpsc::channel; +use futures::{StreamExt, TryStreamExt}; +use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; +use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, ParquetRecordBatchStreamBuilder}; + +use super::{ + ArrowFileReader, ArrowReader, ParquetReadOptions, add_fallback_field_ids_to_arrow_schema, + apply_name_mapping_to_arrow_schema, process_record_batch_stream, +}; +use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; +use crate::arrow::int96::coerce_int96_timestamps; +use crate::arrow::record_batch_transformer::{ + RecordBatchTransformer, RecordBatchTransformerBuilder, +}; +use crate::arrow::scan_metrics::{CountingFileRead, ScanMetrics, ScanResult}; +use crate::delete_vector::DeleteVector; +use crate::error::Result; +use crate::io::{FileIO, FileMetadata, FileRead}; +use crate::metadata_columns::{ + RESERVED_FIELD_ID_FILE, RESERVED_FIELD_ID_POS, is_metadata_field, row_pos_field, +}; +use crate::runtime::spawn; +use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream}; +use crate::spec::{Datum, NameMapping, PartitionSpec, SchemaRef, Struct}; +use crate::{Error, ErrorKind}; + +impl ArrowReader { + /// Take a stream of FileScanTasks and reads all the files. + /// Returns a [`ScanResult`] containing the record batch stream and scan metrics. 
+ pub fn read(self, tasks: FileScanTaskStream) -> Result { + let concurrency_limit_data_files = self.concurrency_limit_data_files; + let scan_metrics = ScanMetrics::new(); + + let task_reader = FileScanTaskReader { + batch_size: self.batch_size, + file_io: self.file_io, + delete_file_loader: self + .delete_file_loader + .with_scan_metrics(scan_metrics.clone()), + row_group_filtering_enabled: self.row_group_filtering_enabled, + row_selection_enabled: self.row_selection_enabled, + parquet_read_options: self.parquet_read_options, + scan_metrics: scan_metrics.clone(), + }; + + // Fast-path for single concurrency to avoid overhead of try_flatten_unordered + let stream: ArrowRecordBatchStream = if concurrency_limit_data_files == 1 { + Box::pin( + tasks + .and_then(move |task| task_reader.clone().process(task)) + .map_err(|err| { + Error::new(ErrorKind::Unexpected, "file scan task generate failed") + .with_source(err) + }) + .try_flatten(), + ) + } else { + // Multi-concurrency path: spawn each file's IO-heavy processing as an independent + // tokio task for true parallelism, streaming results through a channel. + let (tx, rx) = channel::>(concurrency_limit_data_files); + + // Outer spawn: runs the task coordination loop without blocking the caller. + spawn(async move { + let _ = tasks + .try_for_each_concurrent(concurrency_limit_data_files, |task| { + let task_reader = task_reader.clone(); + let tx = tx.clone(); + + async move { + // Inner spawn: each file's IO operations run on their own tokio task. + spawn(async move { + let record_batch_stream = task_reader.process(task).await; + process_record_batch_stream( + record_batch_stream, + tx, + "failed to read record batch", + ) + .await; + }) + .await; + + Ok(()) + } + }) + .await; + }); + + Box::pin(rx) as ArrowRecordBatchStream + }; + + Ok(ScanResult::new(stream, scan_metrics)) + } +} + +/// Per-scan state for processing [`FileScanTask`]s. Created once per +/// [`ArrowReader::read`] call and cloned per task. 
#[derive(Clone)]
struct FileScanTaskReader {
    /// Optional batch size forwarded to the Parquet stream builder.
    batch_size: Option<usize>,
    file_io: FileIO,
    delete_file_loader: CachingDeleteFileLoader,
    row_group_filtering_enabled: bool,
    row_selection_enabled: bool,
    parquet_read_options: ParquetReadOptions,
    scan_metrics: ScanMetrics,
}

impl FileScanTaskReader {
    /// Turns a single [`FileScanTask`] into a stream of record batches.
    ///
    /// Opens the data file and loads the task's delete files concurrently, merges the
    /// task predicate with the equality-delete predicate, applies row-group / row
    /// filters and positional deletes, then builds the projected, transformed stream.
    async fn process(self, task: FileScanTask) -> Result<ArrowRecordBatchStream> {
        // The page index is only preloaded when it will be used: for predicate-driven
        // row selection, or whenever the task carries delete files.
        let should_load_page_index =
            (self.row_selection_enabled && task.predicate.is_some()) || !task.deletes.is_empty();
        let mut parquet_read_options = self.parquet_read_options;
        parquet_read_options.preload_page_index = should_load_page_index;

        // Concurrently open the Parquet file and start loading delete files.
        let open_fut = ArrowReader::open_parquet_stream_builder(
            &task.data_file_path,
            task.file_size_in_bytes,
            self.file_io.clone(),
            parquet_read_options,
            ArrowReader::build_virtual_columns(&task.project_field_ids),
            self.batch_size,
            task.name_mapping.as_deref(),
            Some(Arc::clone(self.scan_metrics.bytes_read_counter())),
            Some(&task.schema),
        );
        let delete_filter_rx = self
            .delete_file_loader
            .load_deletes(&task.deletes, Arc::clone(&task.schema));

        // NOTE(review): `unwrap()` panics if awaiting the receiver yields an error
        // (presumably a dropped sender) — confirm that cannot happen here.
        let (open_result, delete_filter) =
            futures::join!(open_fut, async { delete_filter_rx.await.unwrap() });

        let (builder, has_missing_field_ids) = open_result?;
        let delete_filter = delete_filter?;

        let delete_predicate = delete_filter.build_equality_delete_predicate(&task).await?;

        // In addition to the optional predicate supplied in the `FileScanTask`,
        // we also have an optional predicate resulting from equality delete files.
        // If both are present, we logical-AND them together to form a single filter
        // predicate that we can pass to the `RecordBatchStreamBuilder`.
        let final_predicate = match (&task.predicate, delete_predicate) {
            (None, None) => None,
            (Some(predicate), None) => Some(predicate.clone()),
            // `delete_predicate` is owned by the scrutinee, so it can be moved out directly.
            (None, Some(predicate)) => Some(predicate),
            (Some(filter_predicate), Some(delete_predicate)) => {
                Some(filter_predicate.clone().and(delete_predicate))
            }
        };

        let positional_deletes = delete_filter.get_delete_vector(&task);

        let builder = ArrowReader::apply_parquet_filters(
            builder,
            task.start,
            task.length,
            &task.schema,
            final_predicate.as_ref(),
            positional_deletes.as_deref(),
            self.row_group_filtering_enabled,
            self.row_selection_enabled,
            false, // use_predicate_projection: projection applied separately via build_projected_record_batch_stream
            has_missing_field_ids,
        )?;

        ArrowReader::build_projected_record_batch_stream(
            builder,
            &task.project_field_ids,
            task.schema_ref(),
            has_missing_field_ids,
            &task.data_file_path,
            task.partition_spec,
            task.partition,
        )
    }
}

impl ArrowReader {
    /// Opens a Parquet file and loads its metadata, wrapping the reader with
    /// [`CountingFileRead`] so all I/O is accumulated into `bytes_read`.
+ pub(crate) async fn open_parquet_file( + data_file_path: &str, + file_io: &FileIO, + file_size_in_bytes: u64, + parquet_read_options: ParquetReadOptions, + bytes_read: &Arc, + ) -> Result<(ArrowFileReader, ArrowReaderMetadata)> { + let parquet_file = file_io.new_input(data_file_path)?; + let counting_reader = + CountingFileRead::new(parquet_file.reader().await?, Arc::clone(bytes_read)); + Self::build_parquet_reader( + Box::new(counting_reader), + file_size_in_bytes, + parquet_read_options, + ) + .await + } + + async fn build_parquet_reader( + parquet_reader: Box, + file_size_in_bytes: u64, + parquet_read_options: ParquetReadOptions, + ) -> Result<(ArrowFileReader, ArrowReaderMetadata)> { + let mut reader = ArrowFileReader::new( + FileMetadata { + size: file_size_in_bytes, + }, + parquet_reader, + ) + .with_parquet_read_options(parquet_read_options); + + let arrow_metadata = ArrowReaderMetadata::load_async(&mut reader, Default::default()) + .await + .map_err(|e| { + Error::new(ErrorKind::Unexpected, "Failed to load Parquet metadata").with_source(e) + })?; + + Ok((reader, arrow_metadata)) + } + + /// Opens a Parquet file, resolves its schema (name-mapping / field-ID fallback), and + /// applies the batch size. Returns `(builder, has_missing_field_ids)`. + /// + /// This is the async phase shared by every reading path. Callers that have background + /// work to overlap (e.g. delete-file loading) can run this concurrently with that work + /// using [`futures::join!`], then pass the result to [`Self::apply_parquet_filters`]. 
+ /// + /// Implements the three-branch schema resolution strategy matching Java's `ReadConf` constructor: + /// - Branch 1: file has embedded field IDs → trust them, use as-is + /// - Branch 2: name_mapping present → apply name mapping to assign correct Iceberg field IDs + /// - Branch 3: no name mapping → assign fallback position-based IDs + /// + /// When `iceberg_schema` is `Some`, INT96 timestamp columns are coerced to the resolution + /// specified by the Iceberg schema before building the stream reader. + /// + /// When `bytes_read` is `Some`, wraps the file reader with [`CountingFileRead`] so all + /// I/O bytes are accumulated into the provided counter. + #[allow(clippy::too_many_arguments)] + pub(crate) async fn open_parquet_stream_builder( + data_file_path: &str, + file_size_in_bytes: u64, + file_io: FileIO, + parquet_read_options: ParquetReadOptions, + virtual_columns: Vec>, + batch_size: Option, + name_mapping: Option<&NameMapping>, + bytes_read: Option>, + iceberg_schema: Option<&crate::spec::Schema>, + ) -> Result<(ParquetRecordBatchStreamBuilder, bool)> { + let parquet_file = file_io.new_input(data_file_path)?; + let raw_reader = parquet_file.reader().await?; + let boxed_reader: Box = if let Some(counter) = bytes_read { + Box::new(CountingFileRead::new(raw_reader, counter)) + } else { + Box::new(raw_reader) + }; + let (file_reader, arrow_metadata) = + Self::build_parquet_reader(boxed_reader, file_size_in_bytes, parquet_read_options) + .await?; + + // Check if Parquet file has embedded field IDs. + // Corresponds to Java's ParquetSchemaUtil.hasIds() + let has_missing_field_ids = arrow_metadata + .schema() + .fields() + .iter() + .next() + .is_some_and(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()); + + // Three-branch schema resolution strategy matching Java's ReadConf constructor. 
+ // + // When Parquet files lack field IDs (e.g., Hive/Spark migrations via add_files), + // we must assign field IDs BEFORE reading data to enable correct column projection. + let arrow_metadata = if has_missing_field_ids { + // Parquet file lacks field IDs - must assign them before reading. + let arrow_schema = if let Some(nm) = name_mapping { + // Branch 2: Apply name mapping to assign correct Iceberg field IDs. + // Corresponds to Java's ParquetSchemaUtil.applyNameMapping() + apply_name_mapping_to_arrow_schema(Arc::clone(arrow_metadata.schema()), nm)? + } else { + // Branch 3: No name mapping - use position-based fallback IDs. + // Corresponds to Java's ParquetSchemaUtil.addFallbackIds() + add_fallback_field_ids_to_arrow_schema(arrow_metadata.schema()) + }; + let mut options = ArrowReaderOptions::new().with_schema(arrow_schema); + if !virtual_columns.is_empty() { + options = options.with_virtual_columns(virtual_columns)?; + } + ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options).map_err( + |e| { + Error::new( + ErrorKind::Unexpected, + "Failed to create ArrowReaderMetadata with field ID schema", + ) + .with_source(e) + }, + )? + } else { + // Branch 1: File has embedded field IDs - trust them. + if !virtual_columns.is_empty() { + let options = ArrowReaderOptions::new().with_virtual_columns(virtual_columns)?; + ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options) + .map_err(|e| { + Error::new( + ErrorKind::Unexpected, + "Failed to create ArrowReaderMetadata with virtual columns", + ) + .with_source(e) + })? + } else { + arrow_metadata + } + }; + + // Coerce INT96 timestamp columns to the resolution specified by the Iceberg schema. + // This must happen before building the stream reader to avoid i64 overflow in arrow-rs. 
+ let arrow_metadata = if let Some(schema) = iceberg_schema { + if let Some(coerced_schema) = coerce_int96_timestamps(arrow_metadata.schema(), schema) { + let options = ArrowReaderOptions::new().with_schema(Arc::clone(&coerced_schema)); + ArrowReaderMetadata::try_new(Arc::clone(arrow_metadata.metadata()), options) + .map_err(|e| { + Error::new( + ErrorKind::Unexpected, + format!( + "Failed to create ArrowReaderMetadata with INT96-coerced schema: {coerced_schema}" + ), + ) + .with_source(e) + })? + } else { + arrow_metadata + } + } else { + arrow_metadata + }; + + let mut builder = + ParquetRecordBatchStreamBuilder::new_with_metadata(file_reader, arrow_metadata); + + if let Some(batch_size) = batch_size { + builder = builder.with_batch_size(batch_size); + } + + Ok((builder, has_missing_field_ids)) + } + + /// Applies all row-level and row-group-level filters to a builder returned by + /// [`Self::open_parquet_stream_builder`]. + /// + /// Handles byte-range row group pruning, predicate row filtering (with optional + /// projection), and positional-delete row selection. 
+ #[allow(clippy::too_many_arguments)] + pub(crate) fn apply_parquet_filters( + mut builder: ParquetRecordBatchStreamBuilder, + start: u64, + length: u64, + schema: &crate::spec::Schema, + bound_predicate: Option<&crate::expr::BoundPredicate>, + positional_deletes: Option<&Mutex>, + row_group_filtering_enabled: bool, + row_selection_enabled: bool, + use_predicate_projection: bool, + has_missing_field_ids: bool, + ) -> Result> { + let mut selected_row_group_indices = None; + let mut row_selection = None; + + if start != 0 || length != 0 { + selected_row_group_indices = Some(Self::filter_row_groups_by_byte_range( + builder.metadata(), + start, + length, + )?); + } + + if let Some(predicate) = bound_predicate { + let (iceberg_field_ids, field_id_map) = + Self::build_field_id_set_and_map(builder.parquet_schema(), predicate)?; + + if use_predicate_projection { + let predicate_field_ids: Vec = iceberg_field_ids.iter().copied().collect(); + builder = Self::apply_projection( + builder, + &predicate_field_ids, + schema, + has_missing_field_ids, + )?; + } + + let row_filter = Self::get_row_filter( + predicate, + builder.parquet_schema(), + &iceberg_field_ids, + &field_id_map, + )?; + builder = builder.with_row_filter(row_filter); + + if row_group_filtering_enabled { + let predicate_filtered = Self::get_selected_row_group_indices( + predicate, + builder.metadata(), + &field_id_map, + schema, + )?; + selected_row_group_indices = Some(match selected_row_group_indices.take() { + Some(existing) => existing + .into_iter() + .filter(|idx| predicate_filtered.contains(idx)) + .collect(), + None => predicate_filtered, + }); + } + + if row_selection_enabled { + row_selection = Some(Self::get_row_selection_for_filter_predicate( + predicate, + builder.metadata(), + &selected_row_group_indices, + &field_id_map, + schema, + )?); + } + } + + if let Some(positional_delete_indexes) = positional_deletes { + let delete_row_selection = { + let guard = positional_delete_indexes.lock().unwrap(); + 
Self::build_deletes_row_selection( + builder.metadata().row_groups(), + &selected_row_group_indices, + &guard, + ) + }?; + row_selection = Some(match row_selection.take() { + None => delete_row_selection, + Some(prev) => prev.intersection(&delete_row_selection), + }); + } + + if let Some(sel) = row_selection { + builder = builder.with_row_selection(sel); + } + if let Some(groups) = selected_row_group_indices { + builder = builder.with_row_groups(groups); + } + + Ok(builder) + } + + /// Applies a projection mask derived from `field_ids` to a builder. + /// + /// Wraps `get_arrow_projection_mask` + `with_projection` into a single call. + fn apply_projection( + builder: ParquetRecordBatchStreamBuilder, + field_ids: &[i32], + schema: &crate::spec::Schema, + has_missing_field_ids: bool, + ) -> Result> { + // Metadata fields (e.g. _file, _pos) are virtual — they don't exist as Parquet columns. + // Filter them out so get_arrow_projection_mask only sees real schema field IDs. + let project_field_ids_without_metadata: Vec = field_ids + .iter() + .filter(|&&id| !is_metadata_field(id)) + .copied() + .collect(); + let mask = Self::get_arrow_projection_mask( + &project_field_ids_without_metadata, + schema, + builder.parquet_schema(), + builder.schema(), + has_missing_field_ids, + )?; + Ok(builder.with_projection(mask)) + } + + /// Returns the list of virtual columns to request from the Parquet reader for the + /// given projection. Currently, only `_pos` is a virtual column (produced by the + /// Parquet reader itself rather than read from file data). + pub(crate) fn build_virtual_columns( + project_field_ids: &[i32], + ) -> Vec> { + let mut virtual_columns = Vec::new(); + if project_field_ids.contains(&RESERVED_FIELD_ID_POS) { + virtual_columns.push(Arc::clone(row_pos_field())); + } + virtual_columns + } + + /// Builds a [`RecordBatchTransformer`] for a data file scan task. 
+ /// + /// Handles the three optional transformations that are common to both the full + /// scan (`process_file_scan_task`) and the incremental append scan + /// (`process_incremental_append_task`): + /// - `_file` constant column (only when `RESERVED_FIELD_ID_FILE` is projected) + /// - `_pos` virtual column (only when `RESERVED_FIELD_ID_POS` is projected) + /// - identity-transform partition columns (only when partition metadata is present) + fn build_record_batch_transformer( + schema: SchemaRef, + project_field_ids: &[i32], + data_file_path: &str, + partition_spec: Option>, + partition: Option, + ) -> Result { + let mut builder = RecordBatchTransformerBuilder::new(schema, project_field_ids); + + if project_field_ids.contains(&RESERVED_FIELD_ID_FILE) { + builder = builder.with_constant(RESERVED_FIELD_ID_FILE, Datum::string(data_file_path)); + } + + if project_field_ids.contains(&RESERVED_FIELD_ID_POS) { + builder = builder.with_virtual_field(Arc::clone(row_pos_field()))?; + } + + if let (Some(spec), Some(data)) = (partition_spec, partition) { + builder = builder.with_partition(spec, data)?; + } + + Ok(builder.build()) + } + + /// Centralises the final "commit" step shared by all Parquet reading paths. + /// Applies projection to `builder`, constructs a `RecordBatchTransformer`, builds the + /// Parquet stream, and wraps it so every batch is passed through the transformer. + /// + /// This is the shared finalization step used by every data-file reading path. 
pub(crate) fn build_projected_record_batch_stream(
    builder: ParquetRecordBatchStreamBuilder<ArrowFileReader>,
    project_field_ids: &[i32],
    schema: SchemaRef,
    has_missing_field_ids: bool,
    data_file_path: &str,
    partition_spec: Option<Arc<PartitionSpec>>,
    partition: Option<Struct>,
) -> Result<ArrowRecordBatchStream> {
    let builder =
        Self::apply_projection(builder, project_field_ids, &schema, has_missing_field_ids)?;

    // The transformer is moved into the closure below, so it must be `mut` here.
    let mut record_batch_transformer = Self::build_record_batch_transformer(
        schema,
        project_field_ids,
        data_file_path,
        partition_spec,
        partition,
    )?;

    // Successful batches go through the transformer; reader errors are converted
    // into this crate's error type and passed along.
    let record_batch_stream = builder.build()?.map(move |batch| match batch {
        Ok(batch) => record_batch_transformer.process_record_batch(batch),
        Err(err) => Err(err.into()),
    });

    Ok(Box::pin(record_batch_stream) as ArrowRecordBatchStream)
}
} // closes the `impl ArrowReader` block that contains this method

#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::fs::File;
    use std::sync::Arc;

    use arrow_array::cast::AsArray;
    use arrow_array::{Array, ArrayRef, RecordBatch};
    use arrow_schema::{DataType, Field, Schema as ArrowSchema};
    use futures::TryStreamExt;
    use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY};
    use parquet::basic::Compression;
    use parquet::file::properties::WriterProperties;
    use tempfile::TempDir;

    use crate::arrow::ArrowReaderBuilder;
    use crate::io::FileIO;
    use crate::scan::{FileScanTask, FileScanTaskStream};
    use crate::spec::{DataFileFormat, NestedField, PrimitiveType, Schema, SchemaRef, Type};

    // INT96 encoding: [nanos_low_u32, nanos_high_u32, julian_day_u32]
    // Julian day 2_440_588 = Unix epoch (1970-01-01)
    const UNIX_EPOCH_JULIAN: i64 = 2_440_588;
    const MICROS_PER_DAY: i64 = 86_400_000_000;
    // Noon on 3333-01-01 (Julian day 2_953_529) — outside the i64 nanosecond range (~1677-2262).
+ const INT96_TEST_NANOS_WITHIN_DAY: u64 = 43_200_000_000_000; + const INT96_TEST_JULIAN_DAY: u32 = 2_953_529; + + fn make_int96_test_value() -> (parquet::data_type::Int96, i64) { + let mut val = parquet::data_type::Int96::new(); + val.set_data( + (INT96_TEST_NANOS_WITHIN_DAY & 0xFFFFFFFF) as u32, + (INT96_TEST_NANOS_WITHIN_DAY >> 32) as u32, + INT96_TEST_JULIAN_DAY, + ); + let expected_micros = (INT96_TEST_JULIAN_DAY as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (INT96_TEST_NANOS_WITHIN_DAY / 1_000) as i64; + (val, expected_micros) + } + + async fn read_int96_batches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + ) -> Vec { + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let file_size = std::fs::metadata(file_path).unwrap().len(); + let task = FileScanTask { + file_size_in_bytes: file_size, + start: 0, + length: file_size, + record_count: None, + data_file_path: file_path.to_string(), + data_file_format: DataFileFormat::Parquet, + schema, + project_field_ids, + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; + reader + .read(tasks) + .unwrap() + .stream() + .try_collect() + .await + .unwrap() + } + + // ArrowWriter cannot write INT96, so we use SerializedFileWriter directly. 
+ fn write_int96_parquet_file( + table_location: &str, + filename: &str, + with_field_ids: bool, + ) -> (String, Vec) { + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::{Int32Type, Int96, Int96Type}; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let file_path = format!("{table_location}/{filename}"); + + let mut ts_builder = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL); + let mut id_builder = SchemaType::primitive_type_builder("id", PhysicalType::INT32) + .with_repetition(Repetition::REQUIRED); + + if with_field_ids { + ts_builder = ts_builder.with_id(Some(1)); + id_builder = id_builder.with_id(Some(2)); + } + + let schema = SchemaType::group_type_builder("schema") + .with_fields(vec![ + Arc::new(ts_builder.build().unwrap()), + Arc::new(id_builder.build().unwrap()), + ]) + .build() + .unwrap(); + + // Dates outside the i64 nanosecond range (~1677-2262) overflow without coercion. 
+ const NOON_NANOS: u64 = INT96_TEST_NANOS_WITHIN_DAY; + const JULIAN_3333: u32 = INT96_TEST_JULIAN_DAY; + const JULIAN_2100: u32 = 2_488_070; + + let test_data: Vec<(u32, u32, u32, i64)> = vec![ + // 3333-01-01 00:00:00 + ( + 0, + 0, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), + // 3333-01-01 12:00:00 + ( + (NOON_NANOS & 0xFFFFFFFF) as u32, + (NOON_NANOS >> 32) as u32, + JULIAN_3333, + (JULIAN_3333 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY + + (NOON_NANOS / 1_000) as i64, + ), + // 2100-01-01 00:00:00 + ( + 0, + 0, + JULIAN_2100, + (JULIAN_2100 as i64 - UNIX_EPOCH_JULIAN) * MICROS_PER_DAY, + ), + ]; + + let int96_values: Vec = test_data + .iter() + .map(|(lo, hi, day, _)| { + let mut v = Int96::new(); + v.set_data(*lo, *hi, *day); + v + }) + .collect(); + + let id_values: Vec = (0..test_data.len() as i32).collect(); + let expected_micros: Vec = test_data.iter().map(|(_, _, _, m)| *m).collect(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(schema), Default::default()).unwrap(); + + let mut row_group = writer.next_row_group().unwrap(); + { + // def=1: ts is OPTIONAL and present. No repetition levels (top-level columns). 
+ let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&int96_values, Some(&vec![1; test_data.len()]), None) + .unwrap(); + col.close().unwrap(); + } + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&id_values, None, None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + (file_path, expected_micros) + } + + async fn assert_int96_read_matches( + file_path: &str, + schema: SchemaRef, + project_field_ids: Vec, + expected_micros: &[i64], + ) { + use arrow_array::TimestampMicrosecondArray; + + let batches = read_int96_batches(file_path, schema, project_field_ids).await; + + assert_eq!(batches.len(), 1); + let ts_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray"); + + for (i, expected) in expected_micros.iter().enumerate() { + assert_eq!( + ts_array.value(i), + *expected, + "Row {i}: got {}, expected {expected}", + ts_array.value(i) + ); + } + } + + /// Test that concurrency=1 reads all files correctly and in deterministic order. + /// This verifies the fast-path optimization for single concurrency. 
+ #[tokio::test] + async fn test_read_with_concurrency_one() { + use arrow_array::Int32Array; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "file_num", Type::Primitive(PrimitiveType::Int)) + .into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + Field::new("file_num", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "2".to_string(), + )])), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + // Create 3 parquet files with different data + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + for file_num in 0..3 { + let id_data = Arc::new(Int32Array::from_iter_values( + file_num * 10..(file_num + 1) * 10, + )) as ArrayRef; + let file_num_data = Arc::new(Int32Array::from(vec![file_num; 10])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![id_data, file_num_data]).unwrap(); + + let file = File::create(format!("{table_location}/file_{file_num}.parquet")).unwrap(); + let mut writer = + ArrowWriter::try_new(file, to_write.schema(), Some(props.clone())).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + } + + // Read with concurrency=1 (fast-path) + let reader = ArrowReaderBuilder::new(file_io) + .with_data_file_concurrency_limit(1) + .build(); + + // Create tasks in a specific order: file_0, file_1, file_2 + let tasks = vec![ + Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_0.parquet")) + .unwrap() + .len(), + start: 0, + length: 
0, + record_count: None, + data_file_path: format!("{table_location}/file_0.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }), + Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/file_1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }), + Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/file_2.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/file_2.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }), + ]; + + let tasks_stream = Box::pin(futures::stream::iter(tasks)) as FileScanTaskStream; + + let result = reader + .read(tasks_stream) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + // Verify we got all 30 rows (10 from each file) + let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); + assert_eq!(total_rows, 30, "Should have 30 total rows"); + + // Collect all ids and file_nums to verify data + let mut all_ids = Vec::new(); + let mut all_file_nums = Vec::new(); + + for batch in &result { + let id_col = batch + .column(0) + .as_primitive::(); + let file_num_col = batch + .column(1) + .as_primitive::(); + + for i in 0..batch.num_rows() { + all_ids.push(id_col.value(i)); + 
all_file_nums.push(file_num_col.value(i)); + } + } + + assert_eq!(all_ids.len(), 30); + assert_eq!(all_file_nums.len(), 30); + + // With concurrency=1 and sequential processing, files should be processed in order + // file_0: ids 0-9, file_num=0 + // file_1: ids 10-19, file_num=1 + // file_2: ids 20-29, file_num=2 + for i in 0..10 { + assert_eq!(all_file_nums[i], 0, "First 10 rows should be from file_0"); + assert_eq!(all_ids[i], i as i32, "IDs should be 0-9"); + } + for i in 10..20 { + assert_eq!(all_file_nums[i], 1, "Next 10 rows should be from file_1"); + assert_eq!(all_ids[i], i as i32, "IDs should be 10-19"); + } + for i in 20..30 { + assert_eq!(all_file_nums[i], 2, "Last 10 rows should be from file_2"); + assert_eq!(all_ids[i], i as i32, "IDs should be 20-29"); + } + } + + #[tokio::test] + async fn test_read_int96_timestamps_with_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let (file_path, expected_micros) = + write_int96_parquet_file(&table_location, "with_ids.parquet", true); + + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + } + + #[tokio::test] + async fn test_read_int96_timestamps_without_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "ts", Type::Primitive(PrimitiveType::Timestamp)) + .into(), + NestedField::required(2, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let (file_path, expected_micros) = + 
write_int96_parquet_file(&table_location, "no_ids.parquet", false); + + assert_int96_read_matches(&file_path, schema, vec![1, 2], &expected_micros).await; + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_struct() { + use arrow_array::{StructArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::Int96Type; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/struct_int96.parquet"); + + let ts_type = SchemaType::primitive_type_builder("ts", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let struct_type = SchemaType::group_type_builder("data") + .with_repetition(Repetition::REQUIRED) + .with_id(Some(1)) + .with_fields(vec![Arc::new(ts_type)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(struct_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // def=1: struct is REQUIRED so no level, ts is OPTIONAL and present (1). + // No repetition levels needed (no repeated groups). 
+ let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[1]), None) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required( + 1, + "data", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::optional( + 2, + "ts", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let struct_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected StructArray"); + let ts_array = struct_array + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside struct"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in struct: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_list() { + use arrow_array::{ListArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::Int96Type; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/list_int96.parquet"); + + // 3-level LIST encoding: + // optional group timestamps (LIST) { + // repeated group list { + // optional int96 element; + // } + // } + let element_type = SchemaType::primitive_type_builder("element", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(2)) + .build() + .unwrap(); + + let list_group = 
SchemaType::group_type_builder("list") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(element_type)]) + .build() + .unwrap(); + + let list_type = SchemaType::group_type_builder("timestamps") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::List)) + .with_fields(vec![Arc::new(list_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(list_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a list containing one INT96 element. + // def=3: list present (1) + repeated group (2) + element present (3) + // rep=0: start of a new list + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "timestamps", + Type::List(crate::spec::ListType { + element_field: NestedField::optional( + 2, + "element", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let list_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected ListArray"); + let ts_array = list_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray inside list"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in list: 
got {}, expected {expected_micros}", + ts_array.value(0) + ); + } + + #[tokio::test] + async fn test_read_int96_timestamps_in_map() { + use arrow_array::{MapArray, TimestampMicrosecondArray}; + use parquet::basic::{Repetition, Type as PhysicalType}; + use parquet::data_type::{ByteArrayType, Int96Type}; + use parquet::file::writer::SerializedFileWriter; + use parquet::schema::types::Type as SchemaType; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/map_int96.parquet"); + + // MAP encoding: + // optional group ts_map (MAP) { + // repeated group key_value { + // required binary key (UTF8); + // optional int96 value; + // } + // } + let key_type = SchemaType::primitive_type_builder("key", PhysicalType::BYTE_ARRAY) + .with_repetition(Repetition::REQUIRED) + .with_logical_type(Some(parquet::basic::LogicalType::String)) + .with_id(Some(2)) + .build() + .unwrap(); + + let value_type = SchemaType::primitive_type_builder("value", PhysicalType::INT96) + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(3)) + .build() + .unwrap(); + + let key_value_group = SchemaType::group_type_builder("key_value") + .with_repetition(Repetition::REPEATED) + .with_fields(vec![Arc::new(key_type), Arc::new(value_type)]) + .build() + .unwrap(); + + let map_type = SchemaType::group_type_builder("ts_map") + .with_repetition(Repetition::OPTIONAL) + .with_id(Some(1)) + .with_logical_type(Some(parquet::basic::LogicalType::Map)) + .with_fields(vec![Arc::new(key_value_group)]) + .build() + .unwrap(); + + let parquet_schema = SchemaType::group_type_builder("schema") + .with_fields(vec![Arc::new(map_type)]) + .build() + .unwrap(); + + let (int96_val, expected_micros) = make_int96_test_value(); + + let file = File::create(&file_path).unwrap(); + let mut writer = + SerializedFileWriter::new(file, Arc::new(parquet_schema), Default::default()).unwrap(); + + // Write a single row with a map 
containing one key-value pair. + // rep=0 for both columns: start of a new map. + // key def=2: map present (1) + key_value entry present (2), key is REQUIRED. + // value def=3: map present (1) + key_value entry present (2) + value present (3). + let mut row_group = writer.next_row_group().unwrap(); + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch( + &[parquet::data_type::ByteArray::from("event_time")], + Some(&[2]), + Some(&[0]), + ) + .unwrap(); + col.close().unwrap(); + } + { + let mut col = row_group.next_column().unwrap().unwrap(); + col.typed::() + .write_batch(&[int96_val], Some(&[3]), Some(&[0])) + .unwrap(); + col.close().unwrap(); + } + row_group.close().unwrap(); + writer.close().unwrap(); + + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional( + 1, + "ts_map", + Type::Map(crate::spec::MapType { + key_field: NestedField::required( + 2, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::optional( + 3, + "value", + Type::Primitive(PrimitiveType::Timestamp), + ) + .into(), + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let batches = read_int96_batches(&file_path, iceberg_schema, vec![1]).await; + + assert_eq!(batches.len(), 1); + let map_array = batches[0] + .column(0) + .as_any() + .downcast_ref::() + .expect("Expected MapArray"); + let ts_array = map_array + .values() + .as_any() + .downcast_ref::() + .expect("Expected TimestampMicrosecondArray as map values"); + + assert_eq!( + ts_array.value(0), + expected_micros, + "INT96 in map: got {}, expected {expected_micros}", + ts_array.value(0) + ); + } +} diff --git a/crates/iceberg/src/arrow/reader/positional_deletes.rs b/crates/iceberg/src/arrow/reader/positional_deletes.rs new file mode 100644 index 0000000000..b2993572c5 --- /dev/null +++ b/crates/iceberg/src/arrow/reader/positional_deletes.rs @@ -0,0 +1,934 @@ +// Licensed to the Apache 
Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Positional delete handling for `ArrowReader`: converting a `DeleteVector` +//! into a Parquet `RowSelection` that skips the deleted rows, while respecting +//! any row-group selection made by the predicate evaluator. + +use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; +use parquet::file::metadata::RowGroupMetaData; + +use super::ArrowReader; +use crate::delete_vector::DeleteVector; +use crate::error::Result; + +impl ArrowReader { + /// computes a `RowSelection` from positional delete indices. 
+ /// + /// Using the row counts from the row group metadata, we build a `RowSelection` that rejects rows that are indicated + /// as having been deleted by a positional delete, taking into account any row groups that have + /// been skipped entirely by the filter predicate + pub(super) fn build_deletes_row_selection( + row_group_metadata_list: &[RowGroupMetaData], + selected_row_groups: &Option>, + positional_deletes: &DeleteVector, + ) -> Result { + let mut results: Vec = Vec::new(); + let mut selected_row_groups_idx = 0; + let mut current_row_group_base_idx: u64 = 0; + let mut delete_vector_iter = positional_deletes.iter(); + let mut next_deleted_row_idx_opt = delete_vector_iter.next(); + + for (idx, row_group_metadata) in row_group_metadata_list.iter().enumerate() { + let row_group_num_rows = row_group_metadata.num_rows() as u64; + let next_row_group_base_idx = current_row_group_base_idx + row_group_num_rows; + + // if row group selection is enabled, + if let Some(selected_row_groups) = selected_row_groups { + // if we've consumed all the selected row groups, we're done + if selected_row_groups_idx == selected_row_groups.len() { + break; + } + + if idx == selected_row_groups[selected_row_groups_idx] { + // we're in a selected row group. Increment selected_row_groups_idx + // so that next time around the for loop we're looking for the next + // selected row group + selected_row_groups_idx += 1; + } else { + // Advance iterator past all deletes in the skipped row group. + // advance_to() positions the iterator to the first delete >= next_row_group_base_idx. + // However, if our cached next_deleted_row_idx_opt is in the skipped range, + // we need to call next() to update the cache with the newly positioned value.
+ delete_vector_iter.advance_to(next_row_group_base_idx); + // Only update the cache if the cached value is stale (in the skipped range) + if let Some(cached_idx) = next_deleted_row_idx_opt + && cached_idx < next_row_group_base_idx + { + next_deleted_row_idx_opt = delete_vector_iter.next(); + } + + // still increment the current row group base index but then skip to the next row group + // in the file + current_row_group_base_idx += row_group_num_rows; + continue; + } + } + + let mut next_deleted_row_idx = match next_deleted_row_idx_opt { + Some(next_deleted_row_idx) => { + // if the index of the next deleted row is beyond this row group, add a selection for + // the remainder of this row group and skip to the next row group + if next_deleted_row_idx >= next_row_group_base_idx { + results.push(RowSelector::select(row_group_num_rows as usize)); + current_row_group_base_idx += row_group_num_rows; + continue; + } + + next_deleted_row_idx + } + + // If there are no more pos deletes, add a selector for the entirety of this row group.
+ _ => { + results.push(RowSelector::select(row_group_num_rows as usize)); + current_row_group_base_idx += row_group_num_rows; + continue; + } + }; + + let mut current_idx = current_row_group_base_idx; + 'chunks: while next_deleted_row_idx < next_row_group_base_idx { + // `select` all rows that precede the next delete index + if current_idx < next_deleted_row_idx { + let run_length = next_deleted_row_idx - current_idx; + results.push(RowSelector::select(run_length as usize)); + current_idx += run_length; + } + + // `skip` all consecutive deleted rows in the current row group + let mut run_length = 0; + while next_deleted_row_idx == current_idx + && next_deleted_row_idx < next_row_group_base_idx + { + run_length += 1; + current_idx += 1; + + next_deleted_row_idx_opt = delete_vector_iter.next(); + next_deleted_row_idx = match next_deleted_row_idx_opt { + Some(next_deleted_row_idx) => next_deleted_row_idx, + _ => { + // We've processed the final positional delete. + // Conclude the skip and then break so that we select the remaining + // rows in the row group and move on to the next row group + results.push(RowSelector::skip(run_length)); + break 'chunks; + } + }; + } + if run_length > 0 { + results.push(RowSelector::skip(run_length)); + } + } + + if current_idx < next_row_group_base_idx { + results.push(RowSelector::select( + (next_row_group_base_idx - current_idx) as usize, + )); + } + + current_row_group_base_idx += row_group_num_rows; + } + + Ok(results.into()) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::fs::File; + use std::sync::Arc; + + use arrow_array::cast::AsArray; + use arrow_array::{RecordBatch, StringArray}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use futures::TryStreamExt; + use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; + use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; + use parquet::basic::Compression; + use parquet::file::metadata::{ColumnChunkMetaData, 
RowGroupMetaData}; + use parquet::file::properties::WriterProperties; + use parquet::schema::types::{SchemaDescPtr, SchemaDescriptor}; + use roaring::RoaringTreemap; + use tempfile::TempDir; + + use crate::arrow::{ArrowReader, ArrowReaderBuilder}; + use crate::delete_vector::DeleteVector; + use crate::io::FileIO; + use crate::scan::{FileScanTask, FileScanTaskDeleteFile, FileScanTaskStream}; + use crate::spec::{DataContentType, DataFileFormat, NestedField, PrimitiveType, Schema, Type}; + + fn build_test_row_group_meta( + schema_descr: SchemaDescPtr, + columns: Vec, + num_rows: i64, + ordinal: i16, + ) -> RowGroupMetaData { + RowGroupMetaData::builder(schema_descr.clone()) + .set_num_rows(num_rows) + .set_total_byte_size(2000) + .set_column_metadata(columns) + .set_ordinal(ordinal) + .build() + .unwrap() + } + + fn get_test_schema_descr() -> SchemaDescPtr { + use parquet::schema::types::Type as SchemaType; + + let schema = SchemaType::group_type_builder("schema") + .with_fields(vec![ + Arc::new( + SchemaType::primitive_type_builder("a", parquet::basic::Type::INT32) + .build() + .unwrap(), + ), + Arc::new( + SchemaType::primitive_type_builder("b", parquet::basic::Type::INT32) + .build() + .unwrap(), + ), + ]) + .build() + .unwrap(); + + Arc::new(SchemaDescriptor::new(Arc::new(schema))) + } + + #[test] + fn test_build_deletes_row_selection() { + let schema_descr = get_test_schema_descr(); + + let mut columns = vec![]; + for ptr in schema_descr.columns() { + let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap(); + columns.push(column); + } + + let row_groups_metadata = vec![ + build_test_row_group_meta(schema_descr.clone(), columns.clone(), 1000, 0), + build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 1), + build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 2), + build_test_row_group_meta(schema_descr.clone(), columns.clone(), 1000, 3), + build_test_row_group_meta(schema_descr.clone(), columns.clone(), 500, 4), + 
]; + + let selected_row_groups = Some(vec![1, 3]); + + /* cases to cover: + * {skip|select} {first|intermediate|last} {one row|multiple rows} in + {first|intermediate|last} {skipped|selected} row group + * row group selection disabled + */ + + let positional_deletes = RoaringTreemap::from_iter(&[ + 1, // in skipped rg 0, should be ignored + 3, // run of three consecutive items in skipped rg0 + 4, 5, 998, // two consecutive items at end of skipped rg0 + 999, 1000, // solitary row at start of selected rg1 (1, 9) + 1010, // run of 3 rows in selected rg1 + 1011, 1012, // (3, 485) + 1498, // run of two items at end of selected rg1 + 1499, 1500, // run of two items at start of skipped rg2 + 1501, 1600, // should ignore, in skipped rg2 + 1999, // single row at end of skipped rg2 + 2000, // run of two items at start of selected rg3 + 2001, // (4, 98) + 2100, // single row in selected row group 3 (1, 99) + 2200, // run of 3 consecutive rows in selected row group 3 + 2201, 2202, // (3, 796) + 2999, // single item at end of selected rg3 (1) + 3000, // single item at start of skipped rg4 + ]); + + let positional_deletes = DeleteVector::new(positional_deletes); + + // using selected row groups 1 and 3 + let result = ArrowReader::build_deletes_row_selection( + &row_groups_metadata, + &selected_row_groups, + &positional_deletes, + ) + .unwrap(); + + let expected = RowSelection::from(vec![ + RowSelector::skip(1), + RowSelector::select(9), + RowSelector::skip(3), + RowSelector::select(485), + RowSelector::skip(4), + RowSelector::select(98), + RowSelector::skip(1), + RowSelector::select(99), + RowSelector::skip(3), + RowSelector::select(796), + RowSelector::skip(1), + ]); + + assert_eq!(result, expected); + + // selecting all row groups + let result = ArrowReader::build_deletes_row_selection( + &row_groups_metadata, + &None, + &positional_deletes, + ) + .unwrap(); + + let expected = RowSelection::from(vec![ + RowSelector::select(1), + RowSelector::skip(1), + RowSelector::select(1), 
+ RowSelector::skip(3), + RowSelector::select(992), + RowSelector::skip(3), + RowSelector::select(9), + RowSelector::skip(3), + RowSelector::select(485), + RowSelector::skip(4), + RowSelector::select(98), + RowSelector::skip(1), + RowSelector::select(398), + RowSelector::skip(3), + RowSelector::select(98), + RowSelector::skip(1), + RowSelector::select(99), + RowSelector::skip(3), + RowSelector::select(796), + RowSelector::skip(2), + RowSelector::select(499), + ]); + + assert_eq!(result, expected); + } + + /// Test for bug where position deletes in later row groups are not applied correctly. + /// + /// When a file has multiple row groups and a position delete targets a row in a later + /// row group, the `build_deletes_row_selection` function had a bug where it would + /// fail to increment `current_row_group_base_idx` when skipping row groups. + /// + /// This test creates: + /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) + /// - A position delete file that deletes row 199 (last row in second row group) + /// + /// Expected behavior: Should return 199 rows (with id=200 deleted) + /// Bug behavior: Returns 200 rows (delete is not applied) + /// + /// This bug was discovered while running Apache Spark + Apache Iceberg integration tests + /// through DataFusion Comet. 
The following Iceberg Java tests failed due to this bug: + /// - `org.apache.iceberg.spark.extensions.TestMergeOnReadDelete::testDeleteWithMultipleRowGroupsParquet` + /// - `org.apache.iceberg.spark.extensions.TestMergeOnReadUpdate::testUpdateWithMultipleRowGroupsParquet` + #[tokio::test] + async fn test_position_delete_across_multiple_row_groups() { + use arrow_array::{Int32Array, Int64Array}; + use parquet::file::reader::{FileReader, SerializedFileReader}; + + // Field IDs for positional delete schema + const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; + const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + + // Create table schema with a single 'id' column + let table_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + // Step 1: Create data file with 200 rows in 2 row groups + // Row group 0: rows 0-99 (ids 1-100) + // Row group 1: rows 100-199 (ids 101-200) + let data_file_path = format!("{table_location}/data.parquet"); + + let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( + Int32Array::from_iter_values(1..=100), + )]) + .unwrap(); + + let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( + Int32Array::from_iter_values(101..=200), + )]) + .unwrap(); + + // Force each batch into its own row group + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_max_row_group_row_count(Some(100)) + .build(); + + let file = File::create(&data_file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), 
Some(props)).unwrap(); + writer.write(&batch1).expect("Writing batch 1"); + writer.write(&batch2).expect("Writing batch 2"); + writer.close().unwrap(); + + // Verify we created 2 row groups + let verify_file = File::open(&data_file_path).unwrap(); + let verify_reader = SerializedFileReader::new(verify_file).unwrap(); + assert_eq!( + verify_reader.metadata().num_row_groups(), + 2, + "Should have 2 row groups" + ); + + // Step 2: Create position delete file that deletes row 199 (id=200, last row in row group 1) + let delete_file_path = format!("{table_location}/deletes.parquet"); + + let delete_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), + )])), + Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_POS.to_string(), + )])), + ])); + + // Delete row at position 199 (0-indexed, so it's the last row: id=200) + let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![ + Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])), + Arc::new(Int64Array::from_iter_values(vec![199i64])), + ]) + .unwrap(); + + let delete_props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let delete_file = File::create(&delete_file_path).unwrap(); + let mut delete_writer = + ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap(); + delete_writer.write(&delete_batch).unwrap(); + delete_writer.close().unwrap(); + + // Step 3: Read the data file with the delete applied + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let task = FileScanTask { + file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(), + start: 0, + length: 0, + record_count: Some(200), + data_file_path: data_file_path.clone(), 
+ data_file_format: DataFileFormat::Parquet, + schema: table_schema.clone(), + project_field_ids: vec![1], + predicate: None, + deletes: vec![FileScanTaskDeleteFile { + file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(), + file_path: delete_file_path, + file_type: DataContentType::PositionDeletes, + partition_spec_id: 0, + equality_ids: None, + }], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + // Step 4: Verify we got 199 rows (not 200) + let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); + + println!("Total rows read: {total_rows}"); + println!("Expected: 199 rows (deleted row 199 which had id=200)"); + + // This assertion will FAIL before the fix and PASS after the fix + assert_eq!( + total_rows, 199, + "Expected 199 rows after deleting row 199, but got {total_rows} rows. \ + The bug causes position deletes in later row groups to be ignored." + ); + + // Verify the deleted row (id=200) is not present + let all_ids: Vec = result + .iter() + .flat_map(|batch| { + batch + .column(0) + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect(); + + assert!( + !all_ids.contains(&200), + "Row with id=200 should be deleted but was found in results" + ); + + // Verify we have all other ids (1-199) + let expected_ids: Vec = (1..=199).collect(); + assert_eq!( + all_ids, expected_ids, + "Should have ids 1-199 but got different values" + ); + } + + /// Test for bug where position deletes are lost when skipping unselected row groups. + /// + /// This is a variant of `test_position_delete_across_multiple_row_groups` that exercises + /// the row group selection code path (`selected_row_groups: Some([...])`). 
+ /// + /// When a file has multiple row groups and only some are selected for reading, + /// the `build_deletes_row_selection` function must correctly skip over deletes in + /// unselected row groups WITHOUT consuming deletes that belong to selected row groups. + /// + /// This test creates: + /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) + /// - A position delete file that deletes row 199 (last row in second row group) + /// - Row group selection that reads ONLY row group 1 (rows 100-199) + /// + /// Expected behavior: Should return 99 rows (with row 199 deleted) + /// Bug behavior: Returns 100 rows (delete is lost when skipping row group 0) + /// + /// The bug occurs when processing row group 0 (unselected): + /// ```rust + /// delete_vector_iter.advance_to(next_row_group_base_idx); // Position at first delete >= 100 + /// next_deleted_row_idx_opt = delete_vector_iter.next(); // BUG: Consumes delete at 199! + /// ``` + /// + /// The fix is to call `next()` after `advance_to()` only when the cached delete index is stale (inside the skipped row group), + /// because `advance_to()` already positions the iterator correctly without consuming elements.
+ #[tokio::test] + async fn test_position_delete_with_row_group_selection() { + use arrow_array::{Int32Array, Int64Array}; + use parquet::file::reader::{FileReader, SerializedFileReader}; + + // Field IDs for positional delete schema + const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; + const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + + // Create table schema with a single 'id' column + let table_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + // Step 1: Create data file with 200 rows in 2 row groups + // Row group 0: rows 0-99 (ids 1-100) + // Row group 1: rows 100-199 (ids 101-200) + let data_file_path = format!("{table_location}/data.parquet"); + + let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( + Int32Array::from_iter_values(1..=100), + )]) + .unwrap(); + + let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( + Int32Array::from_iter_values(101..=200), + )]) + .unwrap(); + + // Force each batch into its own row group + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_max_row_group_row_count(Some(100)) + .build(); + + let file = File::create(&data_file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); + writer.write(&batch1).expect("Writing batch 1"); + writer.write(&batch2).expect("Writing batch 2"); + writer.close().unwrap(); + + // Verify we created 2 row groups + let verify_file = File::open(&data_file_path).unwrap(); + let verify_reader = 
SerializedFileReader::new(verify_file).unwrap(); + assert_eq!( + verify_reader.metadata().num_row_groups(), + 2, + "Should have 2 row groups" + ); + + // Step 2: Create position delete file that deletes row 199 (id=200, last row in row group 1) + let delete_file_path = format!("{table_location}/deletes.parquet"); + + let delete_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), + )])), + Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_POS.to_string(), + )])), + ])); + + // Delete row at position 199 (0-indexed, so it's the last row: id=200) + let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![ + Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])), + Arc::new(Int64Array::from_iter_values(vec![199i64])), + ]) + .unwrap(); + + let delete_props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let delete_file = File::create(&delete_file_path).unwrap(); + let mut delete_writer = + ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap(); + delete_writer.write(&delete_batch).unwrap(); + delete_writer.close().unwrap(); + + // Step 3: Get byte ranges to read ONLY row group 1 (rows 100-199) + // This exercises the row group selection code path where row group 0 is skipped + let metadata_file = File::open(&data_file_path).unwrap(); + let metadata_reader = SerializedFileReader::new(metadata_file).unwrap(); + let metadata = metadata_reader.metadata(); + + let row_group_0 = metadata.row_group(0); + let row_group_1 = metadata.row_group(1); + + let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1" + let rg1_start = rg0_start + row_group_0.compressed_size() as u64; + let rg1_length = row_group_1.compressed_size() as u64; 
+ + println!( + "Row group 0: starts at byte {}, {} bytes compressed", + rg0_start, + row_group_0.compressed_size() + ); + println!( + "Row group 1: starts at byte {}, {} bytes compressed", + rg1_start, + row_group_1.compressed_size() + ); + + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + // Create FileScanTask that reads ONLY row group 1 via byte range filtering + let task = FileScanTask { + file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(), + start: rg1_start, + length: rg1_length, + record_count: Some(100), // Row group 1 has 100 rows + data_file_path: data_file_path.clone(), + data_file_format: DataFileFormat::Parquet, + schema: table_schema.clone(), + project_field_ids: vec![1], + predicate: None, + deletes: vec![FileScanTaskDeleteFile { + file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(), + file_path: delete_file_path, + file_type: DataContentType::PositionDeletes, + partition_spec_id: 0, + equality_ids: None, + }], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + // Step 4: Verify we got 99 rows (not 100) + // Row group 1 has 100 rows (ids 101-200), minus 1 delete (id=200) = 99 rows + let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); + + println!("Total rows read from row group 1: {total_rows}"); + println!("Expected: 99 rows (row group 1 has 100 rows, 1 delete at position 199)"); + + // This assertion will FAIL before the fix and PASS after the fix + assert_eq!( + total_rows, 99, + "Expected 99 rows from row group 1 after deleting position 199, but got {total_rows} rows. \ + The bug causes position deletes to be lost when advance_to() is followed by next() \ + when skipping unselected row groups." 
+ ); + + // Verify the deleted row (id=200) is not present + let all_ids: Vec = result + .iter() + .flat_map(|batch| { + batch + .column(0) + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect(); + + assert!( + !all_ids.contains(&200), + "Row with id=200 should be deleted but was found in results" + ); + + // Verify we have ids 101-199 (not 101-200) + let expected_ids: Vec = (101..=199).collect(); + assert_eq!( + all_ids, expected_ids, + "Should have ids 101-199 but got different values" + ); + } + + /// Test for bug where stale cached delete causes infinite loop when skipping row groups. + /// + /// This test exposes the inverse scenario of `test_position_delete_with_row_group_selection`: + /// - Position delete targets a row in the SKIPPED row group (not the selected one) + /// - After calling advance_to(), the cached delete index is stale + /// - Without updating the cache, the code enters an infinite loop + /// + /// This test creates: + /// - A data file with 200 rows split into 2 row groups (0-99, 100-199) + /// - A position delete file that deletes row 0 (first row in SKIPPED row group 0) + /// - Row group selection that reads ONLY row group 1 (rows 100-199) + /// + /// The bug occurs when skipping row group 0: + /// ```rust + /// let mut next_deleted_row_idx_opt = delete_vector_iter.next(); // Some(0) + /// // ... skip to row group 1 ... + /// delete_vector_iter.advance_to(100); // Iterator advances past delete at 0 + /// // BUG: next_deleted_row_idx_opt is still Some(0) - STALE! + /// // When processing row group 1: + /// // current_idx = 100, next_deleted_row_idx = 0, next_row_group_base_idx = 200 + /// // Loop condition: 0 < 200 (true) + /// // But: current_idx (100) > next_deleted_row_idx (0) + /// // And: current_idx (100) != next_deleted_row_idx (0) + /// // Neither branch executes -> INFINITE LOOP! 
+ /// ``` + /// + /// Expected behavior: Should return 100 rows (delete at 0 doesn't affect row group 1) + /// Bug behavior: Infinite loop in build_deletes_row_selection + #[tokio::test] + async fn test_position_delete_in_skipped_row_group() { + use arrow_array::{Int32Array, Int64Array}; + use parquet::file::reader::{FileReader, SerializedFileReader}; + + // Field IDs for positional delete schema + const FIELD_ID_POSITIONAL_DELETE_FILE_PATH: u64 = 2147483546; + const FIELD_ID_POSITIONAL_DELETE_POS: u64 = 2147483545; + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + + // Create table schema with a single 'id' column + let table_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + // Step 1: Create data file with 200 rows in 2 row groups + // Row group 0: rows 0-99 (ids 1-100) + // Row group 1: rows 100-199 (ids 101-200) + let data_file_path = format!("{table_location}/data.parquet"); + + let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( + Int32Array::from_iter_values(1..=100), + )]) + .unwrap(); + + let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new( + Int32Array::from_iter_values(101..=200), + )]) + .unwrap(); + + // Force each batch into its own row group + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_max_row_group_row_count(Some(100)) + .build(); + + let file = File::create(&data_file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); + writer.write(&batch1).expect("Writing batch 1"); + writer.write(&batch2).expect("Writing 
batch 2"); + writer.close().unwrap(); + + // Verify we created 2 row groups + let verify_file = File::open(&data_file_path).unwrap(); + let verify_reader = SerializedFileReader::new(verify_file).unwrap(); + assert_eq!( + verify_reader.metadata().num_row_groups(), + 2, + "Should have 2 row groups" + ); + + // Step 2: Create position delete file that deletes row 0 (id=1, first row in row group 0) + let delete_file_path = format!("{table_location}/deletes.parquet"); + + let delete_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("file_path", DataType::Utf8, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_FILE_PATH.to_string(), + )])), + Field::new("pos", DataType::Int64, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + FIELD_ID_POSITIONAL_DELETE_POS.to_string(), + )])), + ])); + + // Delete row at position 0 (0-indexed, so it's the first row: id=1) + let delete_batch = RecordBatch::try_new(delete_schema.clone(), vec![ + Arc::new(StringArray::from_iter_values(vec![data_file_path.clone()])), + Arc::new(Int64Array::from_iter_values(vec![0i64])), + ]) + .unwrap(); + + let delete_props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let delete_file = File::create(&delete_file_path).unwrap(); + let mut delete_writer = + ArrowWriter::try_new(delete_file, delete_schema, Some(delete_props)).unwrap(); + delete_writer.write(&delete_batch).unwrap(); + delete_writer.close().unwrap(); + + // Step 3: Get byte ranges to read ONLY row group 1 (rows 100-199) + // This exercises the row group selection code path where row group 0 is skipped + let metadata_file = File::open(&data_file_path).unwrap(); + let metadata_reader = SerializedFileReader::new(metadata_file).unwrap(); + let metadata = metadata_reader.metadata(); + + let row_group_0 = metadata.row_group(0); + let row_group_1 = metadata.row_group(1); + + let rg0_start = 4u64; // Parquet files start 
with 4-byte magic "PAR1" + let rg1_start = rg0_start + row_group_0.compressed_size() as u64; + let rg1_length = row_group_1.compressed_size() as u64; + + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + // Create FileScanTask that reads ONLY row group 1 via byte range filtering + let task = FileScanTask { + file_size_in_bytes: std::fs::metadata(&data_file_path).unwrap().len(), + start: rg1_start, + length: rg1_length, + record_count: Some(100), // Row group 1 has 100 rows + data_file_path: data_file_path.clone(), + data_file_format: DataFileFormat::Parquet, + schema: table_schema.clone(), + project_field_ids: vec![1], + predicate: None, + deletes: vec![FileScanTaskDeleteFile { + file_size_in_bytes: std::fs::metadata(&delete_file_path).unwrap().len(), + file_path: delete_file_path, + file_type: DataContentType::PositionDeletes, + partition_spec_id: 0, + equality_ids: None, + }], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks = Box::pin(futures::stream::iter(vec![Ok(task)])) as FileScanTaskStream; + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + // Step 4: Verify we got 100 rows (all of row group 1) + // The delete at position 0 is in row group 0, which is skipped, so it doesn't affect us + let total_rows: usize = result.iter().map(|b| b.num_rows()).sum(); + + assert_eq!( + total_rows, 100, + "Expected 100 rows from row group 1 (delete at position 0 is in skipped row group 0). \ + If this hangs or fails, it indicates the cached delete index was not updated after advance_to()." 
+ ); + + // Verify we have all ids from row group 1 (101-200) + let all_ids: Vec = result + .iter() + .flat_map(|batch| { + batch + .column(0) + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect(); + + let expected_ids: Vec = (101..=200).collect(); + assert_eq!( + all_ids, expected_ids, + "Should have ids 101-200 (all of row group 1)" + ); + } +} diff --git a/crates/iceberg/src/arrow/reader/predicate_visitor.rs b/crates/iceberg/src/arrow/reader/predicate_visitor.rs new file mode 100644 index 0000000000..272de49390 --- /dev/null +++ b/crates/iceberg/src/arrow/reader/predicate_visitor.rs @@ -0,0 +1,820 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Visitors that translate Iceberg bound predicates into the pieces needed for +//! Arrow-level evaluation: collecting referenced field IDs and producing +//! per-record-batch predicate closures. 
+ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use arrow_arith::boolean::{and, and_kleene, is_not_null, is_null, not, or, or_kleene}; +use arrow_array::cast::AsArray; +use arrow_array::types::{Float32Type, Float64Type}; +use arrow_array::{Array, ArrayRef, BooleanArray, Datum as ArrowDatum, RecordBatch, Scalar}; +use arrow_buffer::BooleanBuffer; +use arrow_cast::cast::cast; +use arrow_ord::cmp::{eq, gt, gt_eq, lt, lt_eq, neq}; +use arrow_schema::{ArrowError, DataType}; +use arrow_string::like::starts_with; +use fnv::FnvHashSet; +use parquet::schema::types::SchemaDescriptor; + +use crate::arrow::get_arrow_datum; +use crate::error::Result; +use crate::expr::visitors::bound_predicate_visitor::BoundPredicateVisitor; +use crate::expr::{BoundPredicate, BoundReference}; +use crate::spec::Datum; +use crate::{Error, ErrorKind}; + +/// A visitor to collect field ids from bound predicates. +pub(super) struct CollectFieldIdVisitor { + pub(super) field_ids: HashSet, +} + +impl CollectFieldIdVisitor { + pub(super) fn field_ids(self) -> HashSet { + self.field_ids + } +} + +impl BoundPredicateVisitor for CollectFieldIdVisitor { + type T = (); + + fn always_true(&mut self) -> Result<()> { + Ok(()) + } + + fn always_false(&mut self) -> Result<()> { + Ok(()) + } + + fn and(&mut self, _lhs: (), _rhs: ()) -> Result<()> { + Ok(()) + } + + fn or(&mut self, _lhs: (), _rhs: ()) -> Result<()> { + Ok(()) + } + + fn not(&mut self, _inner: ()) -> Result<()> { + Ok(()) + } + + fn is_null(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } + + fn not_null(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } + + fn is_nan(&mut self, reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } + + fn not_nan(&mut self, 
reference: &BoundReference, _predicate: &BoundPredicate) -> Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } + + fn less_than( + &mut self, + reference: &BoundReference, + _literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } + + fn less_than_or_eq( + &mut self, + reference: &BoundReference, + _literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } + + fn greater_than( + &mut self, + reference: &BoundReference, + _literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } + + fn greater_than_or_eq( + &mut self, + reference: &BoundReference, + _literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } + + fn eq( + &mut self, + reference: &BoundReference, + _literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } + + fn not_eq( + &mut self, + reference: &BoundReference, + _literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } + + fn starts_with( + &mut self, + reference: &BoundReference, + _literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } + + fn not_starts_with( + &mut self, + reference: &BoundReference, + _literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } + + fn r#in( + &mut self, + reference: &BoundReference, + _literals: &FnvHashSet, + _predicate: &BoundPredicate, + ) -> Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } + + fn not_in( + &mut self, + reference: &BoundReference, + _literals: &FnvHashSet, + _predicate: &BoundPredicate, + ) -> 
Result<()> { + self.field_ids.insert(reference.field().id); + Ok(()) + } +} + +/// A visitor to convert Iceberg bound predicates to Arrow predicates. +pub(super) struct PredicateConverter<'a> { + /// The Parquet schema descriptor. + pub(super) parquet_schema: &'a SchemaDescriptor, + /// The map between field id and leaf column index in Parquet schema. + pub(super) column_map: &'a HashMap, + /// The required column indices in Parquet schema for the predicates. + pub(super) column_indices: &'a Vec, +} + +impl PredicateConverter<'_> { + /// When visiting a bound reference, we return index of the leaf column in the + /// required column indices which is used to project the column in the record batch. + /// Return None if the field id is not found in the column map, which is possible + /// due to schema evolution. + fn bound_reference(&mut self, reference: &BoundReference) -> Result> { + // The leaf column's index in Parquet schema. + if let Some(column_idx) = self.column_map.get(&reference.field().id) { + if self.parquet_schema.get_column_root(*column_idx).is_group() { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Leaf column `{}` in predicates isn't a root column in Parquet schema.", + reference.field().name + ), + )); + } + + // The leaf column's index in the required column indices. + let index = self + .column_indices + .iter() + .position(|&idx| idx == *column_idx) + .ok_or(Error::new( + ErrorKind::DataInvalid, + format!( + "Leaf column `{}` in predicates cannot be found in the required column indices.", + reference.field().name + ), + ))?; + + Ok(Some(index)) + } else { + Ok(None) + } + } + + /// Build an Arrow predicate that always returns true. + fn build_always_true(&self) -> Result> { + Ok(Box::new(|batch| { + Ok(BooleanArray::from(vec![true; batch.num_rows()])) + })) + } + + /// Build an Arrow predicate that always returns false. 
+ fn build_always_false(&self) -> Result> { + Ok(Box::new(|batch| { + Ok(BooleanArray::from(vec![false; batch.num_rows()])) + })) + } +} + +/// Gets the leaf column from the record batch for the required column index. Only +/// supports top-level columns for now. +fn project_column( + batch: &RecordBatch, + column_idx: usize, +) -> std::result::Result { + let column = batch.column(column_idx); + + match column.data_type() { + DataType::Struct(_) => Err(ArrowError::SchemaError( + "Does not support struct column yet.".to_string(), + )), + _ => Ok(column.clone()), + } +} + +fn compute_is_nan(array: &ArrayRef) -> std::result::Result { + // Compute NaN over the contiguous values slice, then fold the null bitmap + // in with a single bitwise AND so that null slots become false. + let (is_nan, nulls) = match array.data_type() { + DataType::Float32 => { + let arr = array.as_primitive::(); + ( + BooleanBuffer::from_iter(arr.values().iter().map(|v| v.is_nan())), + arr.nulls(), + ) + } + DataType::Float64 => { + let arr = array.as_primitive::(); + ( + BooleanBuffer::from_iter(arr.values().iter().map(|v| v.is_nan())), + arr.nulls(), + ) + } + _ => unreachable!("is_nan is only valid for float types"), + }; + + let values = match nulls { + Some(nulls) => &is_nan & nulls.inner(), + None => is_nan, + }; + + Ok(BooleanArray::new(values, None)) +} + +pub(super) type PredicateResult = + dyn FnMut(RecordBatch) -> std::result::Result + Send + 'static; + +impl BoundPredicateVisitor for PredicateConverter<'_> { + type T = Box; + + fn always_true(&mut self) -> Result> { + self.build_always_true() + } + + fn always_false(&mut self) -> Result> { + self.build_always_false() + } + + fn and( + &mut self, + mut lhs: Box, + mut rhs: Box, + ) -> Result> { + Ok(Box::new(move |batch| { + let left = lhs(batch.clone())?; + let right = rhs(batch)?; + and_kleene(&left, &right) + })) + } + + fn or( + &mut self, + mut lhs: Box, + mut rhs: Box, + ) -> Result> { + Ok(Box::new(move |batch| { + let left = 
lhs(batch.clone())?; + let right = rhs(batch)?; + or_kleene(&left, &right) + })) + } + + fn not(&mut self, mut inner: Box) -> Result> { + Ok(Box::new(move |batch| { + let pred_ret = inner(batch)?; + not(&pred_ret) + })) + } + + fn is_null( + &mut self, + reference: &BoundReference, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + Ok(Box::new(move |batch| { + let column = project_column(&batch, idx)?; + is_null(&column) + })) + } else { + // A missing column, treating it as null. + self.build_always_true() + } + } + + fn not_null( + &mut self, + reference: &BoundReference, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + Ok(Box::new(move |batch| { + let column = project_column(&batch, idx)?; + is_not_null(&column) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } + } + + fn is_nan( + &mut self, + reference: &BoundReference, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + Ok(Box::new(move |batch| { + let column = project_column(&batch, idx)?; + compute_is_nan(&column) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } + } + + fn not_nan( + &mut self, + reference: &BoundReference, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + Ok(Box::new(move |batch| { + let column = project_column(&batch, idx)?; + let is_nan = compute_is_nan(&column)?; + not(&is_nan) + })) + } else { + // A missing column, treating it as null. + self.build_always_true() + } + } + + fn less_than( + &mut self, + reference: &BoundReference, + literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? 
{ + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + let literal = try_cast_literal(&literal, left.data_type())?; + lt(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_true() + } + } + + fn less_than_or_eq( + &mut self, + reference: &BoundReference, + literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + let literal = try_cast_literal(&literal, left.data_type())?; + lt_eq(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_true() + } + } + + fn greater_than( + &mut self, + reference: &BoundReference, + literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + let literal = try_cast_literal(&literal, left.data_type())?; + gt(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } + } + + fn greater_than_or_eq( + &mut self, + reference: &BoundReference, + literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + let literal = try_cast_literal(&literal, left.data_type())?; + gt_eq(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } + } + + fn eq( + &mut self, + reference: &BoundReference, + literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? 
{ + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + let literal = try_cast_literal(&literal, left.data_type())?; + eq(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } + } + + fn not_eq( + &mut self, + reference: &BoundReference, + literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + let literal = try_cast_literal(&literal, left.data_type())?; + neq(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } + } + + fn starts_with( + &mut self, + reference: &BoundReference, + literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + let literal = try_cast_literal(&literal, left.data_type())?; + starts_with(&left, literal.as_ref()) + })) + } else { + // A missing column, treating it as null. + self.build_always_false() + } + } + + fn not_starts_with( + &mut self, + reference: &BoundReference, + literal: &Datum, + _predicate: &BoundPredicate, + ) -> Result> { + if let Some(idx) = self.bound_reference(reference)? { + let literal = get_arrow_datum(literal)?; + + Ok(Box::new(move |batch| { + let left = project_column(&batch, idx)?; + let literal = try_cast_literal(&literal, left.data_type())?; + // update here if arrow ever adds a native not_starts_with + not(&starts_with(&left, literal.as_ref())?) + })) + } else { + // A missing column, treating it as null. 
+            self.build_always_true()
+        }
+    }
+
+    fn r#in(
+        &mut self,
+        reference: &BoundReference,
+        literals: &FnvHashSet<Datum>,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            // Convert each literal once, up front; a failed conversion is returned as `Err`.
+            let literals: Vec<_> = literals
+                .iter()
+                .map(get_arrow_datum)
+                .collect::<Result<_>>()?;
+            Ok(Box::new(move |batch| {
+                // OR one `eq` mask per literal; replace if arrow ever adds a native is_in kernel
+                let left = project_column(&batch, idx)?;
+
+                let mut acc = BooleanArray::from(vec![false; batch.num_rows()]);
+                for literal in &literals {
+                    let literal = try_cast_literal(literal, left.data_type())?;
+                    acc = or(&acc, &eq(&left, literal.as_ref())?)?;
+                }
+
+                Ok(acc)
+            }))
+        } else {
+            // Column is missing from the file and treated as all-null, so `IN` matches no rows.
+            self.build_always_false()
+        }
+    }
+
+    fn not_in(
+        &mut self,
+        reference: &BoundReference,
+        literals: &FnvHashSet<Datum>,
+        _predicate: &BoundPredicate,
+    ) -> Result<Box<PredicateResult>> {
+        if let Some(idx) = self.bound_reference(reference)? {
+            // Convert each literal once, up front; a failed conversion is returned as `Err`.
+            let literals: Vec<_> = literals
+                .iter()
+                .map(get_arrow_datum)
+                .collect::<Result<_>>()?;
+            Ok(Box::new(move |batch| {
+                // AND one `neq` mask per literal; replace if arrow ever adds a native not_in kernel
+                let left = project_column(&batch, idx)?;
+                let mut acc = BooleanArray::from(vec![true; batch.num_rows()]);
+                for literal in &literals {
+                    let literal = try_cast_literal(literal, left.data_type())?;
+                    acc = and(&acc, &neq(&left, literal.as_ref())?)?;
+                }
+
+                Ok(acc)
+            }))
+        } else {
+            // Column is missing from the file and treated as all-null, so `NOT IN` keeps every row.
+            self.build_always_true()
+        }
+    }
+}
+
+/// The Arrow type of an array that the Parquet reader reads may not match the exact Arrow type
+/// that Iceberg uses for literals - but they are effectively the same logical type,
+/// e.g. LargeUtf8 and Utf8 or Utf8View and Utf8 or Utf8View and LargeUtf8.
+///
+/// The Arrow compute kernels that we use must match the type exactly, so first cast the literal
+/// into the type of the batch we read from Parquet before sending it to the compute kernel.
+fn try_cast_literal( + literal: &Arc, + column_type: &DataType, +) -> std::result::Result, ArrowError> { + let literal_array = literal.get().0; + + // No cast required + if literal_array.data_type() == column_type { + return Ok(Arc::clone(literal)); + } + + let literal_array = cast(literal_array, column_type)?; + Ok(Arc::new(Scalar::new(literal_array))) +} + +#[cfg(test)] +mod tests { + use std::collections::{HashMap, HashSet}; + use std::sync::Arc; + + use arrow_array::{Array, BooleanArray, RecordBatch}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use parquet::schema::parser::parse_message_type; + use parquet::schema::types::SchemaDescriptor; + + use super::{CollectFieldIdVisitor, PredicateConverter}; + use crate::expr::visitors::bound_predicate_visitor::visit; + use crate::expr::{Bind, Predicate, Reference}; + use crate::spec::{NestedField, PrimitiveType, Schema, SchemaRef, Type}; + + fn table_schema_simple() -> SchemaRef { + Arc::new( + Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![2]) + .with_fields(vec![ + NestedField::optional(1, "foo", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "bar", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(3, "baz", Type::Primitive(PrimitiveType::Boolean)).into(), + NestedField::optional(4, "qux", Type::Primitive(PrimitiveType::Float)).into(), + ]) + .build() + .unwrap(), + ) + } + + #[test] + fn test_collect_field_id() { + let schema = table_schema_simple(); + let expr = Reference::new("qux").is_null(); + let bound_expr = expr.bind(schema, true).unwrap(); + + let mut visitor = CollectFieldIdVisitor { + field_ids: HashSet::default(), + }; + visit(&mut visitor, &bound_expr).unwrap(); + + let mut expected = HashSet::default(); + expected.insert(4_i32); + + assert_eq!(visitor.field_ids, expected); + } + + #[test] + fn test_collect_field_id_with_and() { + let schema = table_schema_simple(); + let expr = Reference::new("qux") + .is_null() 
+ .and(Reference::new("baz").is_null()); + let bound_expr = expr.bind(schema, true).unwrap(); + + let mut visitor = CollectFieldIdVisitor { + field_ids: HashSet::default(), + }; + visit(&mut visitor, &bound_expr).unwrap(); + + let mut expected = HashSet::default(); + expected.insert(4_i32); + expected.insert(3); + + assert_eq!(visitor.field_ids, expected); + } + + #[test] + fn test_collect_field_id_with_or() { + let schema = table_schema_simple(); + let expr = Reference::new("qux") + .is_null() + .or(Reference::new("baz").is_null()); + let bound_expr = expr.bind(schema, true).unwrap(); + + let mut visitor = CollectFieldIdVisitor { + field_ids: HashSet::default(), + }; + visit(&mut visitor, &bound_expr).unwrap(); + + let mut expected = HashSet::default(); + expected.insert(4_i32); + expected.insert(3); + + assert_eq!(visitor.field_ids, expected); + } + + fn apply_predicate_to_batch( + predicate: Predicate, + schema: SchemaRef, + batch: RecordBatch, + ) -> BooleanArray { + let bound = predicate.bind(schema, true).unwrap(); + + // Build a trivial Parquet schema with one float column at field id 4 + let message_type = " + message schema { + optional float qux = 4; + } + "; + let parquet_type = parse_message_type(message_type).expect("parse schema"); + let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_type)); + + let column_map = HashMap::from([(4i32, 0usize)]); + let column_indices = vec![0usize]; + + let mut converter = PredicateConverter { + parquet_schema: &parquet_schema, + column_map: &column_map, + column_indices: &column_indices, + }; + + let mut predicate_fn = visit(&mut converter, &bound).unwrap(); + predicate_fn(batch).unwrap() + } + + #[test] + fn test_predicate_converter_nan() { + use arrow_array::Float32Array; + + let schema = table_schema_simple(); + let arrow_schema = Arc::new(ArrowSchema::new(vec![Field::new( + "qux", + DataType::Float32, + true, + )])); + let values = vec![Some(1.0f32), Some(f32::NAN), None, Some(0.0f32)]; + + // is_nan: 
non-null-propagating per Java's implementation - NULL → false + let batch = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Float32Array::from( + values.clone(), + ))]) + .unwrap(); + let result = + apply_predicate_to_batch(Reference::new("qux").is_nan(), schema.clone(), batch); + assert_eq!( + [ + result.value(0), + result.value(1), + result.value(2), + result.value(3) + ], + [false, true, false, false] + ); + assert!(!result.is_null(2)); + + // not_nan: non-null-propagating per Java's implementation - NULL → true + let batch = + RecordBatch::try_new(arrow_schema, vec![Arc::new(Float32Array::from(values))]).unwrap(); + let result = apply_predicate_to_batch(Reference::new("qux").is_not_nan(), schema, batch); + assert_eq!( + [ + result.value(0), + result.value(1), + result.value(2), + result.value(3) + ], + [true, false, true, true] + ); + assert!(!result.is_null(2)); + } +} diff --git a/crates/iceberg/src/arrow/reader/projection.rs b/crates/iceberg/src/arrow/reader/projection.rs new file mode 100644 index 0000000000..6d1a0f927d --- /dev/null +++ b/crates/iceberg/src/arrow/reader/projection.rs @@ -0,0 +1,1920 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! 
Column projection for `ArrowReader`: building the Parquet projection mask +//! from Iceberg field IDs, and mapping field IDs between Iceberg and Parquet +//! (including fallback handling for files without embedded IDs). + +use std::collections::{HashMap, HashSet}; +use std::str::FromStr; +use std::sync::Arc; + +use arrow_schema::{Field, Schema as ArrowSchema, SchemaRef as ArrowSchemaRef}; +use parquet::arrow::{PARQUET_FIELD_ID_META_KEY, ProjectionMask}; +use parquet::schema::types::{SchemaDescriptor, Type as ParquetType}; + +use super::{ArrowReader, CollectFieldIdVisitor}; +use crate::arrow::arrow_schema_to_schema; +use crate::error::Result; +use crate::expr::BoundPredicate; +use crate::expr::visitors::bound_predicate_visitor::visit; +use crate::spec::{NameMapping, NestedField, PrimitiveType, Schema, Type}; +use crate::{Error, ErrorKind}; + +impl ArrowReader { + pub(super) fn build_field_id_set_and_map( + parquet_schema: &SchemaDescriptor, + predicate: &BoundPredicate, + ) -> Result<(HashSet, HashMap)> { + // Collects all Iceberg field IDs referenced in the filter predicate + let mut collector = CollectFieldIdVisitor { + field_ids: HashSet::default(), + }; + visit(&mut collector, predicate)?; + + let iceberg_field_ids = collector.field_ids(); + + // Without embedded field IDs, we fall back to position-based mapping for compatibility + let field_id_map = match build_field_id_map(parquet_schema)? { + Some(map) => map, + None => build_fallback_field_id_map(parquet_schema), + }; + + Ok((iceberg_field_ids, field_id_map)) + } + + /// Recursively extract leaf field IDs because Parquet projection works at the leaf column level. + /// Nested types (struct/list/map) are flattened in Parquet's columnar format. 
+ fn include_leaf_field_id(field: &NestedField, field_ids: &mut Vec) { + match field.field_type.as_ref() { + Type::Primitive(_) => { + field_ids.push(field.id); + } + Type::Struct(struct_type) => { + for nested_field in struct_type.fields() { + Self::include_leaf_field_id(nested_field, field_ids); + } + } + Type::List(list_type) => { + Self::include_leaf_field_id(&list_type.element_field, field_ids); + } + Type::Map(map_type) => { + Self::include_leaf_field_id(&map_type.key_field, field_ids); + Self::include_leaf_field_id(&map_type.value_field, field_ids); + } + } + } + + pub(super) fn get_arrow_projection_mask( + field_ids: &[i32], + iceberg_schema_of_task: &Schema, + parquet_schema: &SchemaDescriptor, + arrow_schema: &ArrowSchemaRef, + use_fallback: bool, // Whether file lacks embedded field IDs (e.g., migrated from Hive/Spark) + ) -> Result { + fn type_promotion_is_valid( + file_type: Option<&PrimitiveType>, + projected_type: Option<&PrimitiveType>, + ) -> bool { + match (file_type, projected_type) { + (Some(lhs), Some(rhs)) if lhs == rhs => true, + (Some(PrimitiveType::Int), Some(PrimitiveType::Long)) => true, + (Some(PrimitiveType::Float), Some(PrimitiveType::Double)) => true, + ( + Some(PrimitiveType::Decimal { + precision: file_precision, + scale: file_scale, + }), + Some(PrimitiveType::Decimal { + precision: requested_precision, + scale: requested_scale, + }), + ) if requested_precision >= file_precision && file_scale == requested_scale => true, + // Uuid will be store as Fixed(16) in parquet file, so the read back type will be Fixed(16). + (Some(PrimitiveType::Fixed(16)), Some(PrimitiveType::Uuid)) => true, + // Some Parquet writers (e.g. Snowflake) store FIXED_LEN_BYTE_ARRAY as + // Arrow Binary rather than FixedSizeBinary. Allow Binary -> Fixed(N) + // since the underlying bytes are the same. 
+ (Some(PrimitiveType::Binary), Some(PrimitiveType::Fixed(_))) => true, + _ => false, + } + } + + if field_ids.is_empty() { + return Ok(ProjectionMask::all()); + } + + if use_fallback { + // Position-based projection necessary because file lacks embedded field IDs + Self::get_arrow_projection_mask_fallback(field_ids, parquet_schema) + } else { + // Field-ID-based projection using embedded field IDs from Parquet metadata + + // Parquet's columnar format requires leaf-level (not top-level struct/list/map) projection + let mut leaf_field_ids = vec![]; + for field_id in field_ids { + let field = iceberg_schema_of_task.field_by_id(*field_id); + if let Some(field) = field { + Self::include_leaf_field_id(field, &mut leaf_field_ids); + } + } + + Self::get_arrow_projection_mask_with_field_ids( + &leaf_field_ids, + iceberg_schema_of_task, + parquet_schema, + arrow_schema, + type_promotion_is_valid, + ) + } + } + + /// Standard projection using embedded field IDs from Parquet metadata. + /// For iceberg-java compatibility with ParquetSchemaUtil.pruneColumns(). + fn get_arrow_projection_mask_with_field_ids( + leaf_field_ids: &[i32], + iceberg_schema_of_task: &Schema, + parquet_schema: &SchemaDescriptor, + arrow_schema: &ArrowSchemaRef, + type_promotion_is_valid: fn(Option<&PrimitiveType>, Option<&PrimitiveType>) -> bool, + ) -> Result { + let mut column_map = HashMap::new(); + let fields = arrow_schema.fields(); + + // Pre-project only the fields that have been selected, possibly avoiding converting + // some Arrow types that are not yet supported. 
+ let mut projected_fields: HashMap = HashMap::new(); + let projected_arrow_schema = ArrowSchema::new_with_metadata( + fields.filter_leaves(|_, f| { + f.metadata() + .get(PARQUET_FIELD_ID_META_KEY) + .and_then(|field_id| i32::from_str(field_id).ok()) + .is_some_and(|field_id| { + projected_fields.insert((*f).clone(), field_id); + leaf_field_ids.contains(&field_id) + }) + }), + arrow_schema.metadata().clone(), + ); + let iceberg_schema = arrow_schema_to_schema(&projected_arrow_schema)?; + + fields.filter_leaves(|idx, field| { + let Some(field_id) = projected_fields.get(field).cloned() else { + return false; + }; + + let iceberg_field = iceberg_schema_of_task.field_by_id(field_id); + let parquet_iceberg_field = iceberg_schema.field_by_id(field_id); + + if iceberg_field.is_none() || parquet_iceberg_field.is_none() { + return false; + } + + if !type_promotion_is_valid( + parquet_iceberg_field + .unwrap() + .field_type + .as_primitive_type(), + iceberg_field.unwrap().field_type.as_primitive_type(), + ) { + return false; + } + + column_map.insert(field_id, idx); + true + }); + + // Schema evolution: New columns may not exist in old Parquet files. + // We only project existing columns; RecordBatchTransformer adds default/NULL values. + let mut indices = vec![]; + for field_id in leaf_field_ids { + if let Some(col_idx) = column_map.get(field_id) { + indices.push(*col_idx); + } + } + + if indices.is_empty() { + // Edge case: All requested columns are new (don't exist in file). + // Project all columns so RecordBatchTransformer has a batch to transform. + Ok(ProjectionMask::all()) + } else { + Ok(ProjectionMask::leaves(parquet_schema, indices)) + } + } + + /// Fallback projection for Parquet files without field IDs. + /// Uses position-based matching: field ID N → column position N-1. + /// Projects entire top-level columns (including nested content) for iceberg-java compatibility. 
+ fn get_arrow_projection_mask_fallback( + field_ids: &[i32], + parquet_schema: &SchemaDescriptor, + ) -> Result { + // Position-based: field_id N → column N-1 (field IDs are 1-indexed) + let parquet_root_fields = parquet_schema.root_schema().get_fields(); + let mut root_indices = vec![]; + + for field_id in field_ids.iter() { + let parquet_pos = (*field_id - 1) as usize; + + if parquet_pos < parquet_root_fields.len() { + root_indices.push(parquet_pos); + } + // RecordBatchTransformer adds missing columns with NULL values + } + + if root_indices.is_empty() { + Ok(ProjectionMask::all()) + } else { + Ok(ProjectionMask::roots(parquet_schema, root_indices)) + } + } +} + +/// Build the map of parquet field id to Parquet column index in the schema. +/// Returns None if the Parquet file doesn't have field IDs embedded (e.g., migrated tables). +pub(super) fn build_field_id_map( + parquet_schema: &SchemaDescriptor, +) -> Result>> { + let mut column_map = HashMap::new(); + + for (idx, field) in parquet_schema.columns().iter().enumerate() { + let field_type = field.self_type(); + match field_type { + ParquetType::PrimitiveType { basic_info, .. } => { + if !basic_info.has_id() { + return Ok(None); + } + column_map.insert(basic_info.id(), idx); + } + ParquetType::GroupType { .. } => { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Leaf column in schema should be primitive type but got {field_type:?}" + ), + )); + } + }; + } + + Ok(Some(column_map)) +} + +/// Build a fallback field ID map for Parquet files without embedded field IDs. +/// +/// Returns the number of primitive (leaf) columns in a Parquet type, recursing into groups. +fn leaf_count(ty: &parquet::schema::types::Type) -> usize { + if ty.is_primitive() { + 1 + } else { + ty.get_fields().iter().map(|f| leaf_count(f)).sum() + } +} + +/// Builds a mapping from fallback field IDs to leaf column indices for Parquet files +/// without embedded field IDs. Returns entries only for primitive top-level fields. 
+/// +/// Must use top-level field positions (not leaf column positions) to stay consistent +/// with `add_fallback_field_ids_to_arrow_schema`, which assigns ordinal IDs to +/// top-level Arrow fields. Using leaf positions instead would produce wrong indices +/// when nested types (struct/list/map) expand into multiple leaf columns. +/// +/// Mirrors iceberg-java's ParquetSchemaUtil.addFallbackIds() which iterates +/// fileSchema.getFields() assigning ordinal IDs to top-level fields. +pub(super) fn build_fallback_field_id_map( + parquet_schema: &SchemaDescriptor, +) -> HashMap { + let mut column_map = HashMap::new(); + let mut leaf_idx = 0; + + for (top_pos, field) in parquet_schema.root_schema().get_fields().iter().enumerate() { + let field_id = (top_pos + 1) as i32; + if field.is_primitive() { + column_map.insert(field_id, leaf_idx); + } + leaf_idx += leaf_count(field); + } + + column_map +} + +/// Apply name mapping to Arrow schema for Parquet files lacking field IDs. +/// +/// Assigns Iceberg field IDs based on column names using the name mapping, +/// enabling correct projection on migrated files (e.g., from Hive/Spark via add_files). +/// +/// Per Iceberg spec Column Projection rule #2: +/// "Use schema.name-mapping.default metadata to map field id to columns without field id" +/// https://iceberg.apache.org/spec/#column-projection +/// +/// Corresponds to Java's ParquetSchemaUtil.applyNameMapping() and ApplyNameMapping visitor. +/// The key difference is Java operates on Parquet MessageType, while we operate on Arrow Schema. 
+/// +/// # Arguments +/// * `arrow_schema` - Arrow schema from Parquet file (without field IDs) +/// * `name_mapping` - Name mapping from table metadata (TableProperties.DEFAULT_NAME_MAPPING) +/// +/// # Returns +/// Arrow schema with field IDs assigned based on name mapping +pub(super) fn apply_name_mapping_to_arrow_schema( + arrow_schema: ArrowSchemaRef, + name_mapping: &NameMapping, +) -> Result> { + debug_assert!( + arrow_schema + .fields() + .iter() + .next() + .is_none_or(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()), + "Schema already has field IDs - name mapping should not be applied" + ); + + let fields_with_mapped_ids: Vec<_> = arrow_schema + .fields() + .iter() + .map(|field| { + // Look up this column name in name mapping to get the Iceberg field ID. + // Corresponds to Java's ApplyNameMapping visitor which calls + // nameMapping.find(currentPath()) and returns field.withId() if found. + // + // If the field isn't in the mapping, leave it WITHOUT assigning an ID + // (matching Java's behavior of returning the field unchanged). + // Later, during projection, fields without IDs are filtered out. + let mapped_field_opt = name_mapping + .fields() + .iter() + .find(|f| f.names().contains(&field.name().to_string())); + + let mut metadata = field.metadata().clone(); + + if let Some(mapped_field) = mapped_field_opt + && let Some(field_id) = mapped_field.field_id() + { + // Field found in mapping with a field_id → assign it + metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), field_id.to_string()); + } + // If field_id is None, leave the field without an ID (will be filtered by projection) + + Field::new(field.name(), field.data_type().clone(), field.is_nullable()) + .with_metadata(metadata) + }) + .collect(); + + Ok(Arc::new(ArrowSchema::new_with_metadata( + fields_with_mapped_ids, + arrow_schema.metadata().clone(), + ))) +} + +/// Add position-based fallback field IDs to Arrow schema for Parquet files lacking them. 
+/// Enables projection on migrated files (e.g., from Hive/Spark). +/// +/// Why at schema level (not per-batch): Efficiency - avoids repeated schema modification. +/// Why only top-level: Nested projection uses leaf column indices, not parent struct IDs. +/// Why 1-indexed: Compatibility with iceberg-java's ParquetSchemaUtil.addFallbackIds(). +pub(super) fn add_fallback_field_ids_to_arrow_schema( + arrow_schema: &ArrowSchemaRef, +) -> Arc { + debug_assert!( + arrow_schema + .fields() + .iter() + .next() + .is_none_or(|f| f.metadata().get(PARQUET_FIELD_ID_META_KEY).is_none()), + "Schema already has field IDs" + ); + + let fields_with_fallback_ids: Vec<_> = arrow_schema + .fields() + .iter() + .enumerate() + .map(|(pos, field)| { + let mut metadata = field.metadata().clone(); + let field_id = (pos + 1) as i32; // 1-indexed for Java compatibility + metadata.insert(PARQUET_FIELD_ID_META_KEY.to_string(), field_id.to_string()); + + Field::new(field.name(), field.data_type().clone(), field.is_nullable()) + .with_metadata(metadata) + }) + .collect(); + + Arc::new(ArrowSchema::new_with_metadata( + fields_with_fallback_ids, + arrow_schema.metadata().clone(), + )) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::fs::File; + use std::sync::Arc; + + use arrow_array::cast::AsArray; + use arrow_array::{ + Array, ArrayRef, BinaryArray, FixedSizeBinaryArray, Int32Array, RecordBatch, StringArray, + }; + use arrow_schema::{DataType, Field, Schema as ArrowSchema, TimeUnit}; + use futures::TryStreamExt; + use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY, ProjectionMask}; + use parquet::basic::Compression; + use parquet::file::properties::WriterProperties; + use parquet::schema::parser::parse_message_type; + use parquet::schema::types::SchemaDescriptor; + use tempfile::TempDir; + + use crate::ErrorKind; + use crate::arrow::{ArrowReader, ArrowReaderBuilder}; + use crate::expr::{Bind, Reference}; + use crate::io::FileIO; + use 
crate::scan::{FileScanTask, FileScanTaskStream}; + use crate::spec::{DataFileFormat, Datum, NestedField, PrimitiveType, Schema, Type}; + + #[test] + fn test_arrow_projection_mask() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_identifier_field_ids(vec![1]) + .with_fields(vec![ + NestedField::required(1, "c1", Type::Primitive(PrimitiveType::String)).into(), + NestedField::optional(2, "c2", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional( + 3, + "c3", + Type::Primitive(PrimitiveType::Decimal { + precision: 38, + scale: 3, + }), + ) + .into(), + ]) + .build() + .unwrap(), + ); + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("c1", DataType::Utf8, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + // Type not supported + Field::new("c2", DataType::Duration(TimeUnit::Microsecond), true).with_metadata( + HashMap::from([(PARQUET_FIELD_ID_META_KEY.to_string(), "2".to_string())]), + ), + // Precision is beyond the supported range + Field::new("c3", DataType::Decimal128(39, 3), true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "3".to_string(), + )])), + ])); + + let message_type = " +message schema { + required binary c1 (STRING) = 1; + optional int32 c2 (INTEGER(8,true)) = 2; + optional fixed_len_byte_array(17) c3 (DECIMAL(39,3)) = 3; +} + "; + let parquet_type = parse_message_type(message_type).expect("should parse schema"); + let parquet_schema = SchemaDescriptor::new(Arc::new(parquet_type)); + + // Try projecting the fields c2 and c3 with the unsupported data types + let err = ArrowReader::get_arrow_projection_mask( + &[1, 2, 3], + &schema, + &parquet_schema, + &arrow_schema, + false, + ) + .unwrap_err(); + + assert_eq!(err.kind(), ErrorKind::DataInvalid); + assert_eq!( + err.to_string(), + "DataInvalid => Unsupported Arrow data type: Duration(µs)".to_string() + ); + + // Omitting field c2, we still get an error due to 
c3 being selected + let err = ArrowReader::get_arrow_projection_mask( + &[1, 3], + &schema, + &parquet_schema, + &arrow_schema, + false, + ) + .unwrap_err(); + + assert_eq!(err.kind(), ErrorKind::DataInvalid); + assert_eq!( + err.to_string(), + "DataInvalid => Failed to create decimal type, source: DataInvalid => Decimals with precision larger than 38 are not supported: 39".to_string() + ); + + // Finally avoid selecting fields with unsupported data types + let mask = ArrowReader::get_arrow_projection_mask( + &[1], + &schema, + &parquet_schema, + &arrow_schema, + false, + ) + .expect("Some ProjectionMask"); + assert_eq!(mask, ProjectionMask::leaves(&parquet_schema, vec![0])); + } + + /// Test schema evolution: reading old Parquet file (with only column 'a') + /// using a newer table schema (with columns 'a' and 'b'). + /// This tests that: + /// 1. get_arrow_projection_mask allows missing columns + /// 2. RecordBatchTransformer adds missing column 'b' with NULL values + #[tokio::test] + async fn test_schema_evolution_add_column() { + use arrow_array::{Array, Int32Array}; + + // New table schema: columns 'a' and 'b' (b was added later, file only has 'a') + let new_schema = Arc::new( + Schema::builder() + .with_schema_id(2) + .with_fields(vec![ + NestedField::required(1, "a", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "b", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + // Create Arrow schema for old Parquet file (only has column 'a') + let arrow_schema_old = Arc::new(ArrowSchema::new(vec![ + Field::new("a", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + // Write old Parquet file with only column 'a' + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let data_a = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef; + 
let to_write = RecordBatch::try_new(arrow_schema_old.clone(), vec![data_a]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let file = File::create(format!("{table_location}/old_file.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + // Read the old Parquet file using the NEW schema (with column 'b') + let reader = ArrowReaderBuilder::new(file_io).build(); + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/old_file.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/old_file.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: new_schema.clone(), + project_field_ids: vec![1, 2], // Request both columns 'a' and 'b' + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + // Verify we got the correct data + assert_eq!(result.len(), 1); + let batch = &result[0]; + + // Should have 2 columns now + assert_eq!(batch.num_columns(), 2); + assert_eq!(batch.num_rows(), 3); + + // Column 'a' should have the original data + let col_a = batch + .column(0) + .as_primitive::(); + assert_eq!(col_a.values(), &[1, 2, 3]); + + // Column 'b' should be all NULLs (it didn't exist in the old file) + let col_b = batch + .column(1) + .as_primitive::(); + assert_eq!(col_b.null_count(), 3); + assert!(col_b.is_null(0)); + assert!(col_b.is_null(1)); + assert!(col_b.is_null(2)); + } + + /// Test reading Parquet files without field ID metadata (e.g., migrated tables). 
+ /// This exercises the position-based fallback path. + /// + /// Corresponds to Java's ParquetSchemaUtil.addFallbackIds() + pruneColumnsFallback() + /// in /parquet/src/main/java/org/apache/iceberg/parquet/ParquetSchemaUtil.java + #[tokio::test] + async fn test_read_parquet_file_without_field_ids() { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "age", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + // Parquet file from a migrated table - no field ID metadata + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int32, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let name_data = vec!["Alice", "Bob", "Charlie"]; + let age_data = vec![30, 25, 35]; + + use arrow_array::Int32Array; + let name_col = Arc::new(StringArray::from(name_data.clone())) as ArrayRef; + let age_col = Arc::new(Int32Array::from(age_data.clone())) as ArrayRef; + + let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![name_col, age_col]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), 
+ data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 3); + assert_eq!(batch.num_columns(), 2); + + // Verify position-based mapping: field_id 1 → position 0, field_id 2 → position 1 + let name_array = batch.column(0).as_string::(); + assert_eq!(name_array.value(0), "Alice"); + assert_eq!(name_array.value(1), "Bob"); + assert_eq!(name_array.value(2), "Charlie"); + + let age_array = batch + .column(1) + .as_primitive::(); + assert_eq!(age_array.value(0), 30); + assert_eq!(age_array.value(1), 25); + assert_eq!(age_array.value(2), 35); + } + + /// Test reading Parquet files without field IDs with partial projection. + /// Only a subset of columns are requested, verifying position-based fallback + /// handles column selection correctly. 
+ #[tokio::test] + async fn test_read_parquet_without_field_ids_partial_projection() { + use arrow_array::Int32Array; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "col1", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "col2", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(3, "col3", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(4, "col4", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("col1", DataType::Utf8, false), + Field::new("col2", DataType::Int32, false), + Field::new("col3", DataType::Utf8, false), + Field::new("col4", DataType::Int32, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let col1_data = Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef; + let col2_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; + let col3_data = Arc::new(StringArray::from(vec!["c", "d"])) as ArrayRef; + let col4_data = Arc::new(Int32Array::from(vec![30, 40])) as ArrayRef; + + let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![ + col1_data, col2_data, col3_data, col4_data, + ]) + .unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + 
record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 3], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 2); + + let col1_array = batch.column(0).as_string::(); + assert_eq!(col1_array.value(0), "a"); + assert_eq!(col1_array.value(1), "b"); + + let col3_array = batch.column(1).as_string::(); + assert_eq!(col3_array.value(0), "c"); + assert_eq!(col3_array.value(1), "d"); + } + + /// Test reading Parquet files without field IDs with schema evolution. + /// The Iceberg schema has more fields than the Parquet file, testing that + /// missing columns are filled with NULLs. 
+ #[tokio::test] + async fn test_read_parquet_without_field_ids_schema_evolution() { + use arrow_array::{Array, Int32Array}; + + // Schema with field 3 added after the file was written + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "age", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(3, "city", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int32, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; + let age_data = Arc::new(Int32Array::from(vec![30, 25])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![name_data, age_data]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2, 3], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + 
case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 3); + + let name_array = batch.column(0).as_string::(); + assert_eq!(name_array.value(0), "Alice"); + assert_eq!(name_array.value(1), "Bob"); + + let age_array = batch + .column(1) + .as_primitive::(); + assert_eq!(age_array.value(0), 30); + assert_eq!(age_array.value(1), 25); + + // Verify missing column filled with NULLs + let city_array = batch.column(2).as_string::(); + assert_eq!(city_array.null_count(), 2); + assert!(city_array.is_null(0)); + assert!(city_array.is_null(1)); + } + + /// Test reading Parquet files without field IDs that have multiple row groups. + /// This ensures the position-based fallback works correctly across row group boundaries. + #[tokio::test] + async fn test_read_parquet_without_field_ids_multiple_row_groups() { + use arrow_array::Int32Array; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(2, "value", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("value", DataType::Int32, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + // Small row group size to create multiple row groups + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_write_batch_size(2) + .set_max_row_group_row_count(Some(2)) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = 
ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); + + // Write 6 rows in 3 batches (will create 3 row groups) + for batch_num in 0..3 { + let name_data = Arc::new(StringArray::from(vec![ + format!("name_{}", batch_num * 2), + format!("name_{}", batch_num * 2 + 1), + ])) as ArrayRef; + let value_data = + Arc::new(Int32Array::from(vec![batch_num * 2, batch_num * 2 + 1])) as ArrayRef; + + let batch = + RecordBatch::try_new(arrow_schema.clone(), vec![name_data, value_data]).unwrap(); + writer.write(&batch).expect("Writing batch"); + } + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + assert!(!result.is_empty()); + + let mut all_names = Vec::new(); + let mut all_values = Vec::new(); + + for batch in &result { + let name_array = batch.column(0).as_string::(); + let value_array = batch + .column(1) + .as_primitive::(); + + for i in 0..batch.num_rows() { + all_names.push(name_array.value(i).to_string()); + all_values.push(value_array.value(i)); + } + } + + assert_eq!(all_names.len(), 6); + assert_eq!(all_values.len(), 6); + + for i in 0..6 { + assert_eq!(all_names[i], format!("name_{i}")); + assert_eq!(all_values[i], i as i32); + } + } + + /// Test reading Parquet files without field IDs with nested types (struct). 
+ /// Java's pruneColumnsFallback() projects entire top-level columns including nested content. + /// This test verifies that a top-level struct field is projected correctly with all its nested fields. + #[tokio::test] + async fn test_read_parquet_without_field_ids_with_struct() { + use arrow_array::{Int32Array, StructArray}; + use arrow_schema::Fields; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required( + 2, + "person", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::required( + 3, + "name", + Type::Primitive(PrimitiveType::String), + ) + .into(), + NestedField::required(4, "age", Type::Primitive(PrimitiveType::Int)) + .into(), + ])), + ) + .into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new( + "person", + DataType::Struct(Fields::from(vec![ + Field::new("name", DataType::Utf8, false), + Field::new("age", DataType::Int32, false), + ])), + false, + ), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let id_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; + let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; + let age_data = Arc::new(Int32Array::from(vec![30, 25])) as ArrayRef; + let person_data = Arc::new(StructArray::from(vec![ + ( + Arc::new(Field::new("name", DataType::Utf8, false)), + name_data, + ), + ( + Arc::new(Field::new("age", DataType::Int32, false)), + age_data, + ), + ])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![id_data, person_data]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = 
File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 2); + + let id_array = batch + .column(0) + .as_primitive::(); + assert_eq!(id_array.value(0), 1); + assert_eq!(id_array.value(1), 2); + + let person_array = batch.column(1).as_struct(); + assert_eq!(person_array.num_columns(), 2); + + let name_array = person_array.column(0).as_string::(); + assert_eq!(name_array.value(0), "Alice"); + assert_eq!(name_array.value(1), "Bob"); + + let age_array = person_array + .column(1) + .as_primitive::(); + assert_eq!(age_array.value(0), 30); + assert_eq!(age_array.value(1), 25); + } + + /// Test reading Parquet files without field IDs with schema evolution - column added in the middle. + /// When a new column is inserted between existing columns in the schema order, + /// the fallback projection must correctly map field IDs to output positions. 
+ #[tokio::test] + async fn test_read_parquet_without_field_ids_schema_evolution_add_column_in_middle() { + use arrow_array::{Array, Int32Array}; + + let arrow_schema_old = Arc::new(ArrowSchema::new(vec![ + Field::new("col0", DataType::Int32, true), + Field::new("col1", DataType::Int32, true), + ])); + + // New column added between existing columns: col0 (id=1), newCol (id=5), col1 (id=2) + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "col0", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(5, "newCol", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "col1", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let col0_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; + let col1_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema_old.clone(), vec![col0_data, col1_data]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + let reader = ArrowReaderBuilder::new(file_io).build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 5, 2], + predicate: None, + deletes: vec![], + partition: None, + 
partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + assert_eq!(result.len(), 1); + let batch = &result[0]; + assert_eq!(batch.num_rows(), 2); + assert_eq!(batch.num_columns(), 3); + + let result_col0 = batch + .column(0) + .as_primitive::(); + assert_eq!(result_col0.value(0), 1); + assert_eq!(result_col0.value(1), 2); + + // New column should be NULL (doesn't exist in old file) + let result_newcol = batch + .column(1) + .as_primitive::(); + assert_eq!(result_newcol.null_count(), 2); + assert!(result_newcol.is_null(0)); + assert!(result_newcol.is_null(1)); + + let result_col1 = batch + .column(2) + .as_primitive::(); + assert_eq!(result_col1.value(0), 10); + assert_eq!(result_col1.value(1), 20); + } + + /// Test reading Parquet files without field IDs with a filter that eliminates all row groups. + /// During development of field ID mapping, we saw a panic when row_selection_enabled=true and + /// all row groups are filtered out. 
+ #[tokio::test] + async fn test_read_parquet_without_field_ids_filter_eliminates_all_rows() { + use arrow_array::{Float64Array, Int32Array}; + + // Schema with fields that will use fallback IDs 1, 2, 3 + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::required(2, "name", Type::Primitive(PrimitiveType::String)).into(), + NestedField::required(3, "value", Type::Primitive(PrimitiveType::Double)) + .into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + Field::new("value", DataType::Float64, false), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + // Write data where all ids are >= 10 + let id_data = Arc::new(Int32Array::from(vec![10, 11, 12])) as ArrayRef; + let name_data = Arc::new(StringArray::from(vec!["a", "b", "c"])) as ArrayRef; + let value_data = Arc::new(Float64Array::from(vec![100.0, 200.0, 300.0])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![id_data, name_data, value_data]) + .unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + // Filter that eliminates all row groups: id < 5 + let predicate = Reference::new("id").less_than(Datum::int(5)); + + // Enable both row_group_filtering and row_selection - triggered the panic + let reader = ArrowReaderBuilder::new(file_io) + .with_row_group_filtering_enabled(true) + .with_row_selection_enabled(true) + .build(); + + 
let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2, 3], + predicate: Some(predicate.bind(schema, true).unwrap()), + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + // Should no longer panic + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + // Should return empty results + assert!(result.is_empty() || result.iter().all(|batch| batch.num_rows() == 0)); + } + + /// Test bucket partitioning reads source column from data file (not partition metadata). + /// + /// This is an integration test verifying the complete ArrowReader pipeline with bucket partitioning. + /// It corresponds to TestRuntimeFiltering tests in Iceberg Java (e.g., testRenamedSourceColumnTable). 
+ /// + /// # Iceberg Spec Requirements + /// + /// Per the Iceberg spec "Column Projection" section: + /// > "Return the value from partition metadata if an **Identity Transform** exists for the field" + /// + /// This means: + /// - Identity transforms (e.g., `identity(dept)`) use constants from partition metadata + /// - Non-identity transforms (e.g., `bucket(4, id)`) must read source columns from data files + /// - Partition metadata for bucket transforms stores bucket numbers (0-3), NOT source values + /// + /// Java's PartitionUtil.constantsMap() implements this via: + /// ```java + /// if (field.transform().isIdentity()) { + /// idToConstant.put(field.sourceId(), converted); + /// } + /// ``` + /// + /// # What This Test Verifies + /// + /// This test ensures the full ArrowReader → RecordBatchTransformer pipeline correctly handles + /// bucket partitioning when FileScanTask provides partition_spec and partition_data: + /// + /// - Parquet file has field_id=1 named "id" with actual data [1, 5, 9, 13] + /// - FileScanTask specifies partition_spec with bucket(4, id) and partition_data with bucket=1 + /// - RecordBatchTransformer.constants_map() excludes bucket-partitioned field from constants + /// - ArrowReader correctly reads [1, 5, 9, 13] from the data file + /// - Values are NOT replaced with constant 1 from partition metadata + /// + /// # Why This Matters + /// + /// Without correct handling: + /// - Runtime filtering would break (e.g., `WHERE id = 5` would fail) + /// - Query results would be incorrect (all rows would have id=1) + /// - Bucket partitioning would be unusable for query optimization + /// + /// # References + /// - Iceberg spec: format/spec.md "Column Projection" + "Partition Transforms" + /// - Java test: spark/src/test/java/.../TestRuntimeFiltering.java + /// - Java impl: core/src/main/java/org/apache/iceberg/util/PartitionUtil.java + #[tokio::test] + async fn test_bucket_partitioning_reads_source_column_from_file() { + use 
arrow_array::Int32Array; + + use crate::spec::{Literal, PartitionSpec, Struct, Transform}; + + // Iceberg schema with id and name columns + let schema = Arc::new( + Schema::builder() + .with_schema_id(0) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "name", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(), + ); + + // Partition spec: bucket(4, id) + let partition_spec = Arc::new( + PartitionSpec::builder(schema.clone()) + .with_spec_id(0) + .add_partition_field("id", "id_bucket", Transform::Bucket(4)) + .unwrap() + .build() + .unwrap(), + ); + + // Partition data: bucket value is 1 + let partition_data = Struct::from_iter(vec![Some(Literal::int(1))]); + + // Create Arrow schema with field IDs for Parquet file + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + Field::new("name", DataType::Utf8, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "2".to_string(), + )])), + ])); + + // Write Parquet file with data + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_io = FileIO::new_with_fs(); + + let id_data = Arc::new(Int32Array::from(vec![1, 5, 9, 13])) as ArrayRef; + let name_data = + Arc::new(StringArray::from(vec!["Alice", "Bob", "Charlie", "Dave"])) as ArrayRef; + + let to_write = + RecordBatch::try_new(arrow_schema.clone(), vec![id_data, name_data]).unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let file = File::create(format!("{}/data.parquet", &table_location)).unwrap(); + let mut writer = ArrowWriter::try_new(file, to_write.schema(), Some(props)).unwrap(); + writer.write(&to_write).expect("Writing batch"); + writer.close().unwrap(); + + // Read the 
Parquet file with partition spec and data + let reader = ArrowReaderBuilder::new(file_io).build(); + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/data.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: format!("{table_location}/data.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: Some(partition_data), + partition_spec: Some(partition_spec), + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + // Verify we got the correct data + assert_eq!(result.len(), 1); + let batch = &result[0]; + + assert_eq!(batch.num_columns(), 2); + assert_eq!(batch.num_rows(), 4); + + // The id column MUST contain actual values from the Parquet file [1, 5, 9, 13], + // NOT the constant partition value 1 + let id_col = batch + .column(0) + .as_primitive::(); + assert_eq!(id_col.value(0), 1); + assert_eq!(id_col.value(1), 5); + assert_eq!(id_col.value(2), 9); + assert_eq!(id_col.value(3), 13); + + let name_col = batch.column(1).as_string::(); + assert_eq!(name_col.value(0), "Alice"); + assert_eq!(name_col.value(1), "Bob"); + assert_eq!(name_col.value(2), "Charlie"); + assert_eq!(name_col.value(3), "Dave"); + } + + /// Regression for : + /// predicate on a column after nested types in a migrated file (no field IDs). + /// Schema has struct, list, and map columns before the predicate target (`id`), + /// exercising the fallback field ID mapping across all nested type variants. 
+ #[tokio::test] + async fn test_predicate_on_migrated_file_with_nested_types() { + use serde::{Deserialize, Serialize}; + use serde_arrow::schema::{SchemaLike, TracingOptions}; + + #[derive(Serialize, Deserialize)] + struct Person { + name: String, + age: i32, + } + + #[derive(Serialize, Deserialize)] + struct Row { + person: Person, + people: Vec, + props: std::collections::BTreeMap, + id: i32, + } + + let rows = vec![ + Row { + person: Person { + name: "Alice".into(), + age: 30, + }, + people: vec![Person { + name: "Alice".into(), + age: 30, + }], + props: [("k1".into(), "v1".into())].into(), + id: 1, + }, + Row { + person: Person { + name: "Bob".into(), + age: 25, + }, + people: vec![Person { + name: "Bob".into(), + age: 25, + }], + props: [("k2".into(), "v2".into())].into(), + id: 2, + }, + Row { + person: Person { + name: "Carol".into(), + age: 40, + }, + people: vec![Person { + name: "Carol".into(), + age: 40, + }], + props: [("k3".into(), "v3".into())].into(), + id: 3, + }, + ]; + + let tracing_options = TracingOptions::default() + .map_as_struct(false) + .strings_as_large_utf8(false) + .sequence_as_large_list(false); + let fields = Vec::::from_type::(tracing_options).unwrap(); + let arrow_schema = Arc::new(ArrowSchema::new(fields.clone())); + let batch = serde_arrow::to_record_batch(&fields, &rows).unwrap(); + + // Fallback field IDs: person=1, people=2, props=3, id=4 + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required( + 1, + "person", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::required( + 5, + "name", + Type::Primitive(PrimitiveType::String), + ) + .into(), + NestedField::required(6, "age", Type::Primitive(PrimitiveType::Int)) + .into(), + ])), + ) + .into(), + NestedField::required( + 2, + "people", + Type::List(crate::spec::ListType { + element_field: NestedField::required( + 7, + "element", + Type::Struct(crate::spec::StructType::new(vec![ + NestedField::required( 
+ 8, + "name", + Type::Primitive(PrimitiveType::String), + ) + .into(), + NestedField::required( + 9, + "age", + Type::Primitive(PrimitiveType::Int), + ) + .into(), + ])), + ) + .into(), + }), + ) + .into(), + NestedField::required( + 3, + "props", + Type::Map(crate::spec::MapType { + key_field: NestedField::required( + 10, + "key", + Type::Primitive(PrimitiveType::String), + ) + .into(), + value_field: NestedField::required( + 11, + "value", + Type::Primitive(PrimitiveType::String), + ) + .into(), + }), + ) + .into(), + NestedField::required(4, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/1.parquet"); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let file = File::create(&file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema, Some(props)).unwrap(); + writer.write(&batch).expect("Writing batch"); + writer.close().unwrap(); + + let predicate = Reference::new("id").greater_than(Datum::int(1)); + + let reader = ArrowReaderBuilder::new(FileIO::new_with_fs()) + .with_row_group_filtering_enabled(true) + .with_row_selection_enabled(true) + .build(); + + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), + start: 0, + length: 0, + record_count: None, + data_file_path: file_path, + data_file_format: DataFileFormat::Parquet, + schema: iceberg_schema.clone(), + project_field_ids: vec![4], + predicate: Some(predicate.bind(iceberg_schema, true).unwrap()), + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + let 
ids: Vec = result + .iter() + .flat_map(|b| { + b.column(0) + .as_primitive::() + .values() + .iter() + .copied() + }) + .collect(); + assert_eq!(ids, vec![2, 3]); + } + + /// Test that a Parquet file written with Arrow Binary type can be read when the + /// Iceberg schema declares the column as Fixed(N). + /// + /// This reproduces a real-world issue where Snowflake writes `FIXED_LEN_BYTE_ARRAY` + /// columns that the Arrow Parquet reader decodes as `Binary` rather than + /// `FixedSizeBinary(N)`. Without the `(Binary, Fixed(_))` arm in + /// `type_promotion_is_valid`, the column is silently excluded from projection and + /// filled with nulls. + #[tokio::test] + async fn test_binary_to_fixed_type_promotion() { + // UUID-like 16-byte values + let uuid_bytes: Vec<[u8; 16]> = vec![ + [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + [ + 0xA1, 0xB2, 0xC3, 0xD4, 0xE5, 0xF6, 0x07, 0x18, 0x29, 0x3A, 0x4B, 0x5C, 0x6D, 0x7E, + 0x8F, 0x90, + ], + [0xFF; 16], + ]; + let int_data = vec![1i32, 2, 3]; + + // Iceberg schema: field 1 = Int, field 2 = Fixed(16) + let iceberg_schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + NestedField::optional(2, "uuid_col", Type::Primitive(PrimitiveType::Fixed(16))) + .into(), + ]) + .build() + .unwrap(), + ); + + // Arrow schema: write uuid_col as Binary (not FixedSizeBinary), simulating + // what the Arrow Parquet reader produces for some writers (e.g. Snowflake). 
+ let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + Field::new("uuid_col", DataType::Binary, true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "2".to_string(), + )])), + ])); + + let id_col = Arc::new(Int32Array::from(int_data.clone())) as ArrayRef; + let uuid_col = Arc::new(BinaryArray::from_vec( + uuid_bytes.iter().map(|b| b.as_slice()).collect(), + )) as ArrayRef; + + let batch = RecordBatch::try_new(arrow_schema.clone(), vec![id_col, uuid_col]).unwrap(); + + // Write Parquet file + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let parquet_path = format!("{table_location}/1.parquet"); + let file = File::create(&parquet_path).unwrap(); + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + let mut writer = ArrowWriter::try_new(file, batch.schema(), Some(props)).unwrap(); + writer.write(&batch).unwrap(); + writer.close().unwrap(); + + let file_io = FileIO::new_with_fs(); + let file_size = std::fs::metadata(&parquet_path).unwrap().len(); + let reader = ArrowReaderBuilder::new(file_io.clone()).build(); + + // --- Test 1: Full scan (all columns projected) --- + // This is the case that previously failed. 
+ { + let tasks = Box::pin(futures::stream::iter(vec![Ok(FileScanTask { + file_size_in_bytes: file_size, + start: 0, + length: 0, + record_count: None, + data_file_path: parquet_path.clone(), + data_file_format: DataFileFormat::Parquet, + schema: iceberg_schema.clone(), + project_field_ids: vec![1, 2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })])) as FileScanTaskStream; + + let batches: Vec = reader + .read(tasks) + .unwrap() + .stream() + .try_collect() + .await + .unwrap(); + + assert_eq!(batches.len(), 1); + let result = &batches[0]; + assert_eq!(result.num_rows(), 3); + assert_eq!(result.num_columns(), 2); + + // Verify id column + let id_arr = result + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(id_arr.values(), &int_data); + + // Verify uuid_col: data must come through as Binary, preserving every byte + let uuid_arr = result.column_by_name("uuid_col").unwrap(); + assert_eq!(uuid_arr.null_count(), 0, "uuid_col should have no nulls"); + // The transformer may cast Binary -> FixedSizeBinary to match the target schema + let uuid_values: Vec<&[u8]> = + if let Some(bin) = uuid_arr.as_any().downcast_ref::() { + (0..bin.len()).map(|i| bin.value(i)).collect() + } else if let Some(fsb) = uuid_arr.as_any().downcast_ref::() { + (0..fsb.len()).map(|i| fsb.value(i)).collect() + } else { + panic!("uuid_col has unexpected type: {}", uuid_arr.data_type()) + }; + for (i, expected) in uuid_bytes.iter().enumerate() { + assert_eq!( + uuid_values[i], + expected.as_slice(), + "uuid_col row {i} bytes mismatch" + ); + } + } + + // --- Test 2: Projected scan (only uuid_col) --- + { + let reader2 = ArrowReaderBuilder::new(file_io).build(); + let tasks = Box::pin(futures::stream::iter(vec![Ok(FileScanTask { + file_size_in_bytes: file_size, + start: 0, + length: 0, + record_count: None, + data_file_path: parquet_path.clone(), + data_file_format: 
DataFileFormat::Parquet, + schema: iceberg_schema.clone(), + project_field_ids: vec![2], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })])) as FileScanTaskStream; + + let batches: Vec = reader2 + .read(tasks) + .unwrap() + .stream() + .try_collect() + .await + .unwrap(); + + assert_eq!(batches.len(), 1); + let result = &batches[0]; + assert_eq!(result.num_rows(), 3); + assert_eq!(result.num_columns(), 1); + + let uuid_arr = result.column(0); + assert_eq!(uuid_arr.null_count(), 0, "uuid_col should have no nulls"); + let uuid_values: Vec<&[u8]> = + if let Some(bin) = uuid_arr.as_any().downcast_ref::() { + (0..bin.len()).map(|i| bin.value(i)).collect() + } else if let Some(fsb) = uuid_arr.as_any().downcast_ref::() { + (0..fsb.len()).map(|i| fsb.value(i)).collect() + } else { + panic!("uuid_col has unexpected type: {}", uuid_arr.data_type()) + }; + for (i, expected) in uuid_bytes.iter().enumerate() { + assert_eq!( + uuid_values[i], + expected.as_slice(), + "uuid_col row {i} bytes mismatch in projected scan" + ); + } + } + } +} diff --git a/crates/iceberg/src/arrow/reader/row_filter.rs b/crates/iceberg/src/arrow/reader/row_filter.rs new file mode 100644 index 0000000000..80432a0437 --- /dev/null +++ b/crates/iceberg/src/arrow/reader/row_filter.rs @@ -0,0 +1,619 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Predicate-driven row filtering for `ArrowReader`: constructing Arrow `RowFilter`s +//! from Iceberg predicates, row-group selection based on column statistics, and +//! row-selection via the Parquet page index. Also includes byte-range row-group +//! filtering used for file splitting. + +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use parquet::arrow::ProjectionMask; +use parquet::arrow::arrow_reader::{ArrowPredicateFn, RowFilter, RowSelection}; +use parquet::file::metadata::ParquetMetaData; +use parquet::schema::types::SchemaDescriptor; + +use super::{ArrowReader, PredicateConverter}; +use crate::error::Result; +use crate::expr::BoundPredicate; +use crate::expr::visitors::bound_predicate_visitor::visit; +use crate::expr::visitors::page_index_evaluator::PageIndexEvaluator; +use crate::expr::visitors::row_group_metrics_evaluator::RowGroupMetricsEvaluator; +use crate::spec::Schema; +use crate::{Error, ErrorKind}; + +impl ArrowReader { + pub(super) fn get_row_filter( + predicates: &BoundPredicate, + parquet_schema: &SchemaDescriptor, + iceberg_field_ids: &HashSet, + field_id_map: &HashMap, + ) -> Result { + // Collect Parquet column indices from field ids. + // If the field id is not found in Parquet schema, it will be ignored due to schema evolution. 
+ let mut column_indices = iceberg_field_ids + .iter() + .filter_map(|field_id| field_id_map.get(field_id).cloned()) + .collect::>(); + column_indices.sort(); + + // The converter that converts `BoundPredicates` to `ArrowPredicates` + let mut converter = PredicateConverter { + parquet_schema, + column_map: field_id_map, + column_indices: &column_indices, + }; + + // After collecting required leaf column indices used in the predicate, + // creates the projection mask for the Arrow predicates. + let projection_mask = ProjectionMask::leaves(parquet_schema, column_indices.clone()); + let predicate_func = visit(&mut converter, predicates)?; + let arrow_predicate = ArrowPredicateFn::new(projection_mask, predicate_func); + Ok(RowFilter::new(vec![Box::new(arrow_predicate)])) + } + + pub(super) fn get_selected_row_group_indices( + predicate: &BoundPredicate, + parquet_metadata: &Arc, + field_id_map: &HashMap, + snapshot_schema: &Schema, + ) -> Result> { + let row_groups_metadata = parquet_metadata.row_groups(); + let mut results = Vec::with_capacity(row_groups_metadata.len()); + + for (idx, row_group_metadata) in row_groups_metadata.iter().enumerate() { + if RowGroupMetricsEvaluator::eval( + predicate, + row_group_metadata, + field_id_map, + snapshot_schema, + )? 
{ + results.push(idx); + } + } + + Ok(results) + } + + pub(super) fn get_row_selection_for_filter_predicate( + predicate: &BoundPredicate, + parquet_metadata: &Arc, + selected_row_groups: &Option>, + field_id_map: &HashMap, + snapshot_schema: &Schema, + ) -> Result { + let Some(column_index) = parquet_metadata.column_index() else { + return Err(Error::new( + ErrorKind::Unexpected, + "Parquet file metadata does not contain a column index", + )); + }; + + let Some(offset_index) = parquet_metadata.offset_index() else { + return Err(Error::new( + ErrorKind::Unexpected, + "Parquet file metadata does not contain an offset index", + )); + }; + + // If all row groups were filtered out, return an empty RowSelection (select no rows) + if let Some(selected_row_groups) = selected_row_groups + && selected_row_groups.is_empty() + { + return Ok(RowSelection::from(Vec::new())); + } + + let mut selected_row_groups_idx = 0; + + let page_index = column_index + .iter() + .enumerate() + .zip(offset_index) + .zip(parquet_metadata.row_groups()); + + let mut results = Vec::new(); + for (((idx, column_index), offset_index), row_group_metadata) in page_index { + if let Some(selected_row_groups) = selected_row_groups { + // skip row groups that aren't present in selected_row_groups + if idx == selected_row_groups[selected_row_groups_idx] { + selected_row_groups_idx += 1; + } else { + continue; + } + } + + let selections_for_page = PageIndexEvaluator::eval( + predicate, + column_index, + offset_index, + row_group_metadata, + field_id_map, + snapshot_schema, + )?; + + results.push(selections_for_page); + + if let Some(selected_row_groups) = selected_row_groups + && selected_row_groups_idx == selected_row_groups.len() + { + break; + } + } + + Ok(results.into_iter().flatten().collect::>().into()) + } + + /// Filters row groups by byte range to support Iceberg's file splitting. 
+ /// + /// Iceberg splits large files at row group boundaries, so we only read row groups + /// whose byte ranges overlap with [start, start+length). + pub(super) fn filter_row_groups_by_byte_range( + parquet_metadata: &Arc, + start: u64, + length: u64, + ) -> Result> { + let row_groups = parquet_metadata.row_groups(); + let mut selected = Vec::new(); + let end = start + length; + + // Row groups are stored sequentially after the 4-byte magic header. + let mut current_byte_offset = 4u64; + + for (idx, row_group) in row_groups.iter().enumerate() { + let row_group_size = row_group.compressed_size() as u64; + let row_group_end = current_byte_offset + row_group_size; + + if current_byte_offset < end && start < row_group_end { + selected.push(idx); + } + + current_byte_offset = row_group_end; + } + + Ok(selected) + } +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::fs::File; + use std::sync::Arc; + + use arrow_array::cast::AsArray; + use arrow_array::{ArrayRef, LargeStringArray, RecordBatch, StringArray}; + use arrow_schema::{DataType, Field, Schema as ArrowSchema}; + use futures::TryStreamExt; + use parquet::arrow::{ArrowWriter, PARQUET_FIELD_ID_META_KEY}; + use parquet::basic::Compression; + use parquet::file::properties::WriterProperties; + use tempfile::TempDir; + + use crate::arrow::{ArrowReader, ArrowReaderBuilder}; + use crate::expr::{Bind, Predicate, Reference}; + use crate::io::FileIO; + use crate::scan::{FileScanTask, FileScanTaskStream}; + use crate::spec::{DataFileFormat, Datum, NestedField, PrimitiveType, Schema, SchemaRef, Type}; + + async fn test_perform_read( + predicate: Predicate, + schema: SchemaRef, + table_location: String, + reader: ArrowReader, + ) -> Vec> { + let tasks = Box::pin(futures::stream::iter( + vec![Ok(FileScanTask { + file_size_in_bytes: std::fs::metadata(format!("{table_location}/1.parquet")) + .unwrap() + .len(), + start: 0, + length: 0, + record_count: None, + data_file_path: 
format!("{table_location}/1.parquet"), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1], + predicate: Some(predicate.bind(schema, true).unwrap()), + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + })] + .into_iter(), + )) as FileScanTaskStream; + + let result = reader + .read(tasks) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + result[0].columns()[0] + .as_string_opt::() + .unwrap() + .iter() + .map(|v| v.map(ToOwned::to_owned)) + .collect::>() + } + + fn setup_kleene_logic( + data_for_col_a: Vec>, + col_a_type: DataType, + ) -> (FileIO, SchemaRef, String, TempDir) { + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::optional(1, "a", Type::Primitive(PrimitiveType::String)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("a", col_a_type.clone(), true).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + + let file_io = FileIO::new_with_fs(); + + let col = match col_a_type { + DataType::Utf8 => Arc::new(StringArray::from(data_for_col_a)) as ArrayRef, + DataType::LargeUtf8 => Arc::new(LargeStringArray::from(data_for_col_a)) as ArrayRef, + _ => panic!("unexpected col_a_type"), + }; + + let to_write = RecordBatch::try_new(arrow_schema.clone(), vec![col]).unwrap(); + + // Write the Parquet files + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .build(); + + let file = File::create(format!("{table_location}/1.parquet")).unwrap(); + let mut writer = + ArrowWriter::try_new(file, to_write.schema(), Some(props.clone())).unwrap(); + + writer.write(&to_write).expect("Writing batch"); + + // writer must be closed to write footer + 
writer.close().unwrap(); + + (file_io, schema, table_location, tmp_dir) + } + + #[tokio::test] + async fn test_kleene_logic_or_behaviour() { + // a IS NULL OR a = 'foo' + let predicate = Reference::new("a") + .is_null() + .or(Reference::new("a").equal_to(Datum::string("foo"))); + + // Table data: [NULL, "foo", "bar"] + let data_for_col_a = vec![None, Some("foo".to_string()), Some("bar".to_string())]; + + // Expected: [NULL, "foo"]. + let expected = vec![None, Some("foo".to_string())]; + + let (file_io, schema, table_location, _temp_dir) = + setup_kleene_logic(data_for_col_a, DataType::Utf8); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let result_data = test_perform_read(predicate, schema, table_location, reader).await; + + assert_eq!(result_data, expected); + } + + #[tokio::test] + async fn test_kleene_logic_and_behaviour() { + // a IS NOT NULL AND a != 'foo' + let predicate = Reference::new("a") + .is_not_null() + .and(Reference::new("a").not_equal_to(Datum::string("foo"))); + + // Table data: [NULL, "foo", "bar"] + let data_for_col_a = vec![None, Some("foo".to_string()), Some("bar".to_string())]; + + // Expected: ["bar"]. 
+ let expected = vec![Some("bar".to_string())]; + + let (file_io, schema, table_location, _temp_dir) = + setup_kleene_logic(data_for_col_a, DataType::Utf8); + let reader = ArrowReaderBuilder::new(file_io).build(); + + let result_data = test_perform_read(predicate, schema, table_location, reader).await; + + assert_eq!(result_data, expected); + } + + #[tokio::test] + async fn test_predicate_cast_literal() { + let predicates = vec![ + // a == 'foo' + (Reference::new("a").equal_to(Datum::string("foo")), vec![ + Some("foo".to_string()), + ]), + // a != 'foo' + ( + Reference::new("a").not_equal_to(Datum::string("foo")), + vec![Some("bar".to_string())], + ), + // STARTS_WITH(a, 'f') + (Reference::new("a").starts_with(Datum::string("f")), vec![ + Some("foo".to_string()), + ]), + // NOT STARTS_WITH(a, 'f') + ( + Reference::new("a").not_starts_with(Datum::string("f")), + vec![Some("bar".to_string())], + ), + // a < 'foo' + (Reference::new("a").less_than(Datum::string("foo")), vec![ + Some("bar".to_string()), + ]), + // a <= 'foo' + ( + Reference::new("a").less_than_or_equal_to(Datum::string("foo")), + vec![Some("foo".to_string()), Some("bar".to_string())], + ), + // a > 'bar' + ( + Reference::new("a").greater_than(Datum::string("bar")), + vec![Some("foo".to_string())], + ), + // a >= 'foo' + ( + Reference::new("a").greater_than_or_equal_to(Datum::string("foo")), + vec![Some("foo".to_string())], + ), + // a IN ('foo', 'baz') + ( + Reference::new("a").is_in([Datum::string("foo"), Datum::string("baz")]), + vec![Some("foo".to_string())], + ), + // a NOT IN ('foo', 'baz') + ( + Reference::new("a").is_not_in([Datum::string("foo"), Datum::string("baz")]), + vec![Some("bar".to_string())], + ), + ]; + + // Table data: ["foo", "bar"] + let data_for_col_a = vec![Some("foo".to_string()), Some("bar".to_string())]; + + let (file_io, schema, table_location, _temp_dir) = + setup_kleene_logic(data_for_col_a, DataType::LargeUtf8); + let reader = ArrowReaderBuilder::new(file_io).build(); + 
+ for (predicate, expected) in predicates { + println!("testing predicate {predicate}"); + let result_data = test_perform_read( + predicate.clone(), + schema.clone(), + table_location.clone(), + reader.clone(), + ) + .await; + + assert_eq!(result_data, expected, "predicate={predicate}"); + } + } + + /// Verifies that file splits respect byte ranges and only read specific row groups. + #[tokio::test] + async fn test_file_splits_respect_byte_ranges() { + use arrow_array::Int32Array; + use parquet::file::reader::{FileReader, SerializedFileReader}; + + let schema = Arc::new( + Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + NestedField::required(1, "id", Type::Primitive(PrimitiveType::Int)).into(), + ]) + .build() + .unwrap(), + ); + + let arrow_schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false).with_metadata(HashMap::from([( + PARQUET_FIELD_ID_META_KEY.to_string(), + "1".to_string(), + )])), + ])); + + let tmp_dir = TempDir::new().unwrap(); + let table_location = tmp_dir.path().to_str().unwrap().to_string(); + let file_path = format!("{table_location}/multi_row_group.parquet"); + + // Force each batch into its own row group for testing byte range filtering. 
+ let batch1 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( + (0..100).collect::>(), + ))]) + .unwrap(); + let batch2 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( + (100..200).collect::>(), + ))]) + .unwrap(); + let batch3 = RecordBatch::try_new(arrow_schema.clone(), vec![Arc::new(Int32Array::from( + (200..300).collect::>(), + ))]) + .unwrap(); + + let props = WriterProperties::builder() + .set_compression(Compression::SNAPPY) + .set_max_row_group_row_count(Some(100)) + .build(); + + let file = File::create(&file_path).unwrap(); + let mut writer = ArrowWriter::try_new(file, arrow_schema.clone(), Some(props)).unwrap(); + writer.write(&batch1).expect("Writing batch 1"); + writer.write(&batch2).expect("Writing batch 2"); + writer.write(&batch3).expect("Writing batch 3"); + writer.close().unwrap(); + + // Read the file metadata to get row group byte positions + let file = File::open(&file_path).unwrap(); + let reader = SerializedFileReader::new(file).unwrap(); + let metadata = reader.metadata(); + + println!("File has {} row groups", metadata.num_row_groups()); + assert_eq!(metadata.num_row_groups(), 3, "Expected 3 row groups"); + + // Get byte positions for each row group + let row_group_0 = metadata.row_group(0); + let row_group_1 = metadata.row_group(1); + let row_group_2 = metadata.row_group(2); + + let rg0_start = 4u64; // Parquet files start with 4-byte magic "PAR1" + let rg1_start = rg0_start + row_group_0.compressed_size() as u64; + let rg2_start = rg1_start + row_group_1.compressed_size() as u64; + let file_end = rg2_start + row_group_2.compressed_size() as u64; + + println!( + "Row group 0: {} rows, starts at byte {}, {} bytes compressed", + row_group_0.num_rows(), + rg0_start, + row_group_0.compressed_size() + ); + println!( + "Row group 1: {} rows, starts at byte {}, {} bytes compressed", + row_group_1.num_rows(), + rg1_start, + row_group_1.compressed_size() + ); + println!( + "Row group 2: {} 
rows, starts at byte {}, {} bytes compressed", + row_group_2.num_rows(), + rg2_start, + row_group_2.compressed_size() + ); + + let file_io = FileIO::new_with_fs(); + let reader = ArrowReaderBuilder::new(file_io).build(); + + // Task 1: read only the first row group + let task1 = FileScanTask { + file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), + start: rg0_start, + length: row_group_0.compressed_size() as u64, + record_count: Some(100), + data_file_path: file_path.clone(), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + // Task 2: read the second and third row groups + let task2 = FileScanTask { + file_size_in_bytes: std::fs::metadata(&file_path).unwrap().len(), + start: rg1_start, + length: file_end - rg1_start, + record_count: Some(200), + data_file_path: file_path.clone(), + data_file_format: DataFileFormat::Parquet, + schema: schema.clone(), + project_field_ids: vec![1], + predicate: None, + deletes: vec![], + partition: None, + partition_spec: None, + name_mapping: None, + case_sensitive: false, + }; + + let tasks1 = Box::pin(futures::stream::iter(vec![Ok(task1)])) as FileScanTaskStream; + let result1 = reader + .clone() + .read(tasks1) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + let total_rows_task1: usize = result1.iter().map(|b| b.num_rows()).sum(); + println!( + "Task 1 (bytes {}-{}) returned {} rows", + rg0_start, + rg0_start + row_group_0.compressed_size() as u64, + total_rows_task1 + ); + + let tasks2 = Box::pin(futures::stream::iter(vec![Ok(task2)])) as FileScanTaskStream; + let result2 = reader + .read(tasks2) + .unwrap() + .stream() + .try_collect::>() + .await + .unwrap(); + + let total_rows_task2: usize = result2.iter().map(|b| b.num_rows()).sum(); + println!("Task 2 (bytes {rg1_start}-{file_end}) returned 
{total_rows_task2} rows"); + + assert_eq!( + total_rows_task1, 100, + "Task 1 should read only the first row group (100 rows), but got {total_rows_task1} rows" + ); + + assert_eq!( + total_rows_task2, 200, + "Task 2 should read only the second+third row groups (200 rows), but got {total_rows_task2} rows" + ); + + // Verify the actual data values are correct (not just the row count) + if total_rows_task1 > 0 { + let first_batch = &result1[0]; + let id_col = first_batch + .column(0) + .as_primitive::(); + let first_val = id_col.value(0); + let last_val = id_col.value(id_col.len() - 1); + println!("Task 1 data range: {first_val} to {last_val}"); + + assert_eq!(first_val, 0, "Task 1 should start with id=0"); + assert_eq!(last_val, 99, "Task 1 should end with id=99"); + } + + if total_rows_task2 > 0 { + let first_batch = &result2[0]; + let id_col = first_batch + .column(0) + .as_primitive::(); + let first_val = id_col.value(0); + println!("Task 2 first value: {first_val}"); + + assert_eq!(first_val, 100, "Task 2 should start with id=100, not id=0"); + } + } +} diff --git a/crates/iceberg/src/arrow/scan_metrics.rs b/crates/iceberg/src/arrow/scan_metrics.rs new file mode 100644 index 0000000000..642190c57d --- /dev/null +++ b/crates/iceberg/src/arrow/scan_metrics.rs @@ -0,0 +1,96 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Scan metrics and I/O counting for Parquet data file reads. + +use std::ops::Range; +use std::sync::Arc; +use std::sync::atomic::{AtomicU64, Ordering}; + +use bytes::Bytes; + +use crate::error::Result; +use crate::io::FileRead; +use crate::scan::ArrowRecordBatchStream; + +/// Wraps a [`FileRead`] to count bytes read via a shared atomic counter. +pub(crate) struct CountingFileRead { + inner: F, + bytes_read: Arc, +} + +impl CountingFileRead { + pub(crate) fn new(inner: F, bytes_read: Arc) -> Self { + Self { inner, bytes_read } + } +} + +#[async_trait::async_trait] +impl FileRead for CountingFileRead { + async fn read(&self, range: Range) -> Result { + debug_assert!(range.end >= range.start); + self.bytes_read + .fetch_add(range.end - range.start, Ordering::Relaxed); + self.inner.read(range).await + } +} + +/// Metrics collected during an Iceberg scan. +#[derive(Clone, Debug)] +pub struct ScanMetrics { + bytes_read: Arc, +} + +impl ScanMetrics { + pub(crate) fn new() -> Self { + Self { + bytes_read: Arc::new(AtomicU64::new(0)), + } + } + + pub(crate) fn bytes_read_counter(&self) -> &Arc { + &self.bytes_read + } + + /// Total bytes read from storage during this scan, including data files and delete files. + pub fn bytes_read(&self) -> u64 { + self.bytes_read.load(Ordering::Relaxed) + } +} + +/// Result of [`ArrowReader::read`](super::ArrowReader::read), containing the +/// record batch stream and metrics collected during the scan. 
+pub struct ScanResult { + stream: ArrowRecordBatchStream, + metrics: ScanMetrics, +} + +impl ScanResult { + pub(crate) fn new(stream: ArrowRecordBatchStream, metrics: ScanMetrics) -> Self { + Self { stream, metrics } + } + + /// Consumes the result, returning only the record batch stream. + pub fn stream(self) -> ArrowRecordBatchStream { + self.stream + } + + /// Returns a reference to the scan metrics. + pub fn metrics(&self) -> &ScanMetrics { + &self.metrics + } +} diff --git a/crates/iceberg/src/arrow/schema.rs b/crates/iceberg/src/arrow/schema.rs index 165717f6a0..88d4a07a39 100644 --- a/crates/iceberg/src/arrow/schema.rs +++ b/crates/iceberg/src/arrow/schema.rs @@ -199,7 +199,10 @@ fn visit_struct(fields: &Fields, visitor: &mut V) -> Resu } /// Visit schema in post order. -fn visit_schema(schema: &ArrowSchema, visitor: &mut V) -> Result { +pub(crate) fn visit_schema( + schema: &ArrowSchema, + visitor: &mut V, +) -> Result { let mut results = Vec::with_capacity(schema.fields().len()); for field in schema.fields() { visitor.before_field(field)?; @@ -759,6 +762,11 @@ pub(crate) fn get_arrow_datum(datum: &Datum) -> Result { + let array = FixedSizeBinaryArray::try_from_iter(std::iter::once(value.as_slice())) + .map_err(|e| Error::new(ErrorKind::DataInvalid, e.to_string()))?; + Ok(Arc::new(Scalar::new(array))) + } (primitive_type, _) => Err(Error::new( ErrorKind::FeatureUnsupported, @@ -2177,6 +2185,18 @@ mod tests { assert!(is_scalar); assert_eq!(array.value(0), [66u8; 16]); } + { + let datum = Datum::fixed(vec![1u8, 2, 3, 4, 5, 6, 7, 8]); + let arrow_datum = get_arrow_datum(&datum).unwrap(); + let (array, is_scalar) = arrow_datum.get(); + let array = array + .as_any() + .downcast_ref::() + .unwrap(); + assert!(is_scalar); + assert_eq!(array.value_length(), 8); + assert_eq!(array.value(0), &[1u8, 2, 3, 4, 5, 6, 7, 8]); + } } #[test] diff --git a/crates/iceberg/src/catalog/mod.rs b/crates/iceberg/src/catalog/mod.rs index f296cf2260..43102adec9 100644 --- 
a/crates/iceberg/src/catalog/mod.rs +++ b/crates/iceberg/src/catalog/mod.rs @@ -144,7 +144,6 @@ pub trait CatalogBuilder: Default + Debug + Send + Sync { /// /// let catalog = MyCatalogBuilder::default() /// .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - /// configured_scheme: "s3a".to_string(), /// customized_credential_load: None, /// })) /// .load("my_catalog", props) diff --git a/crates/iceberg/src/encryption/crypto.rs b/crates/iceberg/src/encryption/crypto.rs index 0b34580db8..0f6a9eff43 100644 --- a/crates/iceberg/src/encryption/crypto.rs +++ b/crates/iceberg/src/encryption/crypto.rs @@ -43,7 +43,7 @@ use crate::{Error, ErrorKind, Result}; /// containing `SensitiveBytes` can safely derive or implement `Debug` /// without risk of leaking key material. #[derive(Clone, PartialEq, Eq)] -struct SensitiveBytes(Zeroizing>); +pub struct SensitiveBytes(Zeroizing>); impl SensitiveBytes { /// Wraps the given bytes as sensitive material. @@ -57,13 +57,11 @@ impl SensitiveBytes { } /// Returns the number of bytes. - #[allow(dead_code)] // Encryption work is ongoing so currently unused pub fn len(&self) -> usize { self.0.len() } /// Returns `true` if the byte slice is empty. - #[allow(dead_code)] // Encryption work is ongoing so currently unused pub fn is_empty(&self) -> bool { self.0.is_empty() } @@ -85,9 +83,10 @@ impl fmt::Display for SensitiveBytes { /// /// The Iceberg spec supports 128, 192, and 256-bit keys for AES-GCM. /// See: -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)] pub enum AesKeySize { - /// 128-bit AES key (16 bytes) + /// 128-bit AES key (16 bytes). Default per the Iceberg spec. 
+ #[default] Bits128 = 128, /// 192-bit AES key (24 bytes) Bits192 = 192, diff --git a/crates/iceberg/src/encryption/file_decryptor.rs b/crates/iceberg/src/encryption/file_decryptor.rs new file mode 100644 index 0000000000..e44c0e1d78 --- /dev/null +++ b/crates/iceberg/src/encryption/file_decryptor.rs @@ -0,0 +1,156 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! File-level decryption helper for AGS1 stream-encrypted files. + +use std::fmt; +use std::sync::Arc; + +use super::crypto::{AesGcmCipher, SecureKey}; +use super::stream::AesGcmFileRead; +use crate::Result; +use crate::io::FileRead; + +/// Holds the decryption material for a single encrypted file. +/// +/// Created from a plaintext DEK and AAD prefix, then used to wrap +/// an encrypted file reader for transparent decryption on read. +pub struct AesGcmFileDecryptor { + cipher: Arc, + aad_prefix: Box<[u8]>, +} + +impl AesGcmFileDecryptor { + /// Creates a new `AesGcmFileDecryptor` from a plaintext DEK and AAD prefix. 
+ pub fn new(dek: &[u8], aad_prefix: impl Into>) -> Result { + let key = SecureKey::new(dek)?; + let cipher = Arc::new(AesGcmCipher::new(key)); + Ok(Self { + cipher, + aad_prefix: aad_prefix.into(), + }) + } + + /// Wraps a raw encrypted-file reader in a decrypting [`AesGcmFileRead`]. + pub fn wrap_reader( + &self, + reader: Box, + encrypted_file_length: u64, + ) -> Result> { + let decrypting = AesGcmFileRead::new( + reader, + Arc::clone(&self.cipher), + self.aad_prefix.clone(), + encrypted_file_length, + )?; + Ok(Box::new(decrypting)) + } + + /// Calculates the plaintext length from an encrypted file's total length. + pub fn plaintext_length(&self, encrypted_file_length: u64) -> Result { + AesGcmFileRead::calculate_plaintext_length(encrypted_file_length) + } +} + +impl fmt::Debug for AesGcmFileDecryptor { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("AesGcmFileDecryptor") + .field("aad_prefix_len", &self.aad_prefix.len()) + .finish_non_exhaustive() + } +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use bytes::Bytes; + + use super::*; + use crate::encryption::AesGcmFileEncryptor; + use crate::io::FileWrite; + + struct MemoryFileRead(Bytes); + + #[async_trait::async_trait] + impl FileRead for MemoryFileRead { + async fn read(&self, range: Range) -> Result { + Ok(self.0.slice(range.start as usize..range.end as usize)) + } + } + + struct MemoryFileWrite { + buffer: std::sync::Arc>>, + } + + #[async_trait::async_trait] + impl FileWrite for MemoryFileWrite { + async fn write(&mut self, bs: Bytes) -> Result<()> { + self.buffer.lock().unwrap().extend_from_slice(&bs); + Ok(()) + } + + async fn close(&mut self) -> Result<()> { + Ok(()) + } + } + + #[tokio::test] + async fn test_wrap_reader_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + let plaintext = b"Hello from file decryptor!"; + + // Encrypt via the encryptor wrapper + let encryptor = AesGcmFileEncryptor::new(key.as_slice(), 
aad_prefix.as_slice()).unwrap(); + let buffer = std::sync::Arc::new(std::sync::Mutex::new(Vec::new())); + let mut writer = encryptor.wrap_writer(Box::new(MemoryFileWrite { + buffer: buffer.clone(), + })); + writer.write(Bytes::from(plaintext.to_vec())).await.unwrap(); + writer.close().await.unwrap(); + let encrypted = buffer.lock().unwrap().clone(); + let encrypted_len = encrypted.len() as u64; + + // Decrypt via the decryptor wrapper + let decryptor = AesGcmFileDecryptor::new(key.as_slice(), aad_prefix.as_slice()).unwrap(); + let reader = decryptor + .wrap_reader( + Box::new(MemoryFileRead(Bytes::from(encrypted))), + encrypted_len, + ) + .unwrap(); + + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], plaintext); + } + + #[tokio::test] + async fn test_invalid_key_length() { + let result = AesGcmFileDecryptor::new(b"too-short", b"aad".as_slice()); + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_plaintext_length() { + let decryptor = AesGcmFileDecryptor::new(b"0123456789abcdef", b"aad".as_slice()).unwrap(); + // header(8) + nonce(12) + 10 bytes ciphertext + tag(16) = 46 + let encrypted_len = 8 + 12 + 10 + 16; + let plain_len = decryptor.plaintext_length(encrypted_len).unwrap(); + assert_eq!(plain_len, 10); + } +} diff --git a/crates/iceberg/src/encryption/file_encryptor.rs b/crates/iceberg/src/encryption/file_encryptor.rs new file mode 100644 index 0000000000..773438ad80 --- /dev/null +++ b/crates/iceberg/src/encryption/file_encryptor.rs @@ -0,0 +1,138 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! File-level encryption helper for AGS1 stream-encrypted files. + +use std::fmt; +use std::sync::Arc; + +use super::crypto::{AesGcmCipher, SecureKey}; +use super::stream::AesGcmFileWrite; +use crate::Result; +use crate::io::FileWrite; + +/// Holds the encryption material for a single encrypted file. +/// +/// This is the write-side counterpart to +/// [`AesGcmFileDecryptor`](super::AesGcmFileDecryptor). Created from +/// a plaintext DEK and AAD prefix, then used to wrap an output writer +/// for transparent encryption on write. +pub struct AesGcmFileEncryptor { + cipher: Arc, + aad_prefix: Box<[u8]>, +} + +impl AesGcmFileEncryptor { + /// Creates a new `AesGcmFileEncryptor` from a plaintext DEK and AAD prefix. + pub fn new(dek: &[u8], aad_prefix: impl Into>) -> Result { + let key = SecureKey::new(dek)?; + let cipher = Arc::new(AesGcmCipher::new(key)); + Ok(Self { + cipher, + aad_prefix: aad_prefix.into(), + }) + } + + /// Wraps a raw output writer in an encrypting [`AesGcmFileWrite`]. 
+ pub fn wrap_writer(&self, writer: Box) -> Box { + Box::new(AesGcmFileWrite::new( + writer, + Arc::clone(&self.cipher), + self.aad_prefix.clone(), + )) + } +} + +impl fmt::Debug for AesGcmFileEncryptor { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("AesGcmFileEncryptor") + .field("aad_prefix_len", &self.aad_prefix.len()) + .finish_non_exhaustive() + } +} + +#[cfg(test)] +mod tests { + use std::ops::Range; + + use bytes::Bytes; + + use super::*; + use crate::encryption::AesGcmFileDecryptor; + use crate::io::FileRead; + + struct MemoryFileRead(Bytes); + + #[async_trait::async_trait] + impl FileRead for MemoryFileRead { + async fn read(&self, range: Range) -> Result { + Ok(self.0.slice(range.start as usize..range.end as usize)) + } + } + + struct MemoryFileWrite { + buffer: std::sync::Arc>>, + } + + #[async_trait::async_trait] + impl FileWrite for MemoryFileWrite { + async fn write(&mut self, bs: Bytes) -> Result<()> { + self.buffer.lock().unwrap().extend_from_slice(&bs); + Ok(()) + } + + async fn close(&mut self) -> Result<()> { + Ok(()) + } + } + + #[tokio::test] + async fn test_wrap_writer_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + let plaintext = b"Hello from file encryptor!"; + + // Encrypt via the encryptor wrapper + let encryptor = AesGcmFileEncryptor::new(key.as_slice(), aad_prefix.as_slice()).unwrap(); + let buffer = std::sync::Arc::new(std::sync::Mutex::new(Vec::new())); + let mut writer = encryptor.wrap_writer(Box::new(MemoryFileWrite { + buffer: buffer.clone(), + })); + writer.write(Bytes::from(plaintext.to_vec())).await.unwrap(); + writer.close().await.unwrap(); + let encrypted = buffer.lock().unwrap().clone(); + let encrypted_len = encrypted.len() as u64; + + // Decrypt via the decryptor wrapper + let decryptor = AesGcmFileDecryptor::new(key.as_slice(), aad_prefix.as_slice()).unwrap(); + let reader = decryptor + .wrap_reader( + 
Box::new(MemoryFileRead(Bytes::from(encrypted))), + encrypted_len, + ) + .unwrap(); + + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], plaintext); + } + + #[tokio::test] + async fn test_invalid_key_length() { + let result = AesGcmFileEncryptor::new(b"bad-key", b"aad".as_slice()); + assert!(result.is_err()); + } +} diff --git a/crates/iceberg/src/encryption/key_metadata.rs b/crates/iceberg/src/encryption/key_metadata.rs new file mode 100644 index 0000000000..4ef66ce394 --- /dev/null +++ b/crates/iceberg/src/encryption/key_metadata.rs @@ -0,0 +1,286 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Avro-serialized key metadata format compatible with Java's +//! `org.apache.iceberg.encryption.StandardKeyMetadata`. + +use std::fmt; + +use super::SensitiveBytes; +use crate::{Error, ErrorKind, Result}; + +/// Standard key metadata for Iceberg table encryption. +/// +/// Contains the Data Encryption Key (DEK), AAD prefix, and optional file +/// length. Byte-compatible with Java's `StandardKeyMetadata` via Avro +/// serialization. 
+/// +/// Wire format: `[version byte (0x01)] [Avro binary datum]` +#[derive(Clone, PartialEq, Eq)] +pub struct StandardKeyMetadata { + encryption_key: SensitiveBytes, + aad_prefix: Option>, + file_length: Option, +} + +impl fmt::Debug for StandardKeyMetadata { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("StandardKeyMetadata") + .field("encryption_key", &self.encryption_key) + .field( + "aad_prefix", + &self + .aad_prefix + .as_ref() + .map(|b| format!("[{} bytes]", b.len())), + ) + .field("file_length", &self.file_length) + .finish() + } +} + +impl StandardKeyMetadata { + /// Creates a new `StandardKeyMetadata`. + pub fn new(encryption_key: &[u8]) -> Self { + Self { + encryption_key: SensitiveBytes::new(encryption_key), + aad_prefix: None, + file_length: None, + } + } + + /// Adds an AAD prefix. + pub fn with_aad_prefix(mut self, aad_prefix: &[u8]) -> Self { + self.aad_prefix = Some(aad_prefix.into()); + self + } + + /// Adds a file length. + pub fn with_file_length(mut self, length: u64) -> Self { + self.file_length = Some(length); + self + } + + /// Returns the plaintext Data Encryption Key. + pub fn encryption_key(&self) -> &SensitiveBytes { + &self.encryption_key + } + + /// Returns the AAD prefix. + pub fn aad_prefix(&self) -> Option<&[u8]> { + self.aad_prefix.as_deref() + } + + /// Returns the optional file length. + pub fn file_length(&self) -> Option { + self.file_length + } + + /// Encodes to Java-compatible format: `[0x01] [Avro binary datum]` + pub fn encode(&self) -> Result> { + _serde::StandardKeyMetadataV1::from(self).encode() + } + + /// Decodes from Java-compatible format. 
+ pub fn decode(bytes: &[u8]) -> Result { + _serde::StandardKeyMetadataV1::decode(bytes).map(Self::from) + } +} + +mod _serde { + use std::io::Cursor; + use std::sync::{Arc, LazyLock}; + + use apache_avro::{Schema as AvroSchema, from_avro_datum, from_value, to_avro_datum, to_value}; + use serde::{Deserialize, Serialize}; + + use super::*; + use crate::avro::schema_to_avro_schema; + use crate::spec::{NestedField, PrimitiveType, Schema, Type}; + + pub(super) const V1: u8 = 1; + + /// Avro schema for StandardKeyMetadata V1, derived from Iceberg schema. + pub(super) static AVRO_SCHEMA_V1: LazyLock = LazyLock::new(|| { + let schema = Schema::builder() + .with_fields(vec![ + Arc::new(NestedField::required( + 0, + "encryption_key", + Type::Primitive(PrimitiveType::Binary), + )), + Arc::new(NestedField::optional( + 1, + "aad_prefix", + Type::Primitive(PrimitiveType::Binary), + )), + Arc::new(NestedField::optional( + 2, + "file_length", + Type::Primitive(PrimitiveType::Long), + )), + ]) + .build() + .expect("Failed to build StandardKeyMetadata Iceberg schema"); + + schema_to_avro_schema("StandardKeyMetadata", &schema) + .expect("Failed to convert StandardKeyMetadata to Avro schema") + }); + + /// Serde struct for Avro serialization of [`StandardKeyMetadata`] V1. + /// Field names must match [`AVRO_SCHEMA_V1`] exactly. 
+ #[derive(Serialize, Deserialize)] + pub(super) struct StandardKeyMetadataV1 { + pub encryption_key: serde_bytes::ByteBuf, + pub aad_prefix: Option, + pub file_length: Option, + } + + impl StandardKeyMetadataV1 { + pub(super) fn encode(&self) -> Result> { + let value = to_value(self) + .and_then(|v| v.resolve(&AVRO_SCHEMA_V1)) + .map_err(|e| { + Error::new(ErrorKind::Unexpected, "Failed to encode key metadata") + .with_source(e) + })?; + + let datum = to_avro_datum(&AVRO_SCHEMA_V1, value).map_err(|e| { + Error::new(ErrorKind::Unexpected, "Failed to encode key metadata").with_source(e) + })?; + + let mut result = Vec::with_capacity(1 + datum.len()); + result.push(V1); + result.extend_from_slice(&datum); + Ok(result.into_boxed_slice()) + } + + pub(super) fn decode(bytes: &[u8]) -> Result { + if bytes.is_empty() { + return Err(Error::new( + ErrorKind::DataInvalid, + "Empty key metadata buffer", + )); + } + + let version = bytes[0]; + if version != V1 { + return Err(Error::new( + ErrorKind::FeatureUnsupported, + format!("Cannot resolve schema for version: {version}"), + )); + } + + let mut reader = Cursor::new(&bytes[1..]); + let value = from_avro_datum(&AVRO_SCHEMA_V1, &mut reader, None).map_err(|e| { + Error::new(ErrorKind::DataInvalid, "Failed to decode key metadata").with_source(e) + })?; + + from_value(&value).map_err(|e| { + Error::new( + ErrorKind::DataInvalid, + "Failed to decode key metadata fields", + ) + .with_source(e) + }) + } + } + + impl From<&StandardKeyMetadata> for StandardKeyMetadataV1 { + fn from(metadata: &StandardKeyMetadata) -> Self { + Self { + encryption_key: serde_bytes::ByteBuf::from(metadata.encryption_key.as_bytes()), + aad_prefix: metadata + .aad_prefix + .as_ref() + .map(|b| serde_bytes::ByteBuf::from(b.as_ref())), + file_length: metadata.file_length, + } + } + } + + impl From for StandardKeyMetadata { + fn from(v1: StandardKeyMetadataV1) -> Self { + Self { + encryption_key: SensitiveBytes::new(v1.encryption_key.into_vec()), + 
aad_prefix: v1.aad_prefix.map(|b| b.into_vec().into_boxed_slice()), + file_length: v1.file_length, + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_roundtrip() { + let key = b"0123456789012345"; + let aad = b"1234567890123456"; + + let metadata = StandardKeyMetadata::new(key).with_aad_prefix(aad); + let serialized = metadata.encode().unwrap(); + let parsed = StandardKeyMetadata::decode(&serialized).unwrap(); + + assert_eq!(parsed.encryption_key().as_bytes(), key); + assert_eq!(parsed.aad_prefix(), Some(aad.as_slice())); + assert_eq!(parsed.file_length(), None); + } + + #[test] + fn test_roundtrip_with_length() { + let key = b"0123456789012345"; + let aad = b"1234567890123456"; + + let file_length = 100_000; + let metadata = StandardKeyMetadata::new(key) + .with_aad_prefix(aad) + .with_file_length(file_length); + let serialized = metadata.encode().unwrap(); + let parsed = StandardKeyMetadata::decode(&serialized).unwrap(); + + assert_eq!(parsed.encryption_key().as_bytes(), key); + assert_eq!(parsed.aad_prefix(), Some(aad.as_slice())); + assert_eq!(parsed.file_length(), Some(file_length)); + } + + #[test] + fn test_unsupported_version() { + let result = StandardKeyMetadata::decode(&[0x02]); + assert!(result.is_err()); + let err = result.unwrap_err(); + assert_eq!(err.kind(), ErrorKind::FeatureUnsupported); + } + + #[test] + fn test_empty_buffer() { + let result = StandardKeyMetadata::decode(&[]); + assert!(result.is_err()); + assert_eq!(result.unwrap_err().kind(), ErrorKind::DataInvalid); + } + + #[test] + fn test_roundtrip_without_aad() { + let metadata = StandardKeyMetadata::new(&[1, 2, 3, 4]); + let serialized = metadata.encode().unwrap(); + let parsed = StandardKeyMetadata::decode(&serialized).unwrap(); + + assert_eq!(parsed.encryption_key().as_bytes(), &[1, 2, 3, 4]); + assert_eq!(parsed.aad_prefix(), None); + } +} diff --git a/crates/iceberg/src/encryption/kms/client.rs b/crates/iceberg/src/encryption/kms/client.rs new file mode 
100644 index 0000000000..85cd511758 --- /dev/null +++ b/crates/iceberg/src/encryption/kms/client.rs @@ -0,0 +1,98 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Key management client trait for encryption key operations. +//! +//! Mirrors the Java `KeyManagementClient` interface from the Apache Iceberg spec. + +use async_trait::async_trait; + +use crate::Result; +use crate::encryption::SensitiveBytes; + +/// Result of a server-side key generation operation. +/// +/// Returned by [`KeyManagementClient::generate_key`] when the KMS supports +/// atomic key generation and wrapping. +pub struct GeneratedKey { + key: SensitiveBytes, + wrapped_key: Vec, +} + +impl GeneratedKey { + /// Creates a new `GeneratedKey` from plaintext key bytes and wrapped key bytes. + pub fn new(key: SensitiveBytes, wrapped_key: Vec) -> Self { + Self { key, wrapped_key } + } + + /// Returns the plaintext key bytes. Zeroized on drop, redacted in Debug. + pub fn key(&self) -> &SensitiveBytes { + &self.key + } + + /// Returns the wrapped (encrypted) key bytes. + pub fn wrapped_key(&self) -> &[u8] { + &self.wrapped_key + } +} + +/// Pluggable interface for key management systems (AWS KMS, Azure Key Vault, etc.). 
+#[async_trait] +pub trait KeyManagementClient: Send + Sync + std::fmt::Debug { + /// Wrap (encrypt) a key using a wrapping key managed by the KMS. + async fn wrap_key(&self, key: &[u8], wrapping_key_id: &str) -> Result>; + + /// Unwrap (decrypt) a previously wrapped key. + async fn unwrap_key(&self, wrapped_key: &[u8], wrapping_key_id: &str) + -> Result; + + /// Whether this KMS supports server-side key generation. + /// + /// If `true`, callers can use [`generate_key`](Self::generate_key) for atomic + /// key generation and wrapping, which is more secure than generating a key + /// locally and then wrapping it. + fn supports_key_generation(&self) -> bool; + + /// Generate a new key and wrap it atomically on the server side. + /// + /// This is only supported when [`supports_key_generation`](Self::supports_key_generation) + /// returns `true`. + async fn generate_key(&self, wrapping_key_id: &str) -> Result; +} + +#[async_trait] +impl + Send + Sync + std::fmt::Debug> KeyManagementClient for T { + async fn wrap_key(&self, key: &[u8], wrapping_key_id: &str) -> Result> { + self.as_ref().wrap_key(key, wrapping_key_id).await + } + + async fn unwrap_key( + &self, + wrapped_key: &[u8], + wrapping_key_id: &str, + ) -> Result { + self.as_ref().unwrap_key(wrapped_key, wrapping_key_id).await + } + + fn supports_key_generation(&self) -> bool { + self.as_ref().supports_key_generation() + } + + async fn generate_key(&self, wrapping_key_id: &str) -> Result { + self.as_ref().generate_key(wrapping_key_id).await + } +} diff --git a/crates/iceberg/src/encryption/kms/memory.rs b/crates/iceberg/src/encryption/kms/memory.rs new file mode 100644 index 0000000000..65319831dd --- /dev/null +++ b/crates/iceberg/src/encryption/kms/memory.rs @@ -0,0 +1,296 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! In-memory KMS implementation for testing and development. +//! +//! **WARNING**: This implementation is NOT suitable for production use. +//! Keys are stored in memory only and will be lost when the process exits. + +use std::collections::HashMap; +use std::fmt; +use std::sync::{Arc, RwLock}; + +use async_trait::async_trait; + +use super::KeyManagementClient; +use crate::encryption::{AesGcmCipher, AesKeySize, SecureKey, SensitiveBytes}; +use crate::error::lock_error; +use crate::{Error, ErrorKind, Result}; + +/// In-memory KMS for testing. Not suitable for production use. 
+/// +/// ``` +/// use iceberg::encryption::KeyManagementClient; +/// use iceberg::encryption::kms::MemoryKeyManagementClient; +/// +/// # async fn example() -> iceberg::Result<()> { +/// let kms = MemoryKeyManagementClient::new(); +/// kms.add_master_key("my-master-key")?; +/// +/// let dek = vec![0u8; 16]; +/// let wrapped = kms.wrap_key(&dek, "my-master-key").await?; +/// let unwrapped = kms.unwrap_key(&wrapped, "my-master-key").await?; +/// assert_eq!(dek.as_slice(), unwrapped.as_bytes()); +/// # Ok(()) +/// # } +/// ``` +#[derive(Clone, Default)] +pub struct MemoryKeyManagementClient { + master_keys: Arc>>, + master_key_size: AesKeySize, +} + +impl fmt::Debug for MemoryKeyManagementClient { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("MemoryKeyManagementClient") + .field("master_key_size", &self.master_key_size) + .field("key_count", &self.key_count()) + .finish() + } +} + +impl MemoryKeyManagementClient { + /// Creates a new in-memory KMS with 128-bit AES keys. + pub fn new() -> Self { + Self::default() + } + + /// Creates a new in-memory KMS with the specified master key size. + pub fn with_master_key_size(master_key_size: AesKeySize) -> Self { + Self { + master_keys: Arc::new(RwLock::new(HashMap::new())), + master_key_size, + } + } + + /// Adds a randomly generated master key with the given ID. + pub fn add_master_key(&self, key_id: impl Into) -> Result<()> { + let key = SecureKey::generate(self.master_key_size); + self.insert_key(key_id.into(), SensitiveBytes::new(key.as_bytes())) + } + + /// Adds a master key with explicit key bytes. + /// + /// Use this to seed the KMS with known key material, e.g. for + /// cross-language integration tests where both Java and Rust must + /// share the same master key bytes. 
+ pub fn add_master_key_bytes( + &self, + key_id: impl Into, + key_bytes: SensitiveBytes, + ) -> Result<()> { + Self::check_key_length(&key_bytes)?; + self.insert_key(key_id.into(), key_bytes) + } + + /// Check the key length is valid by constructing a SecureKey. + fn check_key_length(key_bytes: &SensitiveBytes) -> Result<()> { + SecureKey::new(key_bytes.as_bytes())?; + Ok(()) + } + + fn insert_key(&self, key_id: String, key: SensitiveBytes) -> Result<()> { + let mut keys = self.master_keys.write().map_err(lock_error)?; + + if keys.contains_key(&key_id) { + return Err(Error::new( + ErrorKind::DataInvalid, + format!("Master key already exists: {key_id}"), + )); + } + + keys.insert(key_id, key); + Ok(()) + } + + fn get_master_key(&self, key_id: &str) -> Result { + let keys = self.master_keys.read().map_err(lock_error)?; + + keys.get(key_id).cloned().ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + format!("Master key not found: {key_id}"), + ) + }) + } + + /// Number of master keys stored. + pub fn key_count(&self) -> usize { + self.master_keys.read().map(|keys| keys.len()).unwrap_or(0) + } + + /// Whether a master key with the given ID exists. 
+ pub fn has_key(&self, key_id: &str) -> bool { + self.master_keys + .read() + .map(|keys| keys.contains_key(key_id)) + .unwrap_or(false) + } +} + +#[async_trait] +impl KeyManagementClient for MemoryKeyManagementClient { + async fn wrap_key(&self, key: &[u8], wrapping_key_id: &str) -> Result> { + let master_key_bytes = self.get_master_key(wrapping_key_id)?; + let master_key = SecureKey::new(master_key_bytes.as_bytes())?; + let cipher = AesGcmCipher::new(master_key); + + cipher.encrypt(key, None) + } + + async fn unwrap_key( + &self, + wrapped_key: &[u8], + wrapping_key_id: &str, + ) -> Result { + let master_key_bytes = self.get_master_key(wrapping_key_id)?; + let master_key = SecureKey::new(master_key_bytes.as_bytes())?; + let cipher = AesGcmCipher::new(master_key); + + Ok(SensitiveBytes::new(cipher.decrypt(wrapped_key, None)?)) + } + + fn supports_key_generation(&self) -> bool { + false + } + + async fn generate_key(&self, _wrapping_key_id: &str) -> Result { + Err(Error::new( + ErrorKind::FeatureUnsupported, + "MemoryKeyManagementClient does not support server-side key generation", + )) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn test_wrap_unwrap_roundtrip() { + let kms = MemoryKeyManagementClient::new(); + kms.add_master_key("master-1").unwrap(); + let dek = vec![0u8; 16]; + + let wrapped = kms.wrap_key(&dek, "master-1").await.unwrap(); + let unwrapped = kms.unwrap_key(&wrapped, "master-1").await.unwrap(); + assert_eq!(unwrapped.as_bytes(), dek.as_slice()); + } + + #[tokio::test] + async fn test_wrap_unknown_key_fails() { + let kms = MemoryKeyManagementClient::new(); + let dek = vec![0u8; 16]; + + let result = kms.wrap_key(&dek, "nonexistent").await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_wrong_master_key_fails_unwrap() { + let kms = MemoryKeyManagementClient::new(); + kms.add_master_key("master-1").unwrap(); + kms.add_master_key("master-2").unwrap(); + let dek = vec![0u8; 16]; + + let wrapped = 
kms.wrap_key(&dek, "master-1").await.unwrap(); + + let result = kms.unwrap_key(&wrapped, "master-2").await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_does_not_support_key_generation() { + let kms = MemoryKeyManagementClient::new(); + assert!(!kms.supports_key_generation()); + + let result = kms.generate_key("master-1").await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_multiple_master_keys() { + let kms = MemoryKeyManagementClient::new(); + kms.add_master_key("master-1").unwrap(); + kms.add_master_key("master-2").unwrap(); + let dek1 = vec![1u8; 16]; + let dek2 = vec![2u8; 16]; + + let wrapped1 = kms.wrap_key(&dek1, "master-1").await.unwrap(); + let wrapped2 = kms.wrap_key(&dek2, "master-2").await.unwrap(); + + let unwrapped1 = kms.unwrap_key(&wrapped1, "master-1").await.unwrap(); + let unwrapped2 = kms.unwrap_key(&wrapped2, "master-2").await.unwrap(); + + assert_eq!(unwrapped1.as_bytes(), dek1.as_slice()); + assert_eq!(unwrapped2.as_bytes(), dek2.as_slice()); + } + + #[tokio::test] + async fn test_add_master_key() { + let kms = MemoryKeyManagementClient::new(); + + kms.add_master_key("my-key").unwrap(); + assert!(kms.has_key("my-key")); + assert_eq!(kms.key_count(), 1); + + let result = kms.add_master_key("my-key"); + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_add_master_key_bytes() { + let kms = MemoryKeyManagementClient::new(); + let key_bytes = SensitiveBytes::new([42u8; 16]); + + kms.add_master_key_bytes("my-key", key_bytes).unwrap(); + assert!(kms.has_key("my-key")); + + let dek = vec![7u8; 16]; + let wrapped = kms.wrap_key(&dek, "my-key").await.unwrap(); + let unwrapped = kms.unwrap_key(&wrapped, "my-key").await.unwrap(); + assert_eq!(unwrapped.as_bytes(), dek.as_slice()); + } + + #[tokio::test] + async fn test_add_master_key_bytes_invalid_length() { + let kms = MemoryKeyManagementClient::new(); + + let result = kms.add_master_key_bytes("my-key", SensitiveBytes::new([0u8; 7])); + 
assert!(result.is_err()); + } + + #[tokio::test] + async fn test_with_master_key_size() { + let kms = MemoryKeyManagementClient::with_master_key_size(AesKeySize::Bits256); + kms.add_master_key("master-256").unwrap(); + + let dek = vec![0u8; 16]; + let wrapped = kms.wrap_key(&dek, "master-256").await.unwrap(); + let unwrapped = kms.unwrap_key(&wrapped, "master-256").await.unwrap(); + assert_eq!(unwrapped.as_bytes(), dek.as_slice()); + } + + #[tokio::test] + async fn test_clone_shares_state() { + let kms1 = MemoryKeyManagementClient::new(); + let kms2 = kms1.clone(); + + kms1.add_master_key("shared-key").unwrap(); + assert!(kms2.has_key("shared-key")); + } +} diff --git a/crates/iceberg/src/encryption/kms/mod.rs b/crates/iceberg/src/encryption/kms/mod.rs new file mode 100644 index 0000000000..160e692550 --- /dev/null +++ b/crates/iceberg/src/encryption/kms/mod.rs @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Key Management System trait and implementations. +//! +//! This module provides the [`KeyManagementClient`] trait for pluggable KMS +//! integration and implementations for different key management systems. 
+ +mod client; +mod memory; + +pub use client::{GeneratedKey, KeyManagementClient}; +pub use memory::MemoryKeyManagementClient; diff --git a/crates/iceberg/src/encryption/mod.rs b/crates/iceberg/src/encryption/mod.rs index 097f4f24e3..773d781d6d 100644 --- a/crates/iceberg/src/encryption/mod.rs +++ b/crates/iceberg/src/encryption/mod.rs @@ -17,9 +17,19 @@ //! Encryption module for Apache Iceberg. //! -//! This module provides core cryptographic primitives for encrypting -//! and decrypting data in Iceberg tables. +//! This module provides core cryptographic primitives and key management +//! for encrypting and decrypting data in Iceberg tables. mod crypto; +mod file_decryptor; +mod file_encryptor; +pub(crate) mod key_metadata; +pub mod kms; +mod stream; -pub use crypto::{AesGcmCipher, AesKeySize, SecureKey}; +pub use crypto::{AesGcmCipher, AesKeySize, SecureKey, SensitiveBytes}; +pub use file_decryptor::AesGcmFileDecryptor; +pub use file_encryptor::AesGcmFileEncryptor; +pub use key_metadata::StandardKeyMetadata; +pub use kms::{GeneratedKey, KeyManagementClient}; +pub use stream::{AesGcmFileRead, AesGcmFileWrite}; diff --git a/crates/iceberg/src/encryption/stream.rs b/crates/iceberg/src/encryption/stream.rs new file mode 100644 index 0000000000..130578f2b1 --- /dev/null +++ b/crates/iceberg/src/encryption/stream.rs @@ -0,0 +1,1249 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! AGS1 stream encryption/decryption for Iceberg. +//! +//! Implements the block-based AES-GCM stream format used by Iceberg for +//! encrypting manifest lists and manifest files. The format is +//! byte-compatible with Java's `AesGcmInputStream` / `AesGcmOutputStream`. +//! +//! # AGS1 File Format +//! +//! ```text +//! ┌─────────────────────────────────────────────┐ +//! │ Header (8 bytes) │ +//! │ Magic: "AGS1" (4 bytes, ASCII) │ +//! │ Plain block size: u32 LE (4 bytes) │ +//! │ Default: 1,048,576 (1 MiB) │ +//! ├─────────────────────────────────────────────┤ +//! │ Block 0 │ +//! │ Nonce (12 bytes) │ +//! │ Ciphertext (up to plain_block_size bytes) │ +//! │ GCM Tag (16 bytes) │ +//! ├─────────────────────────────────────────────┤ +//! │ Block 1..N (same structure) │ +//! ├─────────────────────────────────────────────┤ +//! │ Final block (may be shorter) │ +//! └─────────────────────────────────────────────┘ +//! ``` +//! +//! Each block's AAD is: `aad_prefix || block_index (4 bytes, LE)`. + +use std::ops::Range; +use std::sync::Arc; + +use bytes::{Bytes, BytesMut}; + +use super::AesGcmCipher; +use crate::io::{FileRead, FileWrite}; +use crate::{Error, ErrorKind, Result}; + +/// Default plaintext block size (1 MiB), matching Java's `Ciphers.PLAIN_BLOCK_SIZE`. +pub const PLAIN_BLOCK_SIZE: u32 = 1024 * 1024; + +/// AES-GCM nonce length in bytes. +pub const NONCE_LENGTH: u32 = 12; + +/// AES-GCM authentication tag length in bytes. +pub const GCM_TAG_LENGTH: u32 = 16; + +/// Cipher block size = plaintext block size + nonce + GCM tag. 
+pub const CIPHER_BLOCK_SIZE: u32 = PLAIN_BLOCK_SIZE + NONCE_LENGTH + GCM_TAG_LENGTH; + +/// AGS1 stream magic bytes. +pub const GCM_STREAM_MAGIC: [u8; 4] = *b"AGS1"; + +/// AGS1 stream header length (4-byte magic + 4-byte block size). +pub const GCM_STREAM_HEADER_LENGTH: u32 = 8; + +/// Minimum valid AGS1 stream length (header + one empty block). +#[cfg(test)] +pub const MIN_STREAM_LENGTH: u32 = GCM_STREAM_HEADER_LENGTH + NONCE_LENGTH + GCM_TAG_LENGTH; + +/// Constructs the per-block AAD for AGS1 stream encryption. +/// +/// Format: `aad_prefix || block_index (4 bytes, little-endian)` +/// +/// This matches Java's `Ciphers.streamBlockAAD()`. +pub(crate) fn stream_block_aad(aad_prefix: &[u8], block_index: u32) -> Vec { + let index_bytes = block_index.to_le_bytes(); + if aad_prefix.is_empty() { + index_bytes.to_vec() + } else { + let mut aad = Vec::with_capacity(aad_prefix.len() + 4); + aad.extend_from_slice(aad_prefix); + aad.extend_from_slice(&index_bytes); + aad + } +} + +/// Transparent decryption of AGS1 stream-encrypted files. +/// +/// Implements the [`FileRead`] trait, providing random-access reads over +/// encrypted data. Each `read()` call determines which encrypted blocks +/// overlap the requested plaintext range, reads and decrypts them, then +/// returns the requested plaintext bytes. +/// +/// # Usage +/// +/// ```ignore +/// // (ignored: requires async runtime and concrete FileRead/FileWrite impls) +/// let reader = AesGcmFileRead::new( +/// inner_reader, // Box for the encrypted file +/// cipher, // Arc with the DEK +/// aad_prefix.to_vec(), +/// encrypted_file_length, +/// )?; +/// +/// // Read plaintext bytes transparently +/// let plaintext = reader.read(0..1024).await?; +/// ``` +pub struct AesGcmFileRead { + /// The underlying encrypted file reader. + inner: Box, + /// The AES-GCM cipher holding the DEK. + cipher: Arc, + /// AAD prefix from the key metadata. + aad_prefix: Box<[u8]>, + /// Total plaintext stream size in bytes. 
+ plain_stream_size: u64, + /// Total number of encrypted blocks. + num_blocks: u64, + /// Size of the last cipher block (may be smaller than `CIPHER_BLOCK_SIZE`). + last_cipher_block_size: u32, +} + +impl AesGcmFileRead { + /// Creates a new `AesGcmFileRead` for decrypting an AGS1 stream. + /// + /// Computes the plaintext size and block layout from the encrypted file + /// length. No I/O is performed; header validation happens implicitly + /// when blocks are decrypted (GCM authentication will fail on corrupt data). + /// + /// # Arguments + /// + /// * `inner` - Reader for the underlying encrypted file + /// * `cipher` - AES-GCM cipher initialized with the file's DEK + /// * `aad_prefix` - AAD prefix from the file's `StandardKeyMetadata` + /// * `encrypted_file_length` - Total byte length of the encrypted file + pub fn new( + inner: Box, + cipher: Arc, + aad_prefix: Box<[u8]>, + encrypted_file_length: u64, + ) -> Result { + let plain_stream_size = Self::calculate_plaintext_length(encrypted_file_length)?; + let stream_length = encrypted_file_length - GCM_STREAM_HEADER_LENGTH as u64; + + if stream_length == 0 { + return Ok(Self { + inner, + cipher, + aad_prefix, + plain_stream_size: 0, + num_blocks: 0, + last_cipher_block_size: 0, + }); + } + + let num_full_blocks = stream_length / CIPHER_BLOCK_SIZE as u64; + let cipher_bytes_in_last_block = (stream_length % CIPHER_BLOCK_SIZE as u64) as u32; + let full_blocks_only = cipher_bytes_in_last_block == 0; + + let num_blocks = if full_blocks_only { + num_full_blocks + } else { + num_full_blocks + 1 + }; + + if num_blocks > u32::MAX as u64 { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "AGS1 format supports at most {} blocks (~4 TiB per file), but file requires {num_blocks} blocks", + u32::MAX + ), + )); + } + + let last_cipher_block_size = if full_blocks_only { + CIPHER_BLOCK_SIZE + } else { + cipher_bytes_in_last_block + }; + + Ok(Self { + inner, + cipher, + aad_prefix, + plain_stream_size, + 
num_blocks, + last_cipher_block_size, + }) + } + + /// Returns the plaintext stream size in bytes. + pub fn plaintext_length(&self) -> u64 { + self.plain_stream_size + } + + /// Calculates the plaintext length from an encrypted file's total length. + /// + /// This is a static calculation matching Java's + /// `AesGcmInputStream.calculatePlaintextLength()`. + pub fn calculate_plaintext_length(encrypted_file_length: u64) -> Result { + if encrypted_file_length < GCM_STREAM_HEADER_LENGTH as u64 { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Encrypted file too short: {encrypted_file_length} bytes (minimum {GCM_STREAM_HEADER_LENGTH})" + ), + )); + } + + let stream_length = encrypted_file_length - GCM_STREAM_HEADER_LENGTH as u64; + + if stream_length == 0 { + return Ok(0); + } + + let num_full_blocks = stream_length / CIPHER_BLOCK_SIZE as u64; + let cipher_bytes_in_last_block = stream_length % CIPHER_BLOCK_SIZE as u64; + let full_blocks_only = cipher_bytes_in_last_block == 0; + + let plain_bytes_in_last_block = if full_blocks_only { + 0 + } else { + if cipher_bytes_in_last_block < (NONCE_LENGTH + GCM_TAG_LENGTH) as u64 { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Truncated encrypted file: last block is {} bytes (minimum {})", + cipher_bytes_in_last_block, + NONCE_LENGTH + GCM_TAG_LENGTH + ), + )); + } + cipher_bytes_in_last_block - NONCE_LENGTH as u64 - GCM_TAG_LENGTH as u64 + }; + + Ok(num_full_blocks * PLAIN_BLOCK_SIZE as u64 + plain_bytes_in_last_block) + } + + /// Returns the encrypted byte offset for a given block index. + fn encrypted_block_offset(block_index: u64) -> u64 { + block_index * CIPHER_BLOCK_SIZE as u64 + GCM_STREAM_HEADER_LENGTH as u64 + } + + /// Returns the cipher block size for a given block index. 
+ fn cipher_block_size(&self, block_index: u64) -> u32 { + if block_index == self.num_blocks - 1 { + self.last_cipher_block_size + } else { + CIPHER_BLOCK_SIZE + } + } +} + +#[async_trait::async_trait] +impl FileRead for AesGcmFileRead { + /// Reads and decrypts a plaintext byte range from the encrypted AGS1 stream. + /// + /// The caller specifies a range in **plaintext** coordinates (e.g. "bytes 0..1024 + /// of the original file"). This method translates that into the encrypted file + /// layout and performs the following steps: + /// + /// 1. **Map to blocks** — divides the plaintext range by `PLAIN_BLOCK_SIZE` to + /// find which encrypted blocks (`first_block..=last_block`) contain the + /// requested data. + /// + /// 2. **Single I/O read** — calculates the contiguous byte range in the + /// encrypted file that covers all needed blocks (including the 8-byte AGS1 + /// header offset, 12-byte nonces, and 16-byte GCM tags) and fetches them in + /// one call to the inner `FileRead`. + /// + /// 3. **Decrypt per block** — iterates over each cipher block in the response, + /// decrypts it with AES-GCM using the per-block AAD (`aad_prefix || block_index`), + /// and slices out only the plaintext bytes that overlap the requested range. + /// + /// 4. **Assemble result** — concatenates the slices into a single `Bytes` buffer + /// matching exactly `range.end - range.start` bytes. + /// + /// Because each block is independently encrypted with its own nonce and AAD, + /// arbitrary random-access reads are supported without decrypting the entire + /// file. GCM authentication is verified per-block, so any tampering is detected + /// at the granularity of individual blocks. 
+ async fn read(&self, range: Range) -> Result { + if range.start == range.end { + return Ok(Bytes::new()); + } + + if range.start > range.end { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Invalid read range: start ({}) is greater than end ({})", + range.start, range.end + ), + )); + } + + if range.end > self.plain_stream_size { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Read range {}..{} exceeds plaintext size {}", + range.start, range.end, self.plain_stream_size + ), + )); + } + + if self.num_blocks == 0 { + return Ok(Bytes::new()); + } + + let first_block = range.start / PLAIN_BLOCK_SIZE as u64; + let last_block = (range.end - 1) / PLAIN_BLOCK_SIZE as u64; + + // Read all needed encrypted blocks in a single I/O call + let encrypted_start = Self::encrypted_block_offset(first_block); + let encrypted_end = + Self::encrypted_block_offset(last_block) + self.cipher_block_size(last_block) as u64; + + let all_encrypted = self.inner.read(encrypted_start..encrypted_end).await?; + + // Decrypt each block and extract the requested plaintext range + let result_len = (range.end - range.start) as usize; + let mut result = BytesMut::with_capacity(result_len); + let mut encrypted_offset = 0usize; + + for block_idx in first_block..=last_block { + let block_size = self.cipher_block_size(block_idx) as usize; + let cipher_block = &all_encrypted[encrypted_offset..encrypted_offset + block_size]; + encrypted_offset += block_size; + + let aad = stream_block_aad(&self.aad_prefix, block_idx as u32); + let decrypted = self.cipher.decrypt(cipher_block, Some(&aad))?; + + // Calculate which slice of this decrypted block we need + let block_plain_start = block_idx * PLAIN_BLOCK_SIZE as u64; + let slice_start = if block_idx == first_block { + (range.start - block_plain_start) as usize + } else { + 0 + }; + let slice_end = if block_idx == last_block { + (range.end - block_plain_start) as usize + } else { + decrypted.len() + }; + + 
result.extend_from_slice(&decrypted[slice_start..slice_end]); + } + + Ok(result.freeze()) + } +} + +/// Transparent encryption of AGS1 stream-encrypted files. +/// +/// Implements the [`FileWrite`] trait, buffering plaintext and emitting +/// encrypted AGS1 blocks. This is the streaming write counterpart to +/// [`AesGcmFileRead`]. +/// +/// # Usage +/// +/// ```ignore +/// // (ignored: requires async runtime and concrete FileRead/FileWrite impls) +/// let writer = AesGcmFileWrite::new( +/// inner_writer, // Box for the output file +/// cipher, // Arc with the DEK +/// aad_prefix.to_vec(), +/// ); +/// +/// writer.write(plaintext_chunk).await?; +/// writer.close().await?; +/// ``` +pub struct AesGcmFileWrite { + /// The underlying output writer. + inner: Box, + /// The AES-GCM cipher holding the DEK. + cipher: Arc, + /// AAD prefix from the key metadata. + aad_prefix: Box<[u8]>, + /// Plaintext buffer accumulating data before block encryption. + buffer: Vec, + /// Current block index for AAD construction. + block_index: u32, + /// Whether the AGS1 header has been written. + header_written: bool, + /// Whether close() has been called. + closed: bool, + /// Whether the writer is in a poisoned state due to a failed inner write. + /// Once poisoned, all subsequent operations are rejected because the inner + /// writer may have received partial data. + poisoned: bool, +} + +impl AesGcmFileWrite { + /// Creates a new `AesGcmFileWrite` for encrypting to AGS1 format. + /// + /// No I/O is performed until `write()` or `close()` is called. + pub fn new( + inner: Box, + cipher: Arc, + aad_prefix: impl Into>, + ) -> Self { + Self { + inner, + cipher, + aad_prefix: aad_prefix.into(), + buffer: Vec::new(), + block_index: 0, + header_written: false, + closed: false, + poisoned: false, + } + } + + /// Writes the AGS1 header (magic + plain block size) to the inner writer. 
+ async fn write_header(&mut self) -> Result<()> { + let mut header = Vec::with_capacity(GCM_STREAM_HEADER_LENGTH as usize); + header.extend_from_slice(&GCM_STREAM_MAGIC); + header.extend_from_slice(&PLAIN_BLOCK_SIZE.to_le_bytes()); + if let Err(e) = self.inner.write(Bytes::from(header)).await { + self.poisoned = true; + return Err(e); + } + self.header_written = true; + Ok(()) + } + + /// Encrypts a plaintext block and writes it to the inner writer. + async fn encrypt_and_write_block(&mut self, block_data: &[u8]) -> Result<()> { + let aad = stream_block_aad(&self.aad_prefix, self.block_index); + let encrypted = self.cipher.encrypt(block_data, Some(&aad))?; + if let Err(e) = self.inner.write(Bytes::from(encrypted)).await { + self.poisoned = true; + return Err(e); + } + self.block_index = self.block_index.checked_add(1).ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "AGS1 block index overflow: file exceeds the maximum supported size (~4 TiB)", + ) + })?; + Ok(()) + } + + /// Encrypts the first `PLAIN_BLOCK_SIZE` bytes of the buffer in-place + /// and drains them, avoiding a 1 MiB temporary copy. 
+ async fn encrypt_and_drain_block(&mut self) -> Result<()> { + let aad = stream_block_aad(&self.aad_prefix, self.block_index); + let encrypted = self + .cipher + .encrypt(&self.buffer[..PLAIN_BLOCK_SIZE as usize], Some(&aad))?; + if let Err(e) = self.inner.write(Bytes::from(encrypted)).await { + self.poisoned = true; + return Err(e); + } + self.block_index = self.block_index.checked_add(1).ok_or_else(|| { + Error::new( + ErrorKind::DataInvalid, + "AGS1 block index overflow: file exceeds the maximum supported size (~4 TiB)", + ) + })?; + self.buffer.drain(..PLAIN_BLOCK_SIZE as usize); + Ok(()) + } +} + +#[async_trait::async_trait] +impl FileWrite for AesGcmFileWrite { + async fn write(&mut self, bs: Bytes) -> Result<()> { + if self.closed { + return Err(Error::new( + ErrorKind::Unexpected, + "Cannot write to a closed AesGcmFileWrite", + )); + } + if self.poisoned { + return Err(Error::new( + ErrorKind::Unexpected, + "AesGcmFileWrite is in a poisoned state due to a previous write failure", + )); + } + + if !self.header_written { + self.write_header().await?; + } + + self.buffer.extend_from_slice(&bs); + + // Flush full blocks + while self.buffer.len() >= PLAIN_BLOCK_SIZE as usize { + self.encrypt_and_drain_block().await?; + } + + Ok(()) + } + + async fn close(&mut self) -> Result<()> { + if self.closed { + return Err(Error::new( + ErrorKind::Unexpected, + "AesGcmFileWrite already closed", + )); + } + if self.poisoned { + return Err(Error::new( + ErrorKind::Unexpected, + "AesGcmFileWrite is in a poisoned state due to a previous write failure", + )); + } + + if !self.header_written { + self.write_header().await?; + } + + // Write the final block if there's remaining data, or if this is an empty file + // (block_index == 0). Skip writing a spurious empty block when the plaintext was + // exactly block-aligned (buffer empty, blocks already written). 
+ if !self.buffer.is_empty() || self.block_index == 0 { + let final_block = std::mem::take(&mut self.buffer); + self.encrypt_and_write_block(&final_block).await?; + } + self.closed = true; + + self.inner.close().await + } +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Encrypts plaintext into AGS1 format for testing. + /// + /// Mirrors Java's `AesGcmOutputStream` behavior: + /// - Always writes header + at least one block (even for empty input) + /// - Full blocks are `PLAIN_BLOCK_SIZE` bytes; last block may be shorter + fn encrypt_ags1(plaintext: &[u8], cipher: &AesGcmCipher, aad_prefix: &[u8]) -> Vec { + let mut result = Vec::new(); + + // Write header: "AGS1" + PLAIN_BLOCK_SIZE (LE) + result.extend_from_slice(&GCM_STREAM_MAGIC); + result.extend_from_slice(&PLAIN_BLOCK_SIZE.to_le_bytes()); + + // Write blocks + let mut offset = 0; + let mut block_index = 0u32; + + loop { + let remaining = plaintext.len() - offset; + let block_size = std::cmp::min(remaining, PLAIN_BLOCK_SIZE as usize); + + // Block 0 is always written (even if empty); subsequent empty blocks are skipped + if block_size == 0 && block_index > 0 { + break; + } + + let block_data = &plaintext[offset..offset + block_size]; + let aad = stream_block_aad(aad_prefix, block_index); + let encrypted = cipher.encrypt(block_data, Some(&aad)).unwrap(); + result.extend_from_slice(&encrypted); + + offset += block_size; + block_index += 1; + + // A partial block is always the last + if block_size < PLAIN_BLOCK_SIZE as usize { + break; + } + } + + result + } + + /// Helper to create an AesGcmCipher from raw key bytes. + fn make_cipher(key: &[u8]) -> AesGcmCipher { + use super::super::SecureKey; + let secure_key = SecureKey::new(key).unwrap(); + AesGcmCipher::new(secure_key) + } + + /// Helper to create an in-memory FileRead from bytes. + fn memory_reader(data: Vec) -> Box { + Box::new(MemoryFileRead(Bytes::from(data))) + } + + /// Simple in-memory FileRead for tests. 
+ struct MemoryFileRead(Bytes); + + #[async_trait::async_trait] + impl FileRead for MemoryFileRead { + async fn read(&self, range: Range) -> Result { + let start = range.start as usize; + let end = range.end as usize; + if end > self.0.len() { + return Err(Error::new( + ErrorKind::DataInvalid, + format!( + "Range {}..{} out of bounds for {} bytes", + start, + end, + self.0.len() + ), + )); + } + Ok(self.0.slice(start..end)) + } + } + + #[tokio::test] + async fn test_empty_file_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(b"", &cipher, aad_prefix); + + // Verify minimum length: header(8) + nonce(12) + tag(16) = 36 + assert_eq!(encrypted.len(), MIN_STREAM_LENGTH as usize); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), 0); + + // Reading empty range should return empty bytes + let result = reader.read(0..0).await.unwrap(); + assert!(result.is_empty()); + } + + #[tokio::test] + async fn test_small_file_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + let plaintext = b"Hello, Iceberg encryption!"; + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), plaintext.len() as u64); + + // Read entire file + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], plaintext); + } + + #[tokio::test] + async fn test_partial_read() { + let key = b"0123456789abcdef"; + let aad_prefix = b"aad-prefix-here!"; + let plaintext = b"ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + let cipher 
= make_cipher(key); + + let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + // Read a slice from the middle + let result = reader.read(10..20).await.unwrap(); + assert_eq!(&result[..], &plaintext[10..20]); + + // Read first byte + let result = reader.read(0..1).await.unwrap(); + assert_eq!(&result[..], &plaintext[0..1]); + + // Read last byte + let last = plaintext.len() as u64; + let result = reader.read(last - 1..last).await.unwrap(); + assert_eq!(&result[..], &plaintext[plaintext.len() - 1..]); + } + + #[tokio::test] + async fn test_multi_block_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"multi-block-aad!"; + + // 1.5 blocks of data + let size = PLAIN_BLOCK_SIZE as usize + PLAIN_BLOCK_SIZE as usize / 2; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), plaintext.len() as u64); + + // Read entire file + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_cross_block_read() { + let key = b"0123456789abcdef"; + let aad_prefix = b"cross-block-aad!"; + + // 2.5 blocks of data + let size = PLAIN_BLOCK_SIZE as usize * 2 + PLAIN_BLOCK_SIZE as usize / 2; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + 
aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + // Read across block boundary (last 100 bytes of block 0 + first 100 bytes of block 1) + let boundary = PLAIN_BLOCK_SIZE as u64; + let result = reader.read(boundary - 100..boundary + 100).await.unwrap(); + assert_eq!( + &result[..], + &plaintext[(boundary - 100) as usize..(boundary + 100) as usize] + ); + + // Read across two block boundaries (spans blocks 0, 1, and 2) + let result = reader.read(boundary - 50..boundary * 2 + 50).await.unwrap(); + assert_eq!( + &result[..], + &plaintext[(boundary - 50) as usize..(boundary * 2 + 50) as usize] + ); + } + + #[tokio::test] + async fn test_exact_block_size() { + let key = b"0123456789abcdef"; + let aad_prefix = b"exact-block-aad!"; + + // Exactly 1 block + let plaintext: Vec = (0..PLAIN_BLOCK_SIZE as usize) + .map(|i| (i % 256) as u8) + .collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), PLAIN_BLOCK_SIZE as u64); + + let result = reader.read(0..PLAIN_BLOCK_SIZE as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_block_size_plus_one() { + let key = b"0123456789abcdef"; + let aad_prefix = b"block-plus-one!!"; + + // 1 block + 1 byte + let size = PLAIN_BLOCK_SIZE as usize + 1; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), size as u64); + + // Read the last byte (in block 1) + let 
result = reader.read(size as u64 - 1..size as u64).await.unwrap(); + assert_eq!(result[0], plaintext[size - 1]); + + // Read all + let result = reader.read(0..size as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_block_size_minus_one() { + let key = b"0123456789abcdef"; + let aad_prefix = b"block-minus-one!"; + + // 1 block - 1 byte + let size = PLAIN_BLOCK_SIZE as usize - 1; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(&plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), size as u64); + + let result = reader.read(0..size as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_wrong_aad_fails() { + let key = b"0123456789abcdef"; + let aad_prefix = b"correct-aad-here"; + let plaintext = b"sensitive data here"; + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix); + + // Try to decrypt with wrong AAD + let mut bad_aad = aad_prefix.to_vec(); + bad_aad[0] ^= 0xFF; + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + bad_aad.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + let result = reader.read(0..plaintext.len() as u64).await; + assert!(result.is_err(), "Decryption with wrong AAD should fail"); + } + + #[tokio::test] + async fn test_wrong_key_fails() { + let key = b"0123456789abcdef"; + let wrong_key = b"fedcba9876543210"; + let aad_prefix = b"test-aad-prefix!"; + let plaintext = b"sensitive data"; + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + 
memory_reader(encrypted.clone()), + Arc::new(make_cipher(wrong_key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + let result = reader.read(0..plaintext.len() as u64).await; + assert!(result.is_err(), "Decryption with wrong key should fail"); + } + + #[tokio::test] + async fn test_out_of_bounds_read() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + let plaintext = b"short data"; + let cipher = make_cipher(key); + + let encrypted = encrypt_ags1(plaintext, &cipher, aad_prefix); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + let result = reader.read(0..plaintext.len() as u64 + 1).await; + assert!(result.is_err(), "Reading past end should fail"); + } + + #[tokio::test] + async fn test_calculate_plaintext_length() { + // Empty file: header only (not valid per Java, but handled) + assert_eq!( + AesGcmFileRead::calculate_plaintext_length(GCM_STREAM_HEADER_LENGTH as u64).unwrap(), + 0 + ); + + // Empty file with one empty block: header(8) + nonce(12) + tag(16) = 36 + assert_eq!( + AesGcmFileRead::calculate_plaintext_length(MIN_STREAM_LENGTH as u64).unwrap(), + 0 + ); + + // One full block: header(8) + cipher_block(1048604) = 1048612 + let one_full = GCM_STREAM_HEADER_LENGTH as u64 + CIPHER_BLOCK_SIZE as u64; + assert_eq!( + AesGcmFileRead::calculate_plaintext_length(one_full).unwrap(), + PLAIN_BLOCK_SIZE as u64 + ); + + // One full block + 1 byte: need partial second block + // Second block = nonce(12) + 1 byte ciphertext + tag(16) = 29 + let one_full_plus_one = one_full + NONCE_LENGTH as u64 + 1 + GCM_TAG_LENGTH as u64; + assert_eq!( + AesGcmFileRead::calculate_plaintext_length(one_full_plus_one).unwrap(), + PLAIN_BLOCK_SIZE as u64 + 1 + ); + } + + #[tokio::test] + async fn test_stream_block_aad() { + // With prefix + let aad = stream_block_aad(b"prefix", 0); + 
assert_eq!(&aad[..6], b"prefix"); + assert_eq!(&aad[6..], &0u32.to_le_bytes()); + + let aad = stream_block_aad(b"prefix", 1); + assert_eq!(&aad[..6], b"prefix"); + assert_eq!(&aad[6..], &1u32.to_le_bytes()); + + // Without prefix + let aad = stream_block_aad(b"", 42); + assert_eq!(&aad[..], &42u32.to_le_bytes()); + } + + #[tokio::test] + async fn test_encrypted_file_too_short() { + let result = AesGcmFileRead::new( + memory_reader(vec![0; 4]), + Arc::new(make_cipher(b"0123456789abcdef")), + [].into(), + 4, + ); + assert!(result.is_err()); + } + + // --- AesGcmFileWrite tests --- + + /// Shared-buffer FileWrite for testing AesGcmFileWrite output. + struct SharedMemoryWrite { + buffer: std::sync::Arc>>, + } + + /// FileWrite that fails after a configured number of successful writes. + struct FailingFileWrite { + writes_before_failure: usize, + write_count: usize, + } + + #[async_trait::async_trait] + impl FileWrite for FailingFileWrite { + async fn write(&mut self, _bs: Bytes) -> Result<()> { + if self.write_count >= self.writes_before_failure { + return Err(Error::new(ErrorKind::Unexpected, "simulated write failure")); + } + self.write_count += 1; + Ok(()) + } + + async fn close(&mut self) -> Result<()> { + Ok(()) + } + } + + #[async_trait::async_trait] + impl FileWrite for SharedMemoryWrite { + async fn write(&mut self, bs: Bytes) -> Result<()> { + self.buffer.lock().unwrap().extend_from_slice(&bs); + Ok(()) + } + + async fn close(&mut self) -> Result<()> { + Ok(()) + } + } + + /// Helper: one-shot encrypt through AesGcmFileWrite, return encrypted bytes. 
+ async fn write_through_ags1(plaintext: &[u8], key: &[u8], aad_prefix: &[u8]) -> Vec { + let buffer = std::sync::Arc::new(std::sync::Mutex::new(Vec::new())); + let inner: Box = Box::new(SharedMemoryWrite { + buffer: buffer.clone(), + }); + let cipher = Arc::new(make_cipher(key)); + let mut writer = AesGcmFileWrite::new(inner, cipher, aad_prefix.to_vec()); + + writer.write(Bytes::from(plaintext.to_vec())).await.unwrap(); + writer.close().await.unwrap(); + + buffer.lock().unwrap().clone() + } + + #[tokio::test] + async fn test_write_empty_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + + let encrypted = write_through_ags1(b"", key, aad_prefix).await; + + // Should produce header + one empty encrypted block + assert_eq!(encrypted.len(), MIN_STREAM_LENGTH as usize); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), 0); + } + + #[tokio::test] + async fn test_write_small_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"test-aad-prefix!"; + let plaintext = b"Hello, Iceberg encryption!"; + + let encrypted = write_through_ags1(plaintext, key, aad_prefix).await; + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), plaintext.len() as u64); + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], plaintext); + } + + #[tokio::test] + async fn test_write_multi_block_roundtrip() { + let key = b"0123456789abcdef"; + let aad_prefix = b"multi-block-aad!"; + + // 1.5 blocks of data + let size = PLAIN_BLOCK_SIZE as usize + PLAIN_BLOCK_SIZE as usize / 2; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + + let encrypted = 
write_through_ags1(&plaintext, key, aad_prefix).await; + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), plaintext.len() as u64); + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_cross_block_accumulation() { + let key = b"0123456789abcdef"; + let aad_prefix = b"cross-block-aad!"; + + let buffer = std::sync::Arc::new(std::sync::Mutex::new(Vec::new())); + let inner: Box = Box::new(SharedMemoryWrite { + buffer: buffer.clone(), + }); + let cipher = Arc::new(make_cipher(key)); + let mut writer = AesGcmFileWrite::new(inner, cipher, aad_prefix.to_vec()); + + // Write 1.5 blocks in 1000-byte chunks + let total_size = PLAIN_BLOCK_SIZE as usize + PLAIN_BLOCK_SIZE as usize / 2; + let plaintext: Vec = (0..total_size).map(|i| (i % 256) as u8).collect(); + let chunk_size = 1000; + for chunk in plaintext.chunks(chunk_size) { + writer.write(Bytes::from(chunk.to_vec())).await.unwrap(); + } + writer.close().await.unwrap(); + + let encrypted = buffer.lock().unwrap().clone(); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), plaintext.len() as u64); + let result = reader.read(0..plaintext.len() as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_exact_block_size() { + let key = b"0123456789abcdef"; + let aad_prefix = b"exact-block-aad!"; + + // Exactly 1 block + let plaintext: Vec = (0..PLAIN_BLOCK_SIZE as usize) + .map(|i| (i % 256) as u8) + .collect(); + + let encrypted = write_through_ags1(&plaintext, key, aad_prefix).await; + + let reader = AesGcmFileRead::new( + 
memory_reader(encrypted.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), PLAIN_BLOCK_SIZE as u64); + let result = reader.read(0..PLAIN_BLOCK_SIZE as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_block_aligned_no_spurious_empty_block() { + let key = b"0123456789abcdef"; + let aad_prefix = b"block-align-aad!"; + + // Write exactly one block of plaintext — close() should NOT add + // a trailing empty encrypted block (28 bytes: 12-byte nonce + 16-byte tag). + let plaintext: Vec = (0..PLAIN_BLOCK_SIZE as usize) + .map(|i| (i % 256) as u8) + .collect(); + + let encrypted_via_writer = write_through_ags1(&plaintext, key, aad_prefix).await; + let encrypted_via_reference = encrypt_ags1(&plaintext, &make_cipher(key), aad_prefix); + + // Both should be the same length — no extra 28-byte empty block + assert_eq!( + encrypted_via_writer.len(), + encrypted_via_reference.len(), + "Writer output should match reference encryption length (no spurious trailing block)" + ); + + // Verify roundtrip + let reader = AesGcmFileRead::new( + memory_reader(encrypted_via_writer.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted_via_writer.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), PLAIN_BLOCK_SIZE as u64); + let result = reader.read(0..PLAIN_BLOCK_SIZE as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_two_blocks_aligned_no_spurious_empty_block() { + let key = b"0123456789abcdef"; + let aad_prefix = b"2blk-align-aad!!"; + + // Exactly 2 blocks + let size = PLAIN_BLOCK_SIZE as usize * 2; + let plaintext: Vec = (0..size).map(|i| (i % 256) as u8).collect(); + + let encrypted_via_writer = write_through_ags1(&plaintext, key, aad_prefix).await; + let encrypted_via_reference = encrypt_ags1(&plaintext, 
&make_cipher(key), aad_prefix); + + assert_eq!( + encrypted_via_writer.len(), + encrypted_via_reference.len(), + "Writer output should match reference encryption length (no spurious trailing block)" + ); + + let reader = AesGcmFileRead::new( + memory_reader(encrypted_via_writer.clone()), + Arc::new(make_cipher(key)), + aad_prefix.as_slice().into(), + encrypted_via_writer.len() as u64, + ) + .unwrap(); + + assert_eq!(reader.plaintext_length(), size as u64); + let result = reader.read(0..size as u64).await.unwrap(); + assert_eq!(&result[..], &plaintext[..]); + } + + #[tokio::test] + async fn test_write_poisoned_after_inner_write_failure() { + let cipher = Arc::new(make_cipher(b"0123456789abcdef")); + // Fail on the second write (first write is the header, second is block data) + let inner: Box = Box::new(FailingFileWrite { + writes_before_failure: 1, + write_count: 0, + }); + let mut writer = AesGcmFileWrite::new(inner, cipher, b"aad-prefix-here!".to_vec()); + + // First write triggers header (succeeds) + block encrypt+write (fails) + let data = vec![0u8; PLAIN_BLOCK_SIZE as usize]; + let result = writer.write(Bytes::from(data)).await; + assert!(result.is_err()); + + // Subsequent write should be rejected as poisoned + let result = writer.write(Bytes::from(b"more data".to_vec())).await; + assert!(result.is_err()); + assert!( + result.unwrap_err().to_string().contains("poisoned"), + "expected poisoned error" + ); + + // Close should also be rejected + let result = writer.close().await; + assert!(result.is_err()); + assert!( + result.unwrap_err().to_string().contains("poisoned"), + "expected poisoned error on close" + ); + } +} diff --git a/crates/iceberg/src/error.rs b/crates/iceberg/src/error.rs index 55e9043d17..ad91473612 100644 --- a/crates/iceberg/src/error.rs +++ b/crates/iceberg/src/error.rs @@ -18,6 +18,7 @@ use std::backtrace::{Backtrace, BacktraceStatus}; use std::fmt; use std::fmt::{Debug, Display, Formatter}; +use std::sync::PoisonError; use 
chrono::{DateTime, TimeZone as _, Utc}; @@ -447,6 +448,11 @@ define_from_err!( "Failure in doing io operation" ); +/// Converts a [`PoisonError`] from a poisoned lock into an [`Error`]. +pub(crate) fn lock_error(e: PoisonError) -> Error { + Error::new(ErrorKind::Unexpected, format!("Lock poisoned: {e}")) +} + /// Converts a timestamp in milliseconds to `DateTime`, handling errors. /// /// # Arguments diff --git a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs index 96d1c651cd..4cd676dab1 100644 --- a/crates/iceberg/src/expr/visitors/page_index_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/page_index_evaluator.rs @@ -793,7 +793,7 @@ mod tests { }; use parquet::file::metadata::{PageIndexPolicy, ParquetMetaData}; use parquet::file::properties::WriterProperties; - use rand::{Rng, thread_rng}; + use rand::Rng; use tempfile::NamedTempFile; use super::PageIndexEvaluator; @@ -1284,13 +1284,13 @@ mod tests { #[test] fn eval_in_length_of_set_above_limit_all_rows() -> Result<()> { - let mut rng = thread_rng(); + let mut rng = rand::rng(); let (metadata, _temp_file) = create_test_parquet_file()?; let (column_index, offset_index, row_group_metadata) = get_test_metadata(&metadata); let (iceberg_schema_ref, field_id_map) = build_iceberg_schema_and_field_map()?; let filter = Reference::new("col_float") - .is_in(std::iter::repeat_with(|| Datum::float(rng.gen_range(0.0..10.0))).take(1000)) + .is_in(std::iter::repeat_with(|| Datum::float(rng.random_range(0.0..10.0))).take(1000)) .bind(iceberg_schema_ref.clone(), false)?; let result = PageIndexEvaluator::eval( diff --git a/crates/iceberg/src/expr/visitors/row_group_metrics_evaluator.rs b/crates/iceberg/src/expr/visitors/row_group_metrics_evaluator.rs index 0506b33af0..ad7e19f548 100644 --- a/crates/iceberg/src/expr/visitors/row_group_metrics_evaluator.rs +++ b/crates/iceberg/src/expr/visitors/row_group_metrics_evaluator.rs @@ -528,7 +528,7 @@ mod tests { use 
parquet::schema::types::{ ColumnDescriptor, ColumnPath, SchemaDescriptor, Type as parquetSchemaType, }; - use rand::{Rng, thread_rng}; + use rand::Rng; use super::RowGroupMetricsEvaluator; use crate::Result; @@ -1617,7 +1617,7 @@ mod tests { #[test] fn eval_true_for_too_many_literals_filter_is_in() -> Result<()> { - let mut rng = thread_rng(); + let mut rng = rand::rng(); let row_group_metadata = create_row_group_metadata( 1, @@ -1636,7 +1636,7 @@ mod tests { let (iceberg_schema_ref, field_id_map) = build_iceberg_schema_and_field_map()?; let filter = Reference::new("col_float") - .is_in(std::iter::repeat_with(|| Datum::float(rng.gen_range(0.0..10.0))).take(1000)) + .is_in(std::iter::repeat_with(|| Datum::float(rng.random_range(0.0..10.0))).take(1000)) .bind(iceberg_schema_ref.clone(), false)?; let result = RowGroupMetricsEvaluator::eval( diff --git a/crates/iceberg/src/io/file_io.rs b/crates/iceberg/src/io/file_io.rs index d00ba1ba6a..6260160f85 100644 --- a/crates/iceberg/src/io/file_io.rs +++ b/crates/iceberg/src/io/file_io.rs @@ -280,6 +280,13 @@ pub trait FileRead: Send + Sync + Unpin + 'static { async fn read(&self, range: Range) -> crate::Result; } +#[async_trait::async_trait] +impl + Send + Sync + Unpin + 'static> FileRead for T { + async fn read(&self, range: Range) -> crate::Result { + self.as_ref().read(range).await + } +} + /// Input file is used for reading from files. #[derive(Debug)] pub struct InputFile { diff --git a/crates/iceberg/src/io/storage/config/s3.rs b/crates/iceberg/src/io/storage/config/s3.rs index fae3a14757..64db47084e 100644 --- a/crates/iceberg/src/io/storage/config/s3.rs +++ b/crates/iceberg/src/io/storage/config/s3.rs @@ -69,8 +69,14 @@ pub const S3_DISABLE_CONFIG_LOAD: &str = "s3.disable-config-load"; /// /// This struct contains all the configuration options for connecting to Amazon S3. /// Use the builder pattern via `S3Config::builder()` to construct instances. 
-/// ``` -#[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize, TypedBuilder)] +/// +/// Defaults follow the Iceberg `S3FileIOProperties` spec (see +/// [`PATH_STYLE_ACCESS_DEFAULT = false`](https://github.com/apache/iceberg/blob/main/aws/src/main/java/org/apache/iceberg/aws/s3/S3FileIOProperties.java)), +/// i.e. virtual-host-style addressing is enabled unless +/// `s3.path-style-access=true` is explicitly set. This matches what +/// Java clients do out of the box and is required for a number of +/// S3-compatible stores that do not support path-style URLs. +#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize, TypedBuilder)] pub struct S3Config { /// S3 endpoint URL. #[builder(default, setter(strip_option, into))] @@ -88,7 +94,9 @@ pub struct S3Config { #[builder(default, setter(strip_option, into))] pub region: Option, /// Enable virtual host style (opposite of path style access). - #[builder(default)] + /// + /// Defaults to `true` to match Iceberg `S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false`. + #[builder(default = true)] pub enable_virtual_host_style: bool, /// Server side encryption type. #[builder(default, setter(strip_option, into))] @@ -125,6 +133,12 @@ pub struct S3Config { pub disable_config_load: bool, } +impl Default for S3Config { + fn default() -> Self { + Self::builder().build() + } +} + impl TryFrom<&StorageConfig> for S3Config { type Error = crate::Error; @@ -267,6 +281,17 @@ mod tests { assert_eq!(s3_config.region.as_deref(), Some("eu-west-1")); } + #[test] + fn test_s3_config_default_is_virtual_host_style() { + // Matches Iceberg S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false. 
+ assert!(S3Config::default().enable_virtual_host_style); + assert!( + S3Config::try_from(&StorageConfig::new()) + .unwrap() + .enable_virtual_host_style + ); + } + #[test] fn test_s3_config_path_style_access() { let storage_config = StorageConfig::new().with_prop(S3_PATH_STYLE_ACCESS, "true"); diff --git a/crates/iceberg/src/scan/incremental/mod.rs b/crates/iceberg/src/scan/incremental/mod.rs index 690f88e8b5..ff1cdd13f9 100644 --- a/crates/iceberg/src/scan/incremental/mod.rs +++ b/crates/iceberg/src/scan/incremental/mod.rs @@ -24,8 +24,7 @@ use std::sync::Arc; use crate::arrow::caching_delete_file_loader::CachingDeleteFileLoader; use crate::arrow::delete_filter::{DeleteFilter, is_equality_delete}; use crate::arrow::{ - ArrowReaderBuilder, CombinedIncrementalBatchRecordStream, StreamsInto, - UnzippedIncrementalBatchRecordStream, + ArrowReaderBuilder, CombinedIncrementalScanResult, StreamsInto, UnzippedIncrementalScanResult, }; use crate::delete_file_index::DeleteFileIndex; use crate::io::FileIO; @@ -733,8 +732,8 @@ impl IncrementalTableScan { Ok((append_stream, delete_stream)) } - /// Returns an [`CombinedIncrementalBatchRecordStream`] for this incremental table scan. - pub async fn to_arrow(&self) -> Result { + /// Returns a [`CombinedIncrementalScanResult`] for this incremental table scan. + pub async fn to_arrow(&self) -> Result { let mut arrow_reader_builder = ArrowReaderBuilder::new(self.file_io.clone()) .with_data_file_concurrency_limit(self.concurrency_limit_data_files) .with_row_group_filtering_enabled(true) @@ -749,9 +748,10 @@ impl IncrementalTableScan { file_scan_task_stream.stream(arrow_reader) } - /// Returns an [`UnzippedIncrementalBatchRecordStream`] for this incremental table scan. - /// This stream will yield separate streams for appended and deleted record batches. - pub async fn to_unzipped_arrow(&self) -> Result { + /// Returns an [`UnzippedIncrementalScanResult`] for this incremental table scan. 
+ /// This result contains separate streams for appended and deleted record batches, + /// together with scan metrics. + pub async fn to_unzipped_arrow(&self) -> Result { let mut arrow_reader_builder = ArrowReaderBuilder::new(self.file_io.clone()) .with_data_file_concurrency_limit(self.concurrency_limit_data_files) .with_row_group_filtering_enabled(true) diff --git a/crates/iceberg/src/scan/incremental/tests.rs b/crates/iceberg/src/scan/incremental/tests.rs index 2a53c9ff37..1f23d5c731 100644 --- a/crates/iceberg/src/scan/incremental/tests.rs +++ b/crates/iceberg/src/scan/incremental/tests.rs @@ -1565,7 +1565,7 @@ async fn scan_and_verify( .build() .unwrap(); - let stream = incremental_scan.to_arrow().await.unwrap(); + let stream = incremental_scan.to_arrow().await.unwrap().stream; let batches: Vec<_> = stream.try_collect().await.unwrap(); let append_batches: Vec<_> = batches @@ -2003,7 +2003,7 @@ async fn test_incremental_scan_builder_options() { .build() .unwrap(); - let stream = scan.to_arrow().await.unwrap(); + let stream = scan.to_arrow().await.unwrap().stream; let batches: Vec<_> = stream.try_collect().await.unwrap(); // Verify we have both append and delete batches @@ -2038,7 +2038,7 @@ async fn test_incremental_scan_builder_options() { .build() .unwrap(); - let stream = scan.to_arrow().await.unwrap(); + let stream = scan.to_arrow().await.unwrap().stream; let batches: Vec<_> = stream.try_collect().await.unwrap(); let append_batches: Vec<_> = batches @@ -2069,7 +2069,7 @@ async fn test_incremental_scan_builder_options() { .build() .unwrap(); - let stream = scan.to_arrow().await.unwrap(); + let stream = scan.to_arrow().await.unwrap().stream; let batches: Vec<_> = stream.try_collect().await.unwrap(); let append_batches: Vec<_> = batches @@ -2097,7 +2097,7 @@ async fn test_incremental_scan_builder_options() { .build() .unwrap(); - let stream = scan.to_arrow().await.unwrap(); + let stream = scan.to_arrow().await.unwrap().stream; let batches: Vec<_> = 
stream.try_collect().await.unwrap(); let append_batches = batches @@ -2123,7 +2123,7 @@ async fn test_incremental_scan_builder_options() { .build() .unwrap(); - let stream = scan.to_arrow().await.unwrap(); + let stream = scan.to_arrow().await.unwrap().stream; let batches: Vec<_> = stream.try_collect().await.unwrap(); let append_batches = batches @@ -2145,7 +2145,7 @@ async fn test_incremental_scan_builder_options() { .build() .unwrap(); - let stream = scan.to_arrow().await.unwrap(); + let stream = scan.to_arrow().await.unwrap().stream; let batches: Vec<_> = stream.try_collect().await.unwrap(); let append_batches: Vec<_> = batches @@ -2182,7 +2182,7 @@ async fn test_incremental_scan_builder_options() { .build() .unwrap(); - let stream = scan.to_arrow().await.unwrap(); + let stream = scan.to_arrow().await.unwrap().stream; let batches: Vec<_> = stream.try_collect().await.unwrap(); let delete_batches: Vec<_> = batches @@ -2772,7 +2772,7 @@ async fn test_incremental_scan_includes_root_when_from_is_none() { // Test 2: Scan using table.incremental_scan(None, None) API // This should INCLUDE the root snapshot let scan = fixture.table.incremental_scan(None, None).build().unwrap(); - let stream = scan.to_arrow().await.unwrap(); + let stream = scan.to_arrow().await.unwrap().stream; let batches: Vec<_> = stream.try_collect().await.unwrap(); // Collect all appended data @@ -2867,7 +2867,7 @@ async fn test_incremental_scan_with_file_column() { .build() .unwrap(); - let stream = scan.to_arrow().await.unwrap(); + let stream = scan.to_arrow().await.unwrap().stream; let batches: Vec<_> = stream.try_collect().await.unwrap(); // Get append batches @@ -2938,7 +2938,7 @@ async fn test_incremental_select_with_pos_column() { .build() .unwrap(); - let stream = scan.to_arrow().await.unwrap(); + let stream = scan.to_arrow().await.unwrap().stream; let batches: Vec<_> = stream.try_collect().await.unwrap(); // Get append batches (we're only appending in this test) @@ -2997,7 +2997,7 @@ async fn 
test_incremental_select_with_pos_column() { .build() .unwrap(); - let stream = scan.to_arrow().await.unwrap(); + let stream = scan.to_arrow().await.unwrap().stream; let batches: Vec<_> = stream.try_collect().await.unwrap(); // Get append batches @@ -3073,7 +3073,7 @@ async fn test_incremental_select_with_pos_and_file_columns() { .build() .unwrap(); - let stream = scan.to_arrow().await.unwrap(); + let stream = scan.to_arrow().await.unwrap().stream; let batches: Vec<_> = stream.try_collect().await.unwrap(); // Get append batches @@ -3169,7 +3169,8 @@ async fn test_incremental_scan_with_no_deletes() { .unwrap(); // Convert to arrow streams (unzipped into separate append and delete streams) - let (append_stream, delete_stream) = scan.to_unzipped_arrow().await.unwrap(); + let result = scan.to_unzipped_arrow().await.unwrap(); + let (append_stream, delete_stream) = (result.appends, result.deletes); // IMPORTANT: Try to collect from delete stream FIRST (without consuming append stream) // This is the scenario that previously caused a deadlock because the delete stream @@ -3233,7 +3234,8 @@ async fn test_incremental_scan_deadlock_with_deletes_and_appends() { .unwrap(); // Convert to unzipped streams - let (append_stream, delete_stream) = scan.to_unzipped_arrow().await.unwrap(); + let result = scan.to_unzipped_arrow().await.unwrap(); + let (append_stream, delete_stream) = (result.appends, result.deletes); // Read deletes first (this is important for triggering the deadlock) eprintln!("Starting to read delete stream..."); diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index 97d7d8afb1..86caa99309 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -33,6 +33,7 @@ use futures::{SinkExt, StreamExt, TryStreamExt}; pub use task::*; use crate::arrow::ArrowReaderBuilder; +pub use crate::arrow::{ScanMetrics, ScanResult}; use crate::delete_file_index::DeleteFileIndex; use 
crate::expr::visitors::inclusive_metrics_evaluator::InclusiveMetricsEvaluator; use crate::expr::{Bind, BoundPredicate, Predicate}; @@ -531,7 +532,10 @@ impl TableScan { arrow_reader_builder = arrow_reader_builder.with_batch_size(batch_size); } - arrow_reader_builder.build().read(self.plan_files().await?) + arrow_reader_builder + .build() + .read(self.plan_files().await?) + .map(|result| result.stream()) } /// Returns a reference to the column names of the table scan. @@ -1454,13 +1458,15 @@ pub mod tests { let batch_stream = reader .clone() .read(Box::pin(stream::iter(vec![Ok(plan_task.remove(0))]))) - .unwrap(); + .unwrap() + .stream(); let batch_1: Vec<_> = batch_stream.try_collect().await.unwrap(); let reader = ArrowReaderBuilder::new(fixture.table.file_io().clone()).build(); let batch_stream = reader .read(Box::pin(stream::iter(vec![Ok(plan_task.remove(0))]))) - .unwrap(); + .unwrap() + .stream(); let batch_2: Vec<_> = batch_stream.try_collect().await.unwrap(); assert_eq!(batch_1, batch_2); diff --git a/crates/iceberg/src/spec/snapshot.rs b/crates/iceberg/src/spec/snapshot.rs index 72b5417c47..3b8a3c934e 100644 --- a/crates/iceberg/src/spec/snapshot.rs +++ b/crates/iceberg/src/spec/snapshot.rs @@ -291,6 +291,7 @@ pub(super) mod _serde { pub snapshot_id: i64, #[serde(skip_serializing_if = "Option::is_none")] pub parent_snapshot_id: Option, + #[serde(default)] pub sequence_number: i64, pub timestamp_ms: i64, pub manifest_list: String, diff --git a/crates/iceberg/src/writer/file_writer/rolling_writer.rs b/crates/iceberg/src/writer/file_writer/rolling_writer.rs index b86f6a2ea7..b0b2d2f191 100644 --- a/crates/iceberg/src/writer/file_writer/rolling_writer.rs +++ b/crates/iceberg/src/writer/file_writer/rolling_writer.rs @@ -399,7 +399,7 @@ mod tests { "Kelly", "Larry", "Mallory", "Shawn", ]; - let mut rng = rand::thread_rng(); + let mut rng = rand::rng(); let batch_num = 10; let batch_rows = 100; let expected_rows = batch_num * batch_rows; diff --git 
a/crates/integration_tests/src/lib.rs b/crates/integration_tests/src/lib.rs index 4bf8f4d19c..feafa3ae9f 100644 --- a/crates/integration_tests/src/lib.rs +++ b/crates/integration_tests/src/lib.rs @@ -18,7 +18,9 @@ use std::collections::HashMap; use std::sync::OnceLock; -use iceberg::io::{S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY}; +use iceberg::io::{ + S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION, S3_SECRET_ACCESS_KEY, +}; use iceberg_catalog_rest::REST_CATALOG_PROP_URI; use iceberg_test_utils::{get_minio_endpoint, get_rest_catalog_endpoint, set_up}; @@ -45,6 +47,7 @@ impl GlobalTestFixture { (S3_ACCESS_KEY_ID.to_string(), "admin".to_string()), (S3_SECRET_ACCESS_KEY.to_string(), "password".to_string()), (S3_REGION.to_string(), "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS.to_string(), "true".to_string()), ]); GlobalTestFixture { catalog_config } diff --git a/crates/integration_tests/tests/common/mod.rs b/crates/integration_tests/tests/common/mod.rs index e49a57465c..b7197a3a46 100644 --- a/crates/integration_tests/tests/common/mod.rs +++ b/crates/integration_tests/tests/common/mod.rs @@ -28,7 +28,6 @@ pub async fn random_ns() -> Namespace { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .load("rest", fixture.catalog_config.clone()) diff --git a/crates/integration_tests/tests/conflict_commit_test.rs b/crates/integration_tests/tests/conflict_commit_test.rs index 3b1362b95d..af2c7a7779 100644 --- a/crates/integration_tests/tests/conflict_commit_test.rs +++ b/crates/integration_tests/tests/conflict_commit_test.rs @@ -43,7 +43,6 @@ async fn test_append_data_file_conflict() { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), 
customized_credential_load: None, })) .load("rest", fixture.catalog_config.clone()) diff --git a/crates/integration_tests/tests/read_evolved_schema.rs b/crates/integration_tests/tests/read_evolved_schema.rs index ae25a08987..f7416be2d4 100644 --- a/crates/integration_tests/tests/read_evolved_schema.rs +++ b/crates/integration_tests/tests/read_evolved_schema.rs @@ -34,7 +34,6 @@ async fn test_evolved_schema() { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .load("rest", fixture.catalog_config.clone()) diff --git a/crates/integration_tests/tests/read_positional_deletes.rs b/crates/integration_tests/tests/read_positional_deletes.rs index d4c4afeaf3..0f79596a12 100644 --- a/crates/integration_tests/tests/read_positional_deletes.rs +++ b/crates/integration_tests/tests/read_positional_deletes.rs @@ -30,7 +30,6 @@ async fn test_read_table_with_positional_deletes() { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .load("rest", fixture.catalog_config.clone()) diff --git a/crates/integrations/datafusion/src/physical_plan/scan.rs b/crates/integrations/datafusion/src/physical_plan/scan.rs index 234ab26470..36539ae503 100644 --- a/crates/integrations/datafusion/src/physical_plan/scan.rs +++ b/crates/integrations/datafusion/src/physical_plan/scan.rs @@ -196,7 +196,11 @@ impl DisplayAs for IcebergTableScan { self.predicates .clone() .map_or(String::from(""), |p| format!("{p}")) - ) + )?; + if let Some(limit) = self.limit { + write!(f, " limit:[{limit}]")?; + } + Ok(()) } } diff --git a/crates/sqllogictest/testdata/slts/df_test/basic_queries.slt b/crates/sqllogictest/testdata/slts/df_test/basic_queries.slt index 5d8889f158..a5ca4de46a 100644 
--- a/crates/sqllogictest/testdata/slts/df_test/basic_queries.slt +++ b/crates/sqllogictest/testdata/slts/df_test/basic_queries.slt @@ -43,6 +43,18 @@ INSERT INTO default.default.query_test_table VALUES ---- 10 +# Verify EXPLAIN shows limit is pushed down to IcebergTableScan +query TT +EXPLAIN SELECT * FROM default.default.query_test_table LIMIT 3 +---- +logical_plan +01)Limit: skip=0, fetch=3 +02)--TableScan: default.default.query_test_table projection=[id, name, score, category], fetch=3 +physical_plan +01)GlobalLimitExec: skip=0, fetch=3 +02)--CooperativeExec +03)----IcebergTableScan projection:[id,name,score,category] predicate:[] limit:[3] + # Test SELECT * with ORDER BY and LIMIT query ITRT SELECT * FROM default.default.query_test_table ORDER BY id LIMIT 3 diff --git a/crates/storage/opendal/README.md b/crates/storage/opendal/README.md index c5092eb97a..a4ad512e17 100644 --- a/crates/storage/opendal/README.md +++ b/crates/storage/opendal/README.md @@ -61,7 +61,6 @@ use iceberg_storage_opendal::OpenDalStorageFactory; async fn main() -> iceberg::Result<()> { let catalog = RestCatalogBuilder::default() .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .load( diff --git a/crates/storage/opendal/src/azdls.rs b/crates/storage/opendal/src/azdls.rs index f826b0a103..bab414a95c 100644 --- a/crates/storage/opendal/src/azdls.rs +++ b/crates/storage/opendal/src/azdls.rs @@ -124,10 +124,9 @@ pub(crate) fn azdls_config_parse(mut properties: HashMap) -> Res pub(crate) fn azdls_create_operator<'a>( absolute_path: &'a str, config: &AzdlsConfig, - configured_scheme: &AzureStorageScheme, ) -> Result<(opendal::Operator, &'a str)> { let path = absolute_path.parse::()?; - match_path_with_config(&path, config, configured_scheme)?; + match_path_with_config(&path, config)?; let op = azdls_config_build(config, &path)?; @@ -193,18 +192,7 @@ impl FromStr for AzureStorageScheme { } /// Validates whether 
the given path matches what's configured for the backend. -pub(crate) fn match_path_with_config( - path: &AzureStoragePath, - config: &AzdlsConfig, - configured_scheme: &AzureStorageScheme, -) -> Result<()> { - ensure_data_valid!( - &path.scheme == configured_scheme, - "Storage::Azdls: Scheme mismatch: configured {}, passed {}", - configured_scheme, - path.scheme - ); - +pub(crate) fn match_path_with_config(path: &AzureStoragePath, config: &AzdlsConfig) -> Result<()> { if let Some(ref configured_account_name) = config.account_name { ensure_data_valid!( &path.account_name == configured_account_name, @@ -518,7 +506,6 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, ), Some(("myfs", "/path/to/file.parquet")), ), @@ -531,33 +518,19 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, - ), - None, - ), - ( - "different scheme", - ( - "wasbs://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet", - AzdlsConfig { - account_name: Some("myaccount".to_string()), - endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), - ..Default::default() - }, - AzureStorageScheme::Abfss, ), None, ), ( "incompatible scheme for endpoint", ( - "abfs://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet", + // `abfss` implies https; configured endpoint is plain http. 
+ "abfss://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet", AzdlsConfig { account_name: Some("myaccount".to_string()), endpoint: Some("http://myaccount.dfs.core.windows.net".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, ), None, ), @@ -570,7 +543,6 @@ mod tests { endpoint: Some("https://myaccount.dfs.core.chinacloudapi.cn".to_string()), ..Default::default() }, - AzureStorageScheme::Abfss, ), None, ), @@ -584,7 +556,18 @@ mod tests { endpoint: None, ..Default::default() }, - AzureStorageScheme::Abfs, + ), + Some(("myfs", "/path/to/file.parquet")), + ), + ( + "different scheme is accepted", + ( + "wasbs://myfs@myaccount.blob.core.windows.net/path/to/file.parquet", + AzdlsConfig { + account_name: Some("myaccount".to_string()), + endpoint: Some("https://myaccount.blob.core.windows.net".to_string()), + ..Default::default() + }, ), Some(("myfs", "/path/to/file.parquet")), ), @@ -598,14 +581,13 @@ mod tests { account_key: Some("secret".to_string()), ..Default::default() }, - AzureStorageScheme::Wasb, ), Some(("testfs", "/path/to/data.parquet")), ), ]; for (name, input, expected) in test_cases { - let result = azdls_create_operator(input.0, &input.1, &input.2); + let result = azdls_create_operator(input.0, &input.1); match expected { Some((expected_filesystem, expected_path)) => { assert!(result.is_ok(), "Test case {name} failed: {result:?}"); diff --git a/crates/storage/opendal/src/lib.rs b/crates/storage/opendal/src/lib.rs index 8160680523..7fdf9e6965 100644 --- a/crates/storage/opendal/src/lib.rs +++ b/crates/storage/opendal/src/lib.rs @@ -46,7 +46,6 @@ use utils::from_opendal_error; cfg_if! { if #[cfg(feature = "opendal-azdls")] { mod azdls; - use azdls::AzureStorageScheme; use azdls::*; use opendal::services::AzdlsConfig; } @@ -108,9 +107,6 @@ pub enum OpenDalStorageFactory { /// S3 storage factory. #[cfg(feature = "opendal-s3")] S3 { - /// s3 storage could have `s3://` and `s3a://`. 
- /// Storing the scheme string here to return the correct path. - configured_scheme: String, /// Custom AWS credential loader. #[serde(skip)] customized_credential_load: Option, @@ -123,10 +119,7 @@ pub enum OpenDalStorageFactory { Oss, /// Azure Data Lake Storage factory. #[cfg(feature = "opendal-azdls")] - Azdls { - /// The configured Azure storage scheme. - configured_scheme: AzureStorageScheme, - }, + Azdls, } #[typetag::serde(name = "OpenDalStorageFactory")] @@ -142,10 +135,8 @@ impl StorageFactory for OpenDalStorageFactory { OpenDalStorageFactory::Fs => Ok(Arc::new(OpenDalStorage::LocalFs)), #[cfg(feature = "opendal-s3")] OpenDalStorageFactory::S3 { - configured_scheme, customized_credential_load, } => Ok(Arc::new(OpenDalStorage::S3 { - configured_scheme: configured_scheme.clone(), config: s3_config_parse(config.props().clone())?.into(), customized_credential_load: customized_credential_load.clone(), })), @@ -158,12 +149,9 @@ impl StorageFactory for OpenDalStorageFactory { config: oss_config_parse(config.props().clone())?.into(), })), #[cfg(feature = "opendal-azdls")] - OpenDalStorageFactory::Azdls { configured_scheme } => { - Ok(Arc::new(OpenDalStorage::Azdls { - configured_scheme: configured_scheme.clone(), - config: azdls_config_parse(config.props().clone())?.into(), - })) - } + OpenDalStorageFactory::Azdls => Ok(Arc::new(OpenDalStorage::Azdls { + config: azdls_config_parse(config.props().clone())?.into(), + })), #[cfg(all( not(feature = "opendal-memory"), not(feature = "opendal-fs"), @@ -196,11 +184,11 @@ pub enum OpenDalStorage { #[cfg(feature = "opendal-fs")] LocalFs, /// S3 storage variant. + /// + /// Accepts any S3-family URL (`s3://`, `s3a://`, `s3n://`); the scheme is + /// derived from the path at call time. #[cfg(feature = "opendal-s3")] S3 { - /// s3 storage could have `s3://` and `s3a://`. - /// Storing the scheme string here to return the correct path. - configured_scheme: String, /// S3 configuration. 
config: Arc, /// Custom AWS credential loader. @@ -220,16 +208,12 @@ pub enum OpenDalStorage { config: Arc, }, /// Azure Data Lake Storage variant. - /// Expects paths of the form + /// + /// Accepts paths of the form /// `abfs[s]://@.dfs./` or /// `wasb[s]://@.blob./`. #[cfg(feature = "opendal-azdls")] - #[allow(private_interfaces)] Azdls { - /// The configured Azure storage scheme. - /// Because Azdls accepts multiple possible schemes, we store the full - /// passed scheme here to later validate schemes passed via paths. - configured_scheme: AzureStorageScheme, /// Azure DLS configuration. config: Arc, }, @@ -274,15 +258,21 @@ impl OpenDalStorage { } #[cfg(feature = "opendal-s3")] OpenDalStorage::S3 { - configured_scheme, config, customized_credential_load, } => { let op = s3_config_build(config, customized_credential_load, path)?; let op_info = op.info(); - // Check prefix of s3 path. - let prefix = format!("{}://{}/", configured_scheme, op_info.name()); + // Use the URL scheme in the path for prefix matching. This enables + // use of S3-compatible storage backends using custom schemes (e.g., `minio://`, `r2://`). + let url = url::Url::parse(path).map_err(|e| { + Error::new( + ErrorKind::DataInvalid, + format!("Invalid s3 url: {path}: {e}"), + ) + })?; + let prefix = format!("{}://{}/", url.scheme(), op_info.name()); if path.starts_with(&prefix) { (op, &path[prefix.len()..]) } else { @@ -319,10 +309,7 @@ impl OpenDalStorage { } } #[cfg(feature = "opendal-azdls")] - OpenDalStorage::Azdls { - configured_scheme, - config, - } => azdls_create_operator(path, config, configured_scheme)?, + OpenDalStorage::Azdls { config } => azdls_create_operator(path, config)?, #[cfg(all( not(feature = "opendal-s3"), not(feature = "opendal-fs"), @@ -357,9 +344,7 @@ impl OpenDalStorage { #[cfg(feature = "opendal-fs")] OpenDalStorage::LocalFs => Ok(path.strip_prefix("file:/").unwrap_or(&path[1..])), #[cfg(feature = "opendal-s3")] - OpenDalStorage::S3 { - configured_scheme, .. 
- } => { + OpenDalStorage::S3 { .. } => { let url = url::Url::parse(path)?; let bucket = url.host_str().ok_or_else(|| { Error::new( @@ -367,7 +352,7 @@ impl OpenDalStorage { format!("Invalid s3 url: {path}, missing bucket"), ) })?; - let prefix = format!("{}://{}/", configured_scheme, bucket); + let prefix = format!("{}://{}/", url.scheme(), bucket); if path.starts_with(&prefix) { Ok(&path[prefix.len()..]) } else { @@ -416,12 +401,9 @@ impl OpenDalStorage { } } #[cfg(feature = "opendal-azdls")] - OpenDalStorage::Azdls { - configured_scheme, - config, - } => { + OpenDalStorage::Azdls { config } => { let azure_path = path.parse::()?; - match_path_with_config(&azure_path, config, configured_scheme)?; + match_path_with_config(&azure_path, config)?; let relative_path_len = azure_path.path.len(); Ok(&path[path.len() - relative_path_len..]) } @@ -631,47 +613,21 @@ mod tests { #[test] fn test_relativize_path_s3() { let storage = OpenDalStorage::S3 { - configured_scheme: "s3".to_string(), config: Arc::new(S3Config::default()), customized_credential_load: None, }; - assert_eq!( - storage - .relativize_path("s3://my-bucket/path/to/file.parquet") - .unwrap(), - "path/to/file.parquet" - ); - - // s3a scheme - let storage_s3a = OpenDalStorage::S3 { - configured_scheme: "s3a".to_string(), - config: Arc::new(S3Config::default()), - customized_credential_load: None, - }; - assert_eq!( - storage_s3a - .relativize_path("s3a://my-bucket/path/to/file.parquet") - .unwrap(), - "path/to/file.parquet" - ); - } - - #[cfg(feature = "opendal-s3")] - #[test] - fn test_relativize_path_s3_scheme_mismatch() { - let storage = OpenDalStorage::S3 { - configured_scheme: "s3".to_string(), - config: Arc::new(S3Config::default()), - customized_credential_load: None, - }; - - // Scheme mismatch should error - assert!( - storage - .relativize_path("s3a://my-bucket/path/to/file.parquet") - .is_err() - ); + // All S3-family schemes are accepted by the same storage instance. 
+ // Custom schemes for S3-compatible stores (e.g., `minio://`) are also + // accepted because the path's scheme is used as-is for prefix matching. + for scheme in ["s3", "s3a", "s3n", "minio"] { + assert_eq!( + storage + .relativize_path(&format!("{scheme}://my-bucket/path/to/file.parquet")) + .unwrap(), + "path/to/file.parquet" + ); + } } #[cfg(feature = "opendal-gcs")] @@ -736,7 +692,6 @@ mod tests { #[test] fn test_relativize_path_azdls() { let storage = OpenDalStorage::Azdls { - configured_scheme: AzureStorageScheme::Abfss, config: Arc::new(AzdlsConfig { account_name: Some("myaccount".to_string()), endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), @@ -751,24 +706,4 @@ mod tests { "/path/to/file.parquet" ); } - - #[cfg(feature = "opendal-azdls")] - #[test] - fn test_relativize_path_azdls_scheme_mismatch() { - let storage = OpenDalStorage::Azdls { - configured_scheme: AzureStorageScheme::Abfss, - config: Arc::new(AzdlsConfig { - account_name: Some("myaccount".to_string()), - endpoint: Some("https://myaccount.dfs.core.windows.net".to_string()), - ..Default::default() - }), - }; - - // wasbs scheme doesn't match configured abfss - assert!( - storage - .relativize_path("wasbs://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet") - .is_err() - ); - } } diff --git a/crates/storage/opendal/src/resolving.rs b/crates/storage/opendal/src/resolving.rs index 7c06cf96a5..64a16b18d2 100644 --- a/crates/storage/opendal/src/resolving.rs +++ b/crates/storage/opendal/src/resolving.rs @@ -70,29 +70,28 @@ fn parse_scheme(scheme: &str) -> Result { } } -/// Extract the scheme string from a path URL. -fn extract_scheme(path: &str) -> Result { +/// Extract the [`Scheme`] family from a path URL. 
+fn extract_scheme(path: &str) -> Result { let url = Url::parse(path).map_err(|e| { Error::new( ErrorKind::DataInvalid, format!("Invalid path: {path}, failed to parse URL: {e}"), ) })?; - Ok(url.scheme().to_string()) + parse_scheme(url.scheme()) } /// Build an [`OpenDalStorage`] variant for the given scheme and config properties. fn build_storage_for_scheme( - scheme: &str, + scheme: Scheme, props: &HashMap, #[cfg(feature = "opendal-s3")] customized_credential_load: &Option, ) -> Result { - match parse_scheme(scheme)? { + match scheme { #[cfg(feature = "opendal-s3")] Scheme::S3 => { let config = crate::s3::s3_config_parse(props.clone())?; Ok(OpenDalStorage::S3 { - configured_scheme: scheme.to_string(), config: Arc::new(config), customized_credential_load: customized_credential_load.clone(), }) @@ -113,10 +112,8 @@ fn build_storage_for_scheme( } #[cfg(feature = "opendal-azdls")] Scheme::Azdls => { - let configured_scheme: crate::azdls::AzureStorageScheme = scheme.parse()?; let config = crate::azdls::azdls_config_parse(props.clone())?; Ok(OpenDalStorage::Azdls { - configured_scheme, config: Arc::new(config), }) } @@ -196,14 +193,15 @@ impl StorageFactory for OpenDalResolvingStorageFactory { /// to the appropriate [`OpenDalStorage`] variant. /// /// Sub-storages are lazily created on first use for each scheme and cached -/// for subsequent operations. +/// for subsequent operations. Scheme aliases like `s3`/`s3a`/`s3n` map to +/// the same [`Scheme`] variant, so they share a storage instance. #[derive(Debug, Serialize, Deserialize)] pub struct OpenDalResolvingStorage { /// Configuration properties shared across all backends. props: HashMap, - /// Cache of scheme → storage mappings. + /// Cache of scheme to storage mappings. #[serde(skip, default)] - storages: RwLock>>, + storages: RwLock>>, /// Custom AWS credential loader for S3 storage. 
#[cfg(feature = "opendal-s3")] #[serde(skip)] @@ -239,7 +237,7 @@ impl OpenDalResolvingStorage { } let storage = build_storage_for_scheme( - &scheme, + scheme, &self.props, #[cfg(feature = "opendal-s3")] &self.customized_credential_load, @@ -288,7 +286,7 @@ impl Storage for OpenDalResolvingStorage { async fn delete_stream(&self, mut paths: BoxStream<'static, String>) -> Result<()> { // Group paths by scheme so each resolved storage receives a batch, // avoiding repeated operator creation per path. - let mut grouped: HashMap> = HashMap::new(); + let mut grouped: HashMap> = HashMap::new(); while let Some(path) = paths.next().await { let scheme = extract_scheme(&path)?; grouped.entry(scheme).or_default().push(path); @@ -317,3 +315,54 @@ impl Storage for OpenDalResolvingStorage { )) } } + +#[cfg(test)] +mod tests { + use super::*; + + /// Builds a resolving storage with empty props, suitable for `resolve()` + /// calls that don't actually hit any backend. + fn empty_resolving_storage() -> OpenDalResolvingStorage { + OpenDalResolvingStorage { + props: HashMap::new(), + storages: RwLock::new(HashMap::new()), + #[cfg(feature = "opendal-s3")] + customized_credential_load: None, + } + } + + #[cfg(feature = "opendal-s3")] + #[test] + fn test_resolve_s3_aliases_share_instance() { + let storage = empty_resolving_storage(); + + // All three S3-family schemes must collapse to a single cached + // `Arc` so that catalogs handing the resolver a mix + // of `s3://`, `s3a://`, `s3n://` paths don't rebuild operators. 
+ let a = storage.resolve("s3://bucket/key").unwrap(); + let b = storage.resolve("s3a://bucket/key").unwrap(); + let c = storage.resolve("s3n://bucket/key").unwrap(); + + assert!(Arc::ptr_eq(&a, &b), "s3 and s3a should share one instance"); + assert!(Arc::ptr_eq(&a, &c), "s3 and s3n should share one instance"); + } + + #[cfg(feature = "opendal-azdls")] + #[test] + fn test_resolve_azdls_aliases_share_instance() { + let storage = empty_resolving_storage(); + + let path_for = |scheme: &str| { + format!("{scheme}://myfs@myaccount.dfs.core.windows.net/path/to/file.parquet") + }; + + // All Azure schemes collapse onto one cached instance. + let abfss = storage.resolve(&path_for("abfss")).unwrap(); + let abfs = storage.resolve(&path_for("abfs")).unwrap(); + + assert!( + Arc::ptr_eq(&abfss, &abfs), + "abfss and abfs should share one instance" + ); + } +} diff --git a/crates/storage/opendal/src/s3.rs b/crates/storage/opendal/src/s3.rs index 7db88d273f..2e21418606 100644 --- a/crates/storage/opendal/src/s3.rs +++ b/crates/storage/opendal/src/s3.rs @@ -37,6 +37,12 @@ use crate::utils::{from_opendal_error, is_truthy}; /// Parse iceberg props to s3 config. pub(crate) fn s3_config_parse(mut m: HashMap) -> Result { let mut cfg = S3Config::default(); + // Match Iceberg `S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false`: + // virtual-host-style addressing is the spec default. opendal's own + // default is path-style, which disagrees with the Java SDK and breaks + // S3-compatible stores that only accept virtual-hosted-style URLs. + // Any explicit `s3.path-style-access` property below overrides this. 
+ cfg.enable_virtual_host_style = true; if let Some(endpoint) = m.remove(S3_ENDPOINT) { cfg.endpoint = Some(endpoint); }; @@ -177,3 +183,28 @@ impl AwsCredentialLoad for CustomAwsCredentialLoader { self.0.load_credential(client).await } } + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use iceberg::io::S3_PATH_STYLE_ACCESS; + + use super::s3_config_parse; + + fn parse_with(prop: Option<&str>) -> bool { + let mut props = HashMap::new(); + if let Some(v) = prop { + props.insert(S3_PATH_STYLE_ACCESS.to_string(), v.to_string()); + } + s3_config_parse(props).unwrap().enable_virtual_host_style + } + + #[test] + fn s3_config_parse_path_style_access() { + // Match Iceberg S3FileIOProperties.PATH_STYLE_ACCESS_DEFAULT = false. + assert!(parse_with(None)); + assert!(parse_with(Some("false"))); + assert!(!parse_with(Some("true"))); + } +} diff --git a/crates/storage/opendal/tests/file_io_s3_test.rs b/crates/storage/opendal/tests/file_io_s3_test.rs index 207a4454d7..d6dd8a3b45 100644 --- a/crates/storage/opendal/tests/file_io_s3_test.rs +++ b/crates/storage/opendal/tests/file_io_s3_test.rs @@ -26,7 +26,8 @@ mod tests { use async_trait::async_trait; use futures::StreamExt; use iceberg::io::{ - FileIO, FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY, + FileIO, FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION, + S3_SECRET_ACCESS_KEY, }; use iceberg_storage_opendal::{CustomAwsCredentialLoader, OpenDalStorageFactory}; use iceberg_test_utils::{get_minio_endpoint, normalize_test_name_with_parts, set_up}; @@ -39,7 +40,6 @@ mod tests { let minio_endpoint = get_minio_endpoint(); FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: None, })) .with_props(vec![ @@ -47,6 +47,7 @@ mod tests { (S3_ACCESS_KEY_ID, "admin".to_string()), (S3_SECRET_ACCESS_KEY, "password".to_string()), (S3_REGION, "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS, 
"true".to_string()), ]) .build() } @@ -132,13 +133,13 @@ mod tests { // Test that the loader can be used in FileIOBuilder with OpenDalStorageFactory let _builder = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: Some(custom_loader), })) .with_props(vec![ (S3_ENDPOINT, "http://localhost:9000".to_string()), ("bucket", "test-bucket".to_string()), (S3_REGION, "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS, "true".to_string()), ]); } @@ -154,12 +155,12 @@ mod tests { // Build FileIO with custom credential loader via OpenDalStorageFactory let file_io_with_custom_creds = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: Some(custom_loader), })) .with_props(vec![ (S3_ENDPOINT, minio_endpoint), (S3_REGION, "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS, "true".to_string()), ]) .build(); @@ -182,12 +183,12 @@ mod tests { // Build FileIO with custom credential loader via OpenDalStorageFactory let file_io_with_custom_creds = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { - configured_scheme: "s3".to_string(), customized_credential_load: Some(custom_loader), })) .with_props(vec![ (S3_ENDPOINT, minio_endpoint), (S3_REGION, "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS, "true".to_string()), ]) .build(); diff --git a/crates/storage/opendal/tests/resolving_storage_test.rs b/crates/storage/opendal/tests/resolving_storage_test.rs index 4572ad2c2d..c235089508 100644 --- a/crates/storage/opendal/tests/resolving_storage_test.rs +++ b/crates/storage/opendal/tests/resolving_storage_test.rs @@ -29,7 +29,8 @@ mod tests { use std::sync::Arc; use iceberg::io::{ - FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY, + FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION, + S3_SECRET_ACCESS_KEY, }; use iceberg_storage_opendal::OpenDalResolvingStorageFactory; use 
iceberg_test_utils::{get_minio_endpoint, normalize_test_name_with_parts, set_up}; @@ -45,6 +46,7 @@ mod tests { (S3_ACCESS_KEY_ID, "admin".to_string()), (S3_SECRET_ACCESS_KEY, "password".to_string()), (S3_REGION, "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS, "true".to_string()), ]) .build() } @@ -288,6 +290,7 @@ mod tests { .with_props(vec![ (S3_ENDPOINT, minio_endpoint), (S3_REGION, "us-east-1".to_string()), + (S3_PATH_STYLE_ACCESS, "true".to_string()), ]) .build(); diff --git a/website/src/release.md b/website/src/release.md index 7549b8ef4d..79c2bca8a0 100644 --- a/website/src/release.md +++ b/website/src/release.md @@ -108,6 +108,7 @@ Bump all components' version in the project to the new iceberg version. Please note that this version is the exact version of the release, not the release candidate version. - rust core: bump version in `Cargo.toml` +- python binding: bump version in `bindings/python/Cargo.toml` ### Update docs @@ -159,6 +160,7 @@ dist ├── apache-iceberg-rust-0.2.0.tar.gz.asc └── apache-iceberg-rust-0.2.0.tar.gz.sha512 ``` +It is recommended to verify the artifacts yourself before uploading them to the SVN dist repo, see [How to verify a release](#how-to-verify-a-release) ### Upload artifacts to the SVN dist repo @@ -175,7 +177,9 @@ svn co https://dist.apache.org/repos/dist/dev/iceberg/ /tmp/iceberg-dist-dev Then, upload the artifacts: -> The `${release_version}` here should be like `0.2.0-rc.1` +> The `${release_version}` here should be like `0.2.0-rc1` + +Example of uploaded artifacts can be found at: https://dist.apache.org/repos/dist/dev/iceberg/apache-iceberg-rust-0.9.1-rc3/ ```shell # create a directory named by version @@ -189,7 +193,8 @@ cd /tmp/iceberg-dist-dev/ # check svn status svn status - +``` +```shell # add to svn svn add apache-iceberg-rust-${release_version} @@ -219,11 +224,11 @@ Title: Content: ``` -Hello, Apache Iceberg Rust Community, +Hello Apache Iceberg Rust Community, -This is a call for a vote to release Apache 
Iceberg rust version ${iceberg_version}. +This is a call for a vote to release Apache Iceberg Rust version ${iceberg_version}. -The tag to be voted on is v${release_version}. +The tag to be voted on is: v${release_version}. The release candidate: @@ -237,30 +242,30 @@ Git tag for the release: https://github.com/apache/iceberg-rust/releases/tag/v${release_version} -Please download, verify, and test. +Please download, verify, and test the release candidate. -The VOTE will be open for at least 72 hours and until the necessary -number of votes are reached. +This vote will be open for at least 72 hours and will remain open until the required number of votes is reached. -[ ] +1 approve -[ ] +0 no opinion -[ ] -1 disapprove with the reason +Please vote accordingly: +[ ] +1 Approve +[ ] +0 No opinion +[ ] -1 Disapprove (please provide a reason) -To learn more about Apache Iceberg, please see https://rust.iceberg.apache.org/ +To learn more about Apache Iceberg, please visit: +https://rust.iceberg.apache.org/ Checklist for reference: - -[ ] Download links are valid. -[ ] Checksums and signatures. -[ ] LICENSE/NOTICE files exist -[ ] No unexpected binary files +[ ] Download links are valid +[ ] Checksums and signatures are correct +[ ] LICENSE and NOTICE files are present +[ ] No unexpected binary files are included [ ] All source files have ASF headers -[ ] Can compile from source +[ ] The project builds successfully from source -More details please refer to https://rust.iceberg.apache.org/release.html#how-to-verify-a-release. - -Thanks +For more details, please refer to: +https://rust.iceberg.apache.org/release.html#how-to-verify-a-release +Thanks, ${name} ``` @@ -277,7 +282,7 @@ Title: Content: ``` -Hello, Apache Iceberg Rust Community, +Hello Apache Iceberg Rust Community, The vote to release Apache Iceberg Rust ${release_version} has passed. @@ -295,8 +300,7 @@ Non-Binding votes: Vote thread: ${vote_thread_url} -Thanks - +Thanks, ${name} ```