From 7d4c26ad0e12d53d1c609789a66ff33216724c55 Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Wed, 4 Feb 2026 17:03:35 -0800 Subject: [PATCH 01/13] Add storage module and reorganize code --- crates/iceberg/src/io/file_io.rs | 2 +- crates/iceberg/src/io/mod.rs | 10 +--------- .../iceberg/src/io/{ => storage}/config/azdls.rs | 0 crates/iceberg/src/io/{ => storage}/config/gcs.rs | 2 +- crates/iceberg/src/io/{ => storage}/config/mod.rs | 0 crates/iceberg/src/io/{ => storage}/config/oss.rs | 0 crates/iceberg/src/io/{ => storage}/config/s3.rs | 1 + crates/iceberg/src/io/{ => storage}/local_fs.rs | 14 ++++++++++---- crates/iceberg/src/io/{ => storage}/memory.rs | 14 ++++++++++---- .../iceberg/src/io/{storage.rs => storage/mod.rs} | 13 ++++++++++++- .../iceberg/src/io/{ => storage}/opendal/azdls.rs | 9 ++++++++- crates/iceberg/src/io/{ => storage}/opendal/fs.rs | 0 crates/iceberg/src/io/{ => storage}/opendal/gcs.rs | 4 ++-- .../iceberg/src/io/{ => storage}/opendal/memory.rs | 0 crates/iceberg/src/io/{ => storage}/opendal/mod.rs | 6 ++---- crates/iceberg/src/io/{ => storage}/opendal/oss.rs | 2 +- crates/iceberg/src/io/{ => storage}/opendal/s3.rs | 4 ++-- 17 files changed, 51 insertions(+), 30 deletions(-) rename crates/iceberg/src/io/{ => storage}/config/azdls.rs (100%) rename crates/iceberg/src/io/{ => storage}/config/gcs.rs (100%) rename crates/iceberg/src/io/{ => storage}/config/mod.rs (100%) rename crates/iceberg/src/io/{ => storage}/config/oss.rs (100%) rename crates/iceberg/src/io/{ => storage}/config/s3.rs (99%) rename crates/iceberg/src/io/{ => storage}/local_fs.rs (99%) rename crates/iceberg/src/io/{ => storage}/memory.rs (99%) rename crates/iceberg/src/io/{storage.rs => storage/mod.rs} (93%) rename crates/iceberg/src/io/{ => storage}/opendal/azdls.rs (99%) rename crates/iceberg/src/io/{ => storage}/opendal/fs.rs (100%) rename crates/iceberg/src/io/{ => storage}/opendal/gcs.rs (98%) rename crates/iceberg/src/io/{ => storage}/opendal/memory.rs (100%) rename crates/iceberg/src/io/{ => storage}/opendal/mod.rs (99%) rename crates/iceberg/src/io/{ => storage}/opendal/oss.rs (95%) rename crates/iceberg/src/io/{ => storage}/opendal/s3.rs (99%) diff --git a/crates/iceberg/src/io/file_io.rs b/crates/iceberg/src/io/file_io.rs index 1ad71d8531..d7bd6609e0 100644 --- a/crates/iceberg/src/io/file_io.rs +++ b/crates/iceberg/src/io/file_io.rs @@ -23,7 +23,7 @@ use std::sync::Arc; use bytes::Bytes; use url::Url; -use super::opendal::OpenDalStorage; +use super::storage::OpenDalStorage; use super::storage::Storage; use crate::{Error, ErrorKind, Result}; diff --git a/crates/iceberg/src/io/mod.rs b/crates/iceberg/src/io/mod.rs index 5f65a15fc3..07f40d104e 100644 --- a/crates/iceberg/src/io/mod.rs +++ b/crates/iceberg/src/io/mod.rs @@ -66,19 +66,11 @@ //! - `new_input`: Create input file for reading. //! - `new_output`: Create output file for writing. 
-mod config; mod file_io; -mod local_fs; -mod memory; -mod opendal; mod storage; -pub use config::*; pub use file_io::*; -#[cfg(feature = "storage-s3")] -pub use opendal::CustomAwsCredentialLoader; -pub use opendal::{OpenDalStorage, OpenDalStorageFactory}; -pub use storage::{Storage, StorageConfig, StorageFactory}; +pub use storage::*; pub(crate) mod object_cache; diff --git a/crates/iceberg/src/io/config/azdls.rs b/crates/iceberg/src/io/storage/config/azdls.rs similarity index 100% rename from crates/iceberg/src/io/config/azdls.rs rename to crates/iceberg/src/io/storage/config/azdls.rs diff --git a/crates/iceberg/src/io/config/gcs.rs b/crates/iceberg/src/io/storage/config/gcs.rs similarity index 100% rename from crates/iceberg/src/io/config/gcs.rs rename to crates/iceberg/src/io/storage/config/gcs.rs index 5b11567f4d..61523b1629 100644 --- a/crates/iceberg/src/io/config/gcs.rs +++ b/crates/iceberg/src/io/storage/config/gcs.rs @@ -24,8 +24,8 @@ use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; use super::StorageConfig; -use crate::Result; use crate::io::is_truthy; +use crate::Result; /// Google Cloud Project ID. pub const GCS_PROJECT_ID: &str = "gcs.project-id"; diff --git a/crates/iceberg/src/io/config/mod.rs b/crates/iceberg/src/io/storage/config/mod.rs similarity index 100% rename from crates/iceberg/src/io/config/mod.rs rename to crates/iceberg/src/io/storage/config/mod.rs diff --git a/crates/iceberg/src/io/config/oss.rs b/crates/iceberg/src/io/storage/config/oss.rs similarity index 100% rename from crates/iceberg/src/io/config/oss.rs rename to crates/iceberg/src/io/storage/config/oss.rs diff --git a/crates/iceberg/src/io/config/s3.rs b/crates/iceberg/src/io/storage/config/s3.rs similarity index 99% rename from crates/iceberg/src/io/config/s3.rs rename to crates/iceberg/src/io/storage/config/s3.rs index fae3a14757..6f5060e6f9 100644 --- a/crates/iceberg/src/io/config/s3.rs +++ b/crates/iceberg/src/io/storage/config/s3.rs @@ -125,6 +125,7 @@ pub struct S3Config { pub disable_config_load: bool, } + impl TryFrom<&StorageConfig> for S3Config { type Error = crate::Error; diff --git a/crates/iceberg/src/io/local_fs.rs b/crates/iceberg/src/io/storage/local_fs.rs similarity index 99% rename from crates/iceberg/src/io/local_fs.rs rename to crates/iceberg/src/io/storage/local_fs.rs index 0a55199f70..16848f51a3 100644 --- a/crates/iceberg/src/io/local_fs.rs +++ b/crates/iceberg/src/io/storage/local_fs.rs @@ -31,10 +31,8 @@ use async_trait::async_trait; use bytes::Bytes; use serde::{Deserialize, Serialize}; -use super::{ - FileMetadata, FileRead, FileWrite, InputFile, OutputFile, Storage, StorageConfig, - StorageFactory, -}; +use super::{Storage, StorageConfig, StorageFactory}; +use crate::io::{FileMetadata, FileRead, FileWrite, InputFile, OutputFile}; use crate::{Error, ErrorKind, Result}; /// Local filesystem storage implementation. @@ -52,6 +50,7 @@ use crate::{Error, ErrorKind, Result}; #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct LocalFsStorage; + impl LocalFsStorage { /// Create a new `LocalFsStorage` instance. 
pub fn new() -> Self { @@ -108,6 +107,7 @@ impl Storage for LocalFsStorage { }) } + async fn read(&self, path: &str) -> Result { let path = Self::normalize_path(path); let content = fs::read(&path).map_err(|e| { @@ -165,6 +165,7 @@ impl Storage for LocalFsStorage { })?; } + let file = fs::File::create(&path).map_err(|e| { Error::new( ErrorKind::Unexpected, @@ -215,6 +216,7 @@ pub struct LocalFsFileRead { file: std::sync::Mutex, } + impl LocalFsFileRead { /// Create a new `LocalFsFileRead` with the given file. pub fn new(file: fs::File) -> Self { @@ -269,6 +271,7 @@ impl LocalFsFileWrite { } } + #[async_trait] impl FileWrite for LocalFsFileWrite { async fn write(&mut self, bs: Bytes) -> Result<()> { @@ -324,6 +327,7 @@ impl StorageFactory for LocalFsStorageFactory { } } + #[cfg(test)] mod tests { use tempfile::TempDir; @@ -373,6 +377,7 @@ mod tests { assert_eq!(read_content, content); } + #[tokio::test] async fn test_local_fs_storage_exists() { let tmp_dir = TempDir::new().unwrap(); @@ -446,6 +451,7 @@ mod tests { assert!(!dir_path.exists()); } + #[tokio::test] async fn test_local_fs_storage_reader() { let tmp_dir = TempDir::new().unwrap(); diff --git a/crates/iceberg/src/io/memory.rs b/crates/iceberg/src/io/storage/memory.rs similarity index 99% rename from crates/iceberg/src/io/memory.rs rename to crates/iceberg/src/io/storage/memory.rs index 39f1f5db9d..f875efd01c 100644 --- a/crates/iceberg/src/io/memory.rs +++ b/crates/iceberg/src/io/storage/memory.rs @@ -30,10 +30,8 @@ use async_trait::async_trait; use bytes::Bytes; use serde::{Deserialize, Serialize}; -use super::{ - FileMetadata, FileRead, FileWrite, InputFile, OutputFile, Storage, StorageConfig, - StorageFactory, -}; +use super::{Storage, StorageConfig, StorageFactory}; +use crate::io::{FileMetadata, FileRead, FileWrite, InputFile, OutputFile}; use crate::{Error, ErrorKind, Result}; /// In-memory storage implementation. @@ -106,6 +104,7 @@ impl Storage for MemoryStorage { Ok(data.contains_key(&normalized)) } + async fn metadata(&self, path: &str) -> Result { let normalized = Self::normalize_path(path); let data = self.data.read().map_err(|e| { @@ -159,6 +158,7 @@ impl Storage for MemoryStorage { } } + async fn write(&self, path: &str, bs: Bytes) -> Result<()> { let normalized = Self::normalize_path(path); let mut data = self.data.write().map_err(|e| { @@ -229,6 +229,7 @@ impl Storage for MemoryStorage { } } + /// Factory for creating `MemoryStorage` instances. /// /// This factory implements `StorageFactory` and creates `MemoryStorage` @@ -279,6 +280,7 @@ impl FileRead for MemoryFileRead { } } + /// File writer for in-memory storage. /// /// This struct implements `FileWrite` for writing to in-memory storage. 
@@ -338,6 +340,7 @@ impl FileWrite for MemoryFileWrite { } } + #[cfg(test)] mod tests { use super::*; @@ -395,6 +398,7 @@ mod tests { assert_eq!(read_content, content); } + #[tokio::test] async fn test_memory_storage_exists() { let storage = MemoryStorage::new(); @@ -463,6 +467,7 @@ mod tests { assert!(storage.exists("memory://other/file.txt").await.unwrap()); } + #[tokio::test] async fn test_memory_storage_reader() { let storage = MemoryStorage::new(); @@ -534,6 +539,7 @@ mod tests { assert!(result.is_err()); } + #[test] fn test_memory_storage_serialization() { let storage = MemoryStorage::new(); diff --git a/crates/iceberg/src/io/storage.rs b/crates/iceberg/src/io/storage/mod.rs similarity index 93% rename from crates/iceberg/src/io/storage.rs rename to crates/iceberg/src/io/storage/mod.rs index 15cc85ab10..1aa46952e8 100644 --- a/crates/iceberg/src/io/storage.rs +++ b/crates/iceberg/src/io/storage/mod.rs @@ -17,6 +17,11 @@ //! Storage interfaces for Iceberg. +mod config; +mod local_fs; +mod memory; +mod opendal; + use std::fmt::Debug; use std::sync::Arc; @@ -25,7 +30,13 @@ use bytes::Bytes; use super::{FileMetadata, FileRead, FileWrite, InputFile, OutputFile}; use crate::Result; -pub use crate::io::config::StorageConfig; + +pub use config::*; +pub use local_fs::{LocalFsStorage, LocalFsStorageFactory}; +pub use memory::{MemoryStorage, MemoryStorageFactory}; +#[cfg(feature = "storage-s3")] +pub use opendal::CustomAwsCredentialLoader; +pub use opendal::{OpenDalStorage, OpenDalStorageFactory}; /// Trait for storage operations in Iceberg. /// diff --git a/crates/iceberg/src/io/opendal/azdls.rs b/crates/iceberg/src/io/storage/opendal/azdls.rs similarity index 99% rename from crates/iceberg/src/io/opendal/azdls.rs rename to crates/iceberg/src/io/storage/opendal/azdls.rs index c957fd62a3..7a86f4fe1c 100644 --- a/crates/iceberg/src/io/opendal/azdls.rs +++ b/crates/iceberg/src/io/storage/opendal/azdls.rs @@ -24,7 +24,7 @@ use opendal::services::AzdlsConfig; use serde::{Deserialize, Serialize}; use url::Url; -use crate::io::config::{ +use crate::io::storage::config::{ ADLS_ACCOUNT_KEY, ADLS_ACCOUNT_NAME, ADLS_AUTHORITY_HOST, ADLS_CLIENT_ID, ADLS_CLIENT_SECRET, ADLS_CONNECTION_STRING, ADLS_SAS_TOKEN, ADLS_TENANT_ID, }; @@ -49,6 +49,7 @@ pub(crate) fn azdls_config_parse(mut properties: HashMap) -> Res config.account_key = Some(account_key); } + if let Some(sas_token) = properties.remove(ADLS_SAS_TOKEN) { config.sas_token = Some(sas_token); } @@ -109,6 +110,7 @@ pub enum AzureStorageScheme { Wasbs, } + impl AzureStorageScheme { // Returns the respective encrypted or plain-text HTTP scheme. pub fn as_http_scheme(&self) -> &str { @@ -169,6 +171,7 @@ fn match_path_with_config( ); } + if let Some(ref configured_endpoint) = config.endpoint { let passed_http_scheme = path.scheme.as_http_scheme(); ensure_data_valid!( @@ -225,6 +228,7 @@ struct AzureStoragePath { path: String, } + impl AzureStoragePath { /// Converts the AzureStoragePath into a full endpoint URL. 
/// @@ -282,6 +286,7 @@ fn parse_azure_storage_endpoint(url: &Url) -> Result<(&str, &str, &str)> { )); } + let (storage, endpoint_suffix) = endpoint.split_once('.').ok_or(Error::new( ErrorKind::DataInvalid, "AzureStoragePath: No storage service", @@ -382,6 +387,7 @@ mod tests { } } + #[test] fn test_azdls_create_operator() { let test_cases = vec![ @@ -483,6 +489,7 @@ mod tests { } } + #[test] fn test_azure_storage_path_parse() { let test_cases = vec![ diff --git a/crates/iceberg/src/io/opendal/fs.rs b/crates/iceberg/src/io/storage/opendal/fs.rs similarity index 100% rename from crates/iceberg/src/io/opendal/fs.rs rename to crates/iceberg/src/io/storage/opendal/fs.rs diff --git a/crates/iceberg/src/io/opendal/gcs.rs b/crates/iceberg/src/io/storage/opendal/gcs.rs similarity index 98% rename from crates/iceberg/src/io/opendal/gcs.rs rename to crates/iceberg/src/io/storage/opendal/gcs.rs index 5c6145d32b..a21e89fad9 100644 --- a/crates/iceberg/src/io/opendal/gcs.rs +++ b/crates/iceberg/src/io/storage/opendal/gcs.rs @@ -22,11 +22,11 @@ use opendal::Operator; use opendal::services::GcsConfig; use url::Url; -use crate::io::config::{ +use crate::io::is_truthy; +use crate::io::storage::config::{ GCS_ALLOW_ANONYMOUS, GCS_CREDENTIALS_JSON, GCS_DISABLE_CONFIG_LOAD, GCS_DISABLE_VM_METADATA, GCS_NO_AUTH, GCS_SERVICE_PATH, GCS_TOKEN, }; -use crate::io::is_truthy; use crate::{Error, ErrorKind, Result}; /// Parse iceberg properties to [`GcsConfig`]. diff --git a/crates/iceberg/src/io/opendal/memory.rs b/crates/iceberg/src/io/storage/opendal/memory.rs similarity index 100% rename from crates/iceberg/src/io/opendal/memory.rs rename to crates/iceberg/src/io/storage/opendal/memory.rs diff --git a/crates/iceberg/src/io/opendal/mod.rs b/crates/iceberg/src/io/storage/opendal/mod.rs similarity index 99% rename from crates/iceberg/src/io/opendal/mod.rs rename to crates/iceberg/src/io/storage/opendal/mod.rs index fb49dc9e3f..4f342a73eb 100644 --- a/crates/iceberg/src/io/opendal/mod.rs +++ b/crates/iceberg/src/io/storage/opendal/mod.rs @@ -38,10 +38,8 @@ use opendal::{Operator, Scheme}; pub use s3::CustomAwsCredentialLoader; use serde::{Deserialize, Serialize}; -use super::{ - FileIOBuilder, FileMetadata, FileRead, FileWrite, InputFile, OutputFile, Storage, - StorageConfig, StorageFactory, -}; +use super::{Storage, StorageConfig, StorageFactory}; +use crate::io::{FileIOBuilder, FileMetadata, FileRead, FileWrite, InputFile, OutputFile}; use crate::{Error, ErrorKind, Result}; #[cfg(feature = "storage-azdls")] diff --git a/crates/iceberg/src/io/opendal/oss.rs b/crates/iceberg/src/io/storage/opendal/oss.rs similarity index 95% rename from crates/iceberg/src/io/opendal/oss.rs rename to crates/iceberg/src/io/storage/opendal/oss.rs index 83fc1424aa..91a1c0a667 100644 --- a/crates/iceberg/src/io/opendal/oss.rs +++ b/crates/iceberg/src/io/storage/opendal/oss.rs @@ -21,7 +21,7 @@ use opendal::services::OssConfig; use opendal::{Configurator, Operator}; use url::Url; -use crate::io::config::{OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET, OSS_ENDPOINT}; +use crate::io::storage::config::{OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET, OSS_ENDPOINT}; use crate::{Error, ErrorKind, Result}; /// Parse iceberg props to oss config. 
diff --git a/crates/iceberg/src/io/opendal/s3.rs b/crates/iceberg/src/io/storage/opendal/s3.rs similarity index 99% rename from crates/iceberg/src/io/opendal/s3.rs rename to crates/iceberg/src/io/storage/opendal/s3.rs index bf7399e01b..9ee30e06e7 100644 --- a/crates/iceberg/src/io/opendal/s3.rs +++ b/crates/iceberg/src/io/storage/opendal/s3.rs @@ -25,13 +25,13 @@ pub use reqsign::{AwsCredential, AwsCredentialLoad}; use reqwest::Client; use url::Url; -use crate::io::config::{ +use crate::io::is_truthy; +use crate::io::storage::config::{ CLIENT_REGION, S3_ACCESS_KEY_ID, S3_ALLOW_ANONYMOUS, S3_ASSUME_ROLE_ARN, S3_ASSUME_ROLE_EXTERNAL_ID, S3_ASSUME_ROLE_SESSION_NAME, S3_DISABLE_CONFIG_LOAD, S3_DISABLE_EC2_METADATA, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION, S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, S3_SSE_KEY, S3_SSE_MD5, S3_SSE_TYPE, }; -use crate::io::is_truthy; use crate::{Error, ErrorKind, Result}; /// Parse iceberg props to s3 config. From d5a6eacf723822d0ef99e5013b79e2347ab69f81 Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Wed, 4 Feb 2026 17:04:23 -0800 Subject: [PATCH 02/13] fmt --- crates/iceberg/src/io/file_io.rs | 3 +-- crates/iceberg/src/io/storage/config/gcs.rs | 2 +- crates/iceberg/src/io/storage/config/s3.rs | 1 - crates/iceberg/src/io/storage/local_fs.rs | 8 -------- crates/iceberg/src/io/storage/memory.rs | 8 -------- crates/iceberg/src/io/storage/mod.rs | 7 +++---- crates/iceberg/src/io/storage/opendal/azdls.rs | 7 ------- 7 files changed, 5 insertions(+), 31 deletions(-) diff --git a/crates/iceberg/src/io/file_io.rs b/crates/iceberg/src/io/file_io.rs index d7bd6609e0..267f295369 100644 --- a/crates/iceberg/src/io/file_io.rs +++ b/crates/iceberg/src/io/file_io.rs @@ -23,8 +23,7 @@ use std::sync::Arc; use bytes::Bytes; use url::Url; -use super::storage::OpenDalStorage; -use super::storage::Storage; +use super::storage::{OpenDalStorage, Storage}; use crate::{Error, ErrorKind, Result}; /// FileIO implementation, used to manipulate files in underlying storage. diff --git a/crates/iceberg/src/io/storage/config/gcs.rs b/crates/iceberg/src/io/storage/config/gcs.rs index 61523b1629..5b11567f4d 100644 --- a/crates/iceberg/src/io/storage/config/gcs.rs +++ b/crates/iceberg/src/io/storage/config/gcs.rs @@ -24,8 +24,8 @@ use serde::{Deserialize, Serialize}; use typed_builder::TypedBuilder; use super::StorageConfig; -use crate::io::is_truthy; use crate::Result; +use crate::io::is_truthy; /// Google Cloud Project ID. pub const GCS_PROJECT_ID: &str = "gcs.project-id"; diff --git a/crates/iceberg/src/io/storage/config/s3.rs b/crates/iceberg/src/io/storage/config/s3.rs index 6f5060e6f9..fae3a14757 100644 --- a/crates/iceberg/src/io/storage/config/s3.rs +++ b/crates/iceberg/src/io/storage/config/s3.rs @@ -125,7 +125,6 @@ pub struct S3Config { pub disable_config_load: bool, } - impl TryFrom<&StorageConfig> for S3Config { type Error = crate::Error; diff --git a/crates/iceberg/src/io/storage/local_fs.rs b/crates/iceberg/src/io/storage/local_fs.rs index 16848f51a3..134242d031 100644 --- a/crates/iceberg/src/io/storage/local_fs.rs +++ b/crates/iceberg/src/io/storage/local_fs.rs @@ -50,7 +50,6 @@ use crate::{Error, ErrorKind, Result}; #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct LocalFsStorage; - impl LocalFsStorage { /// Create a new `LocalFsStorage` instance. 
pub fn new() -> Self { @@ -107,7 +106,6 @@ impl Storage for LocalFsStorage { }) } - async fn read(&self, path: &str) -> Result { let path = Self::normalize_path(path); let content = fs::read(&path).map_err(|e| { @@ -165,7 +163,6 @@ impl Storage for LocalFsStorage { })?; } - let file = fs::File::create(&path).map_err(|e| { Error::new( ErrorKind::Unexpected, @@ -216,7 +213,6 @@ pub struct LocalFsFileRead { file: std::sync::Mutex, } - impl LocalFsFileRead { /// Create a new `LocalFsFileRead` with the given file. pub fn new(file: fs::File) -> Self { @@ -271,7 +267,6 @@ impl LocalFsFileWrite { } } - #[async_trait] impl FileWrite for LocalFsFileWrite { async fn write(&mut self, bs: Bytes) -> Result<()> { @@ -327,7 +322,6 @@ impl StorageFactory for LocalFsStorageFactory { } } - #[cfg(test)] mod tests { use tempfile::TempDir; @@ -377,7 +371,6 @@ mod tests { assert_eq!(read_content, content); } - #[tokio::test] async fn test_local_fs_storage_exists() { let tmp_dir = TempDir::new().unwrap(); @@ -451,7 +444,6 @@ mod tests { assert!(!dir_path.exists()); } - #[tokio::test] async fn test_local_fs_storage_reader() { let tmp_dir = TempDir::new().unwrap(); diff --git a/crates/iceberg/src/io/storage/memory.rs b/crates/iceberg/src/io/storage/memory.rs index f875efd01c..ef1168b17a 100644 --- a/crates/iceberg/src/io/storage/memory.rs +++ b/crates/iceberg/src/io/storage/memory.rs @@ -104,7 +104,6 @@ impl Storage for MemoryStorage { Ok(data.contains_key(&normalized)) } - async fn metadata(&self, path: &str) -> Result { let normalized = Self::normalize_path(path); let data = self.data.read().map_err(|e| { @@ -158,7 +157,6 @@ impl Storage for MemoryStorage { } } - async fn write(&self, path: &str, bs: Bytes) -> Result<()> { let normalized = Self::normalize_path(path); let mut data = self.data.write().map_err(|e| { @@ -229,7 +227,6 @@ impl Storage for MemoryStorage { } } - /// Factory for creating `MemoryStorage` instances. /// /// This factory implements `StorageFactory` and creates `MemoryStorage` @@ -280,7 +277,6 @@ impl FileRead for MemoryFileRead { } } - /// File writer for in-memory storage. /// /// This struct implements `FileWrite` for writing to in-memory storage. 
@@ -340,7 +336,6 @@ impl FileWrite for MemoryFileWrite { } } - #[cfg(test)] mod tests { use super::*; @@ -398,7 +393,6 @@ mod tests { assert_eq!(read_content, content); } - #[tokio::test] async fn test_memory_storage_exists() { let storage = MemoryStorage::new(); @@ -467,7 +461,6 @@ mod tests { assert!(storage.exists("memory://other/file.txt").await.unwrap()); } - #[tokio::test] async fn test_memory_storage_reader() { let storage = MemoryStorage::new(); @@ -539,7 +532,6 @@ mod tests { assert!(result.is_err()); } - #[test] fn test_memory_storage_serialization() { let storage = MemoryStorage::new(); diff --git a/crates/iceberg/src/io/storage/mod.rs b/crates/iceberg/src/io/storage/mod.rs index 1aa46952e8..31ceaf2436 100644 --- a/crates/iceberg/src/io/storage/mod.rs +++ b/crates/iceberg/src/io/storage/mod.rs @@ -27,10 +27,6 @@ use std::sync::Arc; use async_trait::async_trait; use bytes::Bytes; - -use super::{FileMetadata, FileRead, FileWrite, InputFile, OutputFile}; -use crate::Result; - pub use config::*; pub use local_fs::{LocalFsStorage, LocalFsStorageFactory}; pub use memory::{MemoryStorage, MemoryStorageFactory}; @@ -38,6 +34,9 @@ pub use memory::{MemoryStorage, MemoryStorageFactory}; pub use opendal::CustomAwsCredentialLoader; pub use opendal::{OpenDalStorage, OpenDalStorageFactory}; +use super::{FileMetadata, FileRead, FileWrite, InputFile, OutputFile}; +use crate::Result; + /// Trait for storage operations in Iceberg. /// /// The trait supports serialization via `typetag`, allowing storage instances to be diff --git a/crates/iceberg/src/io/storage/opendal/azdls.rs b/crates/iceberg/src/io/storage/opendal/azdls.rs index 7a86f4fe1c..1d85aa3d3f 100644 --- a/crates/iceberg/src/io/storage/opendal/azdls.rs +++ b/crates/iceberg/src/io/storage/opendal/azdls.rs @@ -49,7 +49,6 @@ pub(crate) fn azdls_config_parse(mut properties: HashMap) -> Res config.account_key = Some(account_key); } - if let Some(sas_token) = properties.remove(ADLS_SAS_TOKEN) { config.sas_token = Some(sas_token); } @@ -110,7 +109,6 @@ pub enum AzureStorageScheme { Wasbs, } - impl AzureStorageScheme { // Returns the respective encrypted or plain-text HTTP scheme. pub fn as_http_scheme(&self) -> &str { @@ -171,7 +169,6 @@ fn match_path_with_config( ); } - if let Some(ref configured_endpoint) = config.endpoint { let passed_http_scheme = path.scheme.as_http_scheme(); ensure_data_valid!( @@ -228,7 +225,6 @@ struct AzureStoragePath { path: String, } - impl AzureStoragePath { /// Converts the AzureStoragePath into a full endpoint URL. 
/// @@ -286,7 +282,6 @@ fn parse_azure_storage_endpoint(url: &Url) -> Result<(&str, &str, &str)> { )); } - let (storage, endpoint_suffix) = endpoint.split_once('.').ok_or(Error::new( ErrorKind::DataInvalid, "AzureStoragePath: No storage service", @@ -387,7 +382,6 @@ mod tests { } } - #[test] fn test_azdls_create_operator() { let test_cases = vec![ @@ -489,7 +483,6 @@ mod tests { } } - #[test] fn test_azure_storage_path_parse() { let test_cases = vec![ From 92fdca51f5dfa9273f93528963cd05d6cce7df43 Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Wed, 4 Feb 2026 17:09:30 -0800 Subject: [PATCH 03/13] clean up imports --- crates/iceberg/src/io/storage/local_fs.rs | 6 ++++-- crates/iceberg/src/io/storage/memory.rs | 6 ++++-- crates/iceberg/src/io/storage/opendal/azdls.rs | 2 +- crates/iceberg/src/io/storage/opendal/gcs.rs | 5 ++--- crates/iceberg/src/io/storage/opendal/mod.rs | 6 ++++-- crates/iceberg/src/io/storage/opendal/oss.rs | 2 +- crates/iceberg/src/io/storage/opendal/s3.rs | 5 ++--- 7 files changed, 18 insertions(+), 14 deletions(-) diff --git a/crates/iceberg/src/io/storage/local_fs.rs b/crates/iceberg/src/io/storage/local_fs.rs index 134242d031..d6dd5b433b 100644 --- a/crates/iceberg/src/io/storage/local_fs.rs +++ b/crates/iceberg/src/io/storage/local_fs.rs @@ -31,8 +31,10 @@ use async_trait::async_trait; use bytes::Bytes; use serde::{Deserialize, Serialize}; -use super::{Storage, StorageConfig, StorageFactory}; -use crate::io::{FileMetadata, FileRead, FileWrite, InputFile, OutputFile}; +use crate::io::{ + FileMetadata, FileRead, FileWrite, InputFile, OutputFile, Storage, StorageConfig, + StorageFactory, +}; use crate::{Error, ErrorKind, Result}; /// Local filesystem storage implementation. diff --git a/crates/iceberg/src/io/storage/memory.rs b/crates/iceberg/src/io/storage/memory.rs index ef1168b17a..cb01ee4709 100644 --- a/crates/iceberg/src/io/storage/memory.rs +++ b/crates/iceberg/src/io/storage/memory.rs @@ -30,8 +30,10 @@ use async_trait::async_trait; use bytes::Bytes; use serde::{Deserialize, Serialize}; -use super::{Storage, StorageConfig, StorageFactory}; -use crate::io::{FileMetadata, FileRead, FileWrite, InputFile, OutputFile}; +use crate::io::{ + FileMetadata, FileRead, FileWrite, InputFile, OutputFile, Storage, StorageConfig, + StorageFactory, +}; use crate::{Error, ErrorKind, Result}; /// In-memory storage implementation. 
diff --git a/crates/iceberg/src/io/storage/opendal/azdls.rs b/crates/iceberg/src/io/storage/opendal/azdls.rs index 1d85aa3d3f..759ff301ea 100644 --- a/crates/iceberg/src/io/storage/opendal/azdls.rs +++ b/crates/iceberg/src/io/storage/opendal/azdls.rs @@ -24,7 +24,7 @@ use opendal::services::AzdlsConfig; use serde::{Deserialize, Serialize}; use url::Url; -use crate::io::storage::config::{ +use crate::io::{ ADLS_ACCOUNT_KEY, ADLS_ACCOUNT_NAME, ADLS_AUTHORITY_HOST, ADLS_CLIENT_ID, ADLS_CLIENT_SECRET, ADLS_CONNECTION_STRING, ADLS_SAS_TOKEN, ADLS_TENANT_ID, }; diff --git a/crates/iceberg/src/io/storage/opendal/gcs.rs b/crates/iceberg/src/io/storage/opendal/gcs.rs index a21e89fad9..4cb8aa8591 100644 --- a/crates/iceberg/src/io/storage/opendal/gcs.rs +++ b/crates/iceberg/src/io/storage/opendal/gcs.rs @@ -22,10 +22,9 @@ use opendal::Operator; use opendal::services::GcsConfig; use url::Url; -use crate::io::is_truthy; -use crate::io::storage::config::{ +use crate::io::{ GCS_ALLOW_ANONYMOUS, GCS_CREDENTIALS_JSON, GCS_DISABLE_CONFIG_LOAD, GCS_DISABLE_VM_METADATA, - GCS_NO_AUTH, GCS_SERVICE_PATH, GCS_TOKEN, + GCS_NO_AUTH, GCS_SERVICE_PATH, GCS_TOKEN, is_truthy, }; use crate::{Error, ErrorKind, Result}; diff --git a/crates/iceberg/src/io/storage/opendal/mod.rs b/crates/iceberg/src/io/storage/opendal/mod.rs index 4f342a73eb..ee0df20fce 100644 --- a/crates/iceberg/src/io/storage/opendal/mod.rs +++ b/crates/iceberg/src/io/storage/opendal/mod.rs @@ -38,8 +38,10 @@ use opendal::{Operator, Scheme}; pub use s3::CustomAwsCredentialLoader; use serde::{Deserialize, Serialize}; -use super::{Storage, StorageConfig, StorageFactory}; -use crate::io::{FileIOBuilder, FileMetadata, FileRead, FileWrite, InputFile, OutputFile}; +use crate::io::{ + FileIOBuilder, FileMetadata, FileRead, FileWrite, InputFile, OutputFile, Storage, + StorageConfig, StorageFactory, +}; use crate::{Error, ErrorKind, Result}; #[cfg(feature = "storage-azdls")] diff --git a/crates/iceberg/src/io/storage/opendal/oss.rs b/crates/iceberg/src/io/storage/opendal/oss.rs index 91a1c0a667..98b68f7cde 100644 --- a/crates/iceberg/src/io/storage/opendal/oss.rs +++ b/crates/iceberg/src/io/storage/opendal/oss.rs @@ -21,7 +21,7 @@ use opendal::services::OssConfig; use opendal::{Configurator, Operator}; use url::Url; -use crate::io::storage::config::{OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET, OSS_ENDPOINT}; +use crate::io::{OSS_ACCESS_KEY_ID, OSS_ACCESS_KEY_SECRET, OSS_ENDPOINT}; use crate::{Error, ErrorKind, Result}; /// Parse iceberg props to oss config. 
diff --git a/crates/iceberg/src/io/storage/opendal/s3.rs b/crates/iceberg/src/io/storage/opendal/s3.rs index 9ee30e06e7..6b4d64e3a6 100644 --- a/crates/iceberg/src/io/storage/opendal/s3.rs +++ b/crates/iceberg/src/io/storage/opendal/s3.rs @@ -25,12 +25,11 @@ pub use reqsign::{AwsCredential, AwsCredentialLoad}; use reqwest::Client; use url::Url; -use crate::io::is_truthy; -use crate::io::storage::config::{ +use crate::io::{ CLIENT_REGION, S3_ACCESS_KEY_ID, S3_ALLOW_ANONYMOUS, S3_ASSUME_ROLE_ARN, S3_ASSUME_ROLE_EXTERNAL_ID, S3_ASSUME_ROLE_SESSION_NAME, S3_DISABLE_CONFIG_LOAD, S3_DISABLE_EC2_METADATA, S3_ENDPOINT, S3_PATH_STYLE_ACCESS, S3_REGION, S3_SECRET_ACCESS_KEY, - S3_SESSION_TOKEN, S3_SSE_KEY, S3_SSE_MD5, S3_SSE_TYPE, + S3_SESSION_TOKEN, S3_SSE_KEY, S3_SSE_MD5, S3_SSE_TYPE, is_truthy, }; use crate::{Error, ErrorKind, Result}; From d806a5cacd4c773bec922db8b4f5911c943f0717 Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Thu, 5 Feb 2026 11:42:34 -0800 Subject: [PATCH 04/13] Integrate FileIO with Storage trait --- crates/catalog/glue/src/catalog.rs | 58 ++- crates/catalog/hms/src/catalog.rs | 61 ++-- crates/catalog/rest/src/catalog.rs | 91 ++--- crates/catalog/s3tables/src/catalog.rs | 59 ++- crates/catalog/sql/src/catalog.rs | 68 ++-- crates/iceberg/src/catalog/memory/catalog.rs | 50 ++- crates/iceberg/src/catalog/mod.rs | 26 ++ crates/iceberg/src/io/file_io.rs | 345 ++++++++++-------- crates/iceberg/src/io/mod.rs | 47 +-- crates/iceberg/src/io/storage/opendal/mod.rs | 67 +--- .../src/table/table_provider_factory.rs | 40 +- 11 files changed, 508 insertions(+), 404 deletions(-) diff --git a/crates/catalog/glue/src/catalog.rs b/crates/catalog/glue/src/catalog.rs index 37a7996f80..e31b3217a1 100644 --- a/crates/catalog/glue/src/catalog.rs +++ b/crates/catalog/glue/src/catalog.rs @@ -17,6 +17,7 @@ use std::collections::HashMap; use std::fmt::Debug; +use std::sync::Arc; use anyhow::anyhow; use async_trait::async_trait; @@ -24,7 +25,8 @@ use aws_sdk_glue::operation::create_table::CreateTableError; use aws_sdk_glue::operation::update_table::UpdateTableError; use aws_sdk_glue::types::TableInput; use iceberg::io::{ - FileIO, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, + FileIO, FileIOBuilder, LocalFsStorageFactory, StorageFactory, S3_ACCESS_KEY_ID, S3_ENDPOINT, + S3_REGION, S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, }; use iceberg::spec::{TableMetadata, TableMetadataBuilder}; use iceberg::table::Table; @@ -51,47 +53,58 @@ pub const GLUE_CATALOG_PROP_WAREHOUSE: &str = "warehouse"; /// Builder for [`GlueCatalog`]. 
#[derive(Debug)] -pub struct GlueCatalogBuilder(GlueCatalogConfig); +pub struct GlueCatalogBuilder { + config: GlueCatalogConfig, + storage_factory: Option>, +} impl Default for GlueCatalogBuilder { fn default() -> Self { - Self(GlueCatalogConfig { - name: None, - uri: None, - catalog_id: None, - warehouse: "".to_string(), - props: HashMap::new(), - }) + Self { + config: GlueCatalogConfig { + name: None, + uri: None, + catalog_id: None, + warehouse: "".to_string(), + props: HashMap::new(), + }, + storage_factory: None, + } } } impl CatalogBuilder for GlueCatalogBuilder { type C = GlueCatalog; + fn with_storage_factory(mut self, storage_factory: Arc) -> Self { + self.storage_factory = Some(storage_factory); + self + } + fn load( mut self, name: impl Into, props: HashMap, ) -> impl Future> + Send { - self.0.name = Some(name.into()); + self.config.name = Some(name.into()); if props.contains_key(GLUE_CATALOG_PROP_URI) { - self.0.uri = props.get(GLUE_CATALOG_PROP_URI).cloned() + self.config.uri = props.get(GLUE_CATALOG_PROP_URI).cloned() } if props.contains_key(GLUE_CATALOG_PROP_CATALOG_ID) { - self.0.catalog_id = props.get(GLUE_CATALOG_PROP_CATALOG_ID).cloned() + self.config.catalog_id = props.get(GLUE_CATALOG_PROP_CATALOG_ID).cloned() } if props.contains_key(GLUE_CATALOG_PROP_WAREHOUSE) { - self.0.warehouse = props + self.config.warehouse = props .get(GLUE_CATALOG_PROP_WAREHOUSE) .cloned() .unwrap_or_default(); } // Collect other remaining properties - self.0.props = props + self.config.props = props .into_iter() .filter(|(k, _)| { k != GLUE_CATALOG_PROP_URI @@ -101,20 +114,20 @@ impl CatalogBuilder for GlueCatalogBuilder { .collect(); async move { - if self.0.name.is_none() { + if self.config.name.is_none() { return Err(Error::new( ErrorKind::DataInvalid, "Catalog name is required", )); } - if self.0.warehouse.is_empty() { + if self.config.warehouse.is_empty() { return Err(Error::new( ErrorKind::DataInvalid, "Catalog warehouse is required", )); } - GlueCatalog::new(self.0).await + GlueCatalog::new(self.config, self.storage_factory).await } } } @@ -148,7 +161,10 @@ impl Debug for GlueCatalog { impl GlueCatalog { /// Create a new glue catalog - async fn new(config: GlueCatalogConfig) -> Result { + async fn new( + config: GlueCatalogConfig, + storage_factory: Option>, + ) -> Result { let sdk_config = create_sdk_config(&config.props, config.uri.as_ref()).await; let mut file_io_props = config.props.clone(); if !file_io_props.contains_key(S3_ACCESS_KEY_ID) @@ -182,9 +198,11 @@ impl GlueCatalog { let client = aws_sdk_glue::Client::new(&sdk_config); - let file_io = FileIO::from_path(&config.warehouse)? 
+ // Use provided factory or default to LocalFsStorageFactory + let factory = storage_factory.unwrap_or_else(|| Arc::new(LocalFsStorageFactory)); + let file_io = FileIOBuilder::new(factory) .with_props(file_io_props) - .build()?; + .build(); Ok(GlueCatalog { config, diff --git a/crates/catalog/hms/src/catalog.rs b/crates/catalog/hms/src/catalog.rs index b7d192210b..bf7a6925b6 100644 --- a/crates/catalog/hms/src/catalog.rs +++ b/crates/catalog/hms/src/catalog.rs @@ -18,6 +18,7 @@ use std::collections::HashMap; use std::fmt::{Debug, Formatter}; use std::net::ToSocketAddrs; +use std::sync::Arc; use anyhow::anyhow; use async_trait::async_trait; @@ -25,7 +26,7 @@ use hive_metastore::{ ThriftHiveMetastoreClient, ThriftHiveMetastoreClientBuilder, ThriftHiveMetastoreGetDatabaseException, ThriftHiveMetastoreGetTableException, }; -use iceberg::io::FileIO; +use iceberg::io::{FileIO, FileIOBuilder, LocalFsStorageFactory, StorageFactory}; use iceberg::spec::{TableMetadata, TableMetadataBuilder}; use iceberg::table::Table; use iceberg::{ @@ -50,38 +51,49 @@ pub const THRIFT_TRANSPORT_BUFFERED: &str = "buffered"; /// HMS Catalog warehouse location pub const HMS_CATALOG_PROP_WAREHOUSE: &str = "warehouse"; -/// Builder for [`RestCatalog`]. +/// Builder for [`HmsCatalog`]. #[derive(Debug)] -pub struct HmsCatalogBuilder(HmsCatalogConfig); +pub struct HmsCatalogBuilder { + config: HmsCatalogConfig, + storage_factory: Option>, +} impl Default for HmsCatalogBuilder { fn default() -> Self { - Self(HmsCatalogConfig { - name: None, - address: "".to_string(), - thrift_transport: HmsThriftTransport::default(), - warehouse: "".to_string(), - props: HashMap::new(), - }) + Self { + config: HmsCatalogConfig { + name: None, + address: "".to_string(), + thrift_transport: HmsThriftTransport::default(), + warehouse: "".to_string(), + props: HashMap::new(), + }, + storage_factory: None, + } } } impl CatalogBuilder for HmsCatalogBuilder { type C = HmsCatalog; + fn with_storage_factory(mut self, storage_factory: Arc) -> Self { + self.storage_factory = Some(storage_factory); + self + } + fn load( mut self, name: impl Into, props: HashMap, ) -> impl Future> + Send { - self.0.name = Some(name.into()); + self.config.name = Some(name.into()); if props.contains_key(HMS_CATALOG_PROP_URI) { - self.0.address = props.get(HMS_CATALOG_PROP_URI).cloned().unwrap_or_default(); + self.config.address = props.get(HMS_CATALOG_PROP_URI).cloned().unwrap_or_default(); } if let Some(tt) = props.get(HMS_CATALOG_PROP_THRIFT_TRANSPORT) { - self.0.thrift_transport = match tt.to_lowercase().as_str() { + self.config.thrift_transport = match tt.to_lowercase().as_str() { THRIFT_TRANSPORT_FRAMED => HmsThriftTransport::Framed, THRIFT_TRANSPORT_BUFFERED => HmsThriftTransport::Buffered, _ => HmsThriftTransport::default(), @@ -89,13 +101,13 @@ impl CatalogBuilder for HmsCatalogBuilder { } if props.contains_key(HMS_CATALOG_PROP_WAREHOUSE) { - self.0.warehouse = props + self.config.warehouse = props .get(HMS_CATALOG_PROP_WAREHOUSE) .cloned() .unwrap_or_default(); } - self.0.props = props + self.config.props = props .into_iter() .filter(|(k, _)| { k != HMS_CATALOG_PROP_URI @@ -105,23 +117,23 @@ impl CatalogBuilder for HmsCatalogBuilder { .collect(); let result = { - if self.0.name.is_none() { + if self.config.name.is_none() { Err(Error::new( ErrorKind::DataInvalid, "Catalog name is required", )) - } else if self.0.address.is_empty() { + } else if self.config.address.is_empty() { Err(Error::new( ErrorKind::DataInvalid, "Catalog address is required", )) - } else if 
self.0.warehouse.is_empty() { + } else if self.config.warehouse.is_empty() { Err(Error::new( ErrorKind::DataInvalid, "Catalog warehouse is required", )) } else { - HmsCatalog::new(self.0) + HmsCatalog::new(self.config, self.storage_factory) } }; @@ -169,7 +181,10 @@ impl Debug for HmsCatalog { impl HmsCatalog { /// Create a new hms catalog. - fn new(config: HmsCatalogConfig) -> Result { + fn new( + config: HmsCatalogConfig, + storage_factory: Option>, + ) -> Result { let address = config .address .as_str() @@ -194,9 +209,11 @@ impl HmsCatalog { .build(), }; - let file_io = FileIO::from_path(&config.warehouse)? + // Use provided factory or default to LocalFsStorageFactory + let factory = storage_factory.unwrap_or_else(|| Arc::new(LocalFsStorageFactory)); + let file_io = FileIOBuilder::new(factory) .with_props(&config.props) - .build()?; + .build(); Ok(Self { config, diff --git a/crates/catalog/rest/src/catalog.rs b/crates/catalog/rest/src/catalog.rs index ddbf6a4e01..8e216006da 100644 --- a/crates/catalog/rest/src/catalog.rs +++ b/crates/catalog/rest/src/catalog.rs @@ -17,13 +17,13 @@ //! This module contains the iceberg REST catalog implementation. -use std::any::Any; use std::collections::HashMap; use std::future::Future; use std::str::FromStr; +use std::sync::Arc; use async_trait::async_trait; -use iceberg::io::{self, FileIO}; +use iceberg::io::{FileIO, FileIOBuilder, LocalFsStorageFactory, StorageFactory}; use iceberg::table::Table; use iceberg::{ Catalog, CatalogBuilder, Error, ErrorKind, Namespace, NamespaceIdent, Result, TableCommit, @@ -57,60 +57,71 @@ const PATH_V1: &str = "v1"; /// Builder for [`RestCatalog`]. #[derive(Debug)] -pub struct RestCatalogBuilder(RestCatalogConfig); +pub struct RestCatalogBuilder { + config: RestCatalogConfig, + storage_factory: Option>, +} impl Default for RestCatalogBuilder { fn default() -> Self { - Self(RestCatalogConfig { - name: None, - uri: "".to_string(), - warehouse: None, - props: HashMap::new(), - client: None, - }) + Self { + config: RestCatalogConfig { + name: None, + uri: "".to_string(), + warehouse: None, + props: HashMap::new(), + client: None, + }, + storage_factory: None, + } } } impl CatalogBuilder for RestCatalogBuilder { type C = RestCatalog; + fn with_storage_factory(mut self, storage_factory: Arc) -> Self { + self.storage_factory = Some(storage_factory); + self + } + fn load( mut self, name: impl Into, props: HashMap, ) -> impl Future> + Send { - self.0.name = Some(name.into()); + self.config.name = Some(name.into()); if props.contains_key(REST_CATALOG_PROP_URI) { - self.0.uri = props + self.config.uri = props .get(REST_CATALOG_PROP_URI) .cloned() .unwrap_or_default(); } if props.contains_key(REST_CATALOG_PROP_WAREHOUSE) { - self.0.warehouse = props.get(REST_CATALOG_PROP_WAREHOUSE).cloned() + self.config.warehouse = props.get(REST_CATALOG_PROP_WAREHOUSE).cloned() } // Collect other remaining properties - self.0.props = props + self.config.props = props .into_iter() .filter(|(k, _)| k != REST_CATALOG_PROP_URI && k != REST_CATALOG_PROP_WAREHOUSE) .collect(); let result = { - if self.0.name.is_none() { + if self.config.name.is_none() { Err(Error::new( ErrorKind::DataInvalid, "Catalog name is required", )) - } else if self.0.uri.is_empty() { + } else if self.config.uri.is_empty() { Err(Error::new( ErrorKind::DataInvalid, "Catalog uri is required", )) } else { - Ok(RestCatalog::new(self.0)) + Ok(RestCatalog::new(self.config, self.storage_factory)) } }; @@ -121,7 +132,7 @@ impl CatalogBuilder for RestCatalogBuilder { impl 
RestCatalogBuilder { /// Configures the catalog with a custom HTTP client. pub fn with_client(mut self, client: Client) -> Self { - self.0.client = Some(client); + self.config.client = Some(client); self } } @@ -325,26 +336,20 @@ pub struct RestCatalog { /// It could be different from the config fetched from the server and used at runtime. user_config: RestCatalogConfig, ctx: OnceCell, - /// Extensions for the FileIOBuilder. - file_io_extensions: io::Extensions, + /// Storage factory for creating FileIO instances. + storage_factory: Option>, } impl RestCatalog { /// Creates a `RestCatalog` from a [`RestCatalogConfig`]. - fn new(config: RestCatalogConfig) -> Self { + fn new(config: RestCatalogConfig, storage_factory: Option>) -> Self { Self { user_config: config, ctx: OnceCell::new(), - file_io_extensions: io::Extensions::default(), + storage_factory, } } - /// Add an extension to the file IO builder. - pub fn with_file_io_extension(mut self, ext: T) -> Self { - self.file_io_extensions.add(ext); - self - } - /// Gets the [`RestContext`] from the catalog. async fn context(&self) -> Result<&RestContext> { self.ctx @@ -400,18 +405,20 @@ impl RestCatalog { None => None, }; - let file_io = match metadata_location.or(warehouse_path) { - Some(url) => FileIO::from_path(url)? - .with_props(props) - .with_extensions(self.file_io_extensions.clone()) - .build()?, - None => { - return Err(Error::new( - ErrorKind::Unexpected, - "Unable to load file io, neither warehouse nor metadata location is set!", - ))?; - } - }; + if metadata_location.or(warehouse_path).is_none() { + return Err(Error::new( + ErrorKind::Unexpected, + "Unable to load file io, neither warehouse nor metadata location is set!", + )); + } + + // Use provided factory or default to LocalFsStorageFactory + let factory = self + .storage_factory + .clone() + .unwrap_or_else(|| Arc::new(LocalFsStorageFactory)); + + let file_io = FileIOBuilder::new(factory).with_props(props).build(); Ok(file_io) } @@ -2422,7 +2429,7 @@ mod tests { .metadata(resp.metadata) .metadata_location(resp.metadata_location.unwrap()) .identifier(TableIdent::from_strs(["ns1", "test1"]).unwrap()) - .file_io(FileIO::from_path("/tmp").unwrap().build().unwrap()) + .file_io(FileIO::new_with_fs()) .build() .unwrap() }; @@ -2562,7 +2569,7 @@ mod tests { .metadata(resp.metadata) .metadata_location(resp.metadata_location.unwrap()) .identifier(TableIdent::from_strs(["ns1", "test1"]).unwrap()) - .file_io(FileIO::from_path("/tmp").unwrap().build().unwrap()) + .file_io(FileIO::new_with_fs()) .build() .unwrap() }; diff --git a/crates/catalog/s3tables/src/catalog.rs b/crates/catalog/s3tables/src/catalog.rs index 3606fac99a..2ba80e1fb5 100644 --- a/crates/catalog/s3tables/src/catalog.rs +++ b/crates/catalog/s3tables/src/catalog.rs @@ -17,6 +17,7 @@ use std::collections::HashMap; use std::future::Future; +use std::sync::Arc; use async_trait::async_trait; use aws_sdk_s3tables::operation::create_table::CreateTableOutput; @@ -25,7 +26,7 @@ use aws_sdk_s3tables::operation::get_table::GetTableOutput; use aws_sdk_s3tables::operation::list_tables::ListTablesOutput; use aws_sdk_s3tables::operation::update_table_metadata_location::UpdateTableMetadataLocationError; use aws_sdk_s3tables::types::OpenTableFormat; -use iceberg::io::{FileIO, FileIOBuilder}; +use iceberg::io::{FileIO, FileIOBuilder, LocalFsStorageFactory, StorageFactory}; use iceberg::spec::{TableMetadata, TableMetadataBuilder}; use iceberg::table::Table; use iceberg::{ @@ -65,18 +66,24 @@ struct S3TablesCatalogConfig { /// Builder for 
[`S3TablesCatalog`]. #[derive(Debug)] -pub struct S3TablesCatalogBuilder(S3TablesCatalogConfig); +pub struct S3TablesCatalogBuilder { + config: S3TablesCatalogConfig, + storage_factory: Option>, +} /// Default builder for [`S3TablesCatalog`]. impl Default for S3TablesCatalogBuilder { fn default() -> Self { - Self(S3TablesCatalogConfig { - name: None, - table_bucket_arn: "".to_string(), - endpoint_url: None, - client: None, - props: HashMap::new(), - }) + Self { + config: S3TablesCatalogConfig { + name: None, + table_bucket_arn: "".to_string(), + endpoint_url: None, + client: None, + props: HashMap::new(), + }, + storage_factory: None, + } } } @@ -91,13 +98,13 @@ impl S3TablesCatalogBuilder { /// This follows the general pattern where properties specified in the `load()` method /// have higher priority than builder method configurations. pub fn with_endpoint_url(mut self, endpoint_url: impl Into) -> Self { - self.0.endpoint_url = Some(endpoint_url.into()); + self.config.endpoint_url = Some(endpoint_url.into()); self } /// Configure the catalog with a pre-built AWS SDK client. pub fn with_client(mut self, client: aws_sdk_s3tables::Client) -> Self { - self.0.client = Some(client); + self.config.client = Some(client); self } @@ -110,7 +117,7 @@ impl S3TablesCatalogBuilder { /// This follows the general pattern where properties specified in the `load()` method /// have higher priority than builder method configurations. pub fn with_table_bucket_arn(mut self, table_bucket_arn: impl Into) -> Self { - self.0.table_bucket_arn = table_bucket_arn.into(); + self.config.table_bucket_arn = table_bucket_arn.into(); self } } @@ -118,27 +125,32 @@ impl S3TablesCatalogBuilder { impl CatalogBuilder for S3TablesCatalogBuilder { type C = S3TablesCatalog; + fn with_storage_factory(mut self, storage_factory: Arc) -> Self { + self.storage_factory = Some(storage_factory); + self + } + fn load( mut self, name: impl Into, props: HashMap, ) -> impl Future> + Send { let catalog_name = name.into(); - self.0.name = Some(catalog_name.clone()); + self.config.name = Some(catalog_name.clone()); if props.contains_key(S3TABLES_CATALOG_PROP_TABLE_BUCKET_ARN) { - self.0.table_bucket_arn = props + self.config.table_bucket_arn = props .get(S3TABLES_CATALOG_PROP_TABLE_BUCKET_ARN) .cloned() .unwrap_or_default(); } if props.contains_key(S3TABLES_CATALOG_PROP_ENDPOINT_URL) { - self.0.endpoint_url = props.get(S3TABLES_CATALOG_PROP_ENDPOINT_URL).cloned(); + self.config.endpoint_url = props.get(S3TABLES_CATALOG_PROP_ENDPOINT_URL).cloned(); } // Collect other remaining properties - self.0.props = props + self.config.props = props .into_iter() .filter(|(k, _)| { k != S3TABLES_CATALOG_PROP_TABLE_BUCKET_ARN @@ -152,13 +164,13 @@ impl CatalogBuilder for S3TablesCatalogBuilder { ErrorKind::DataInvalid, "Catalog name cannot be empty", )) - } else if self.0.table_bucket_arn.is_empty() { + } else if self.config.table_bucket_arn.is_empty() { Err(Error::new( ErrorKind::DataInvalid, "Table bucket ARN is required", )) } else { - S3TablesCatalog::new(self.0).await + S3TablesCatalog::new(self.config, self.storage_factory).await } } } @@ -174,7 +186,10 @@ pub struct S3TablesCatalog { impl S3TablesCatalog { /// Creates a new S3Tables catalog. 
- async fn new(config: S3TablesCatalogConfig) -> Result { + async fn new( + config: S3TablesCatalogConfig, + storage_factory: Option>, + ) -> Result { let s3tables_client = if let Some(client) = config.client.clone() { client } else { @@ -182,7 +197,11 @@ impl S3TablesCatalog { aws_sdk_s3tables::Client::new(&aws_config) }; - let file_io = FileIOBuilder::new("s3").with_props(&config.props).build()?; + // Use provided factory or default to LocalFsStorageFactory + let factory = storage_factory.unwrap_or_else(|| Arc::new(LocalFsStorageFactory)); + let file_io = FileIOBuilder::new(factory) + .with_props(&config.props) + .build(); Ok(Self { config, diff --git a/crates/catalog/sql/src/catalog.rs b/crates/catalog/sql/src/catalog.rs index e3b8a17be3..6954edc6ed 100644 --- a/crates/catalog/sql/src/catalog.rs +++ b/crates/catalog/sql/src/catalog.rs @@ -17,10 +17,11 @@ use std::collections::{HashMap, HashSet}; use std::str::FromStr; +use std::sync::Arc; use std::time::Duration; use async_trait::async_trait; -use iceberg::io::FileIO; +use iceberg::io::{FileIO, FileIOBuilder, LocalFsStorageFactory, StorageFactory}; use iceberg::spec::{TableMetadata, TableMetadataBuilder}; use iceberg::table::Table; use iceberg::{ @@ -63,17 +64,23 @@ static TEST_BEFORE_ACQUIRE: bool = true; // Default the health-check of each con /// Builder for [`SqlCatalog`] #[derive(Debug)] -pub struct SqlCatalogBuilder(SqlCatalogConfig); +pub struct SqlCatalogBuilder { + config: SqlCatalogConfig, + storage_factory: Option>, +} impl Default for SqlCatalogBuilder { fn default() -> Self { - Self(SqlCatalogConfig { - uri: "".to_string(), - name: "".to_string(), - warehouse_location: "".to_string(), - sql_bind_style: SqlBindStyle::DollarNumeric, - props: HashMap::new(), - }) + Self { + config: SqlCatalogConfig { + uri: "".to_string(), + name: "".to_string(), + warehouse_location: "".to_string(), + sql_bind_style: SqlBindStyle::DollarNumeric, + props: HashMap::new(), + }, + storage_factory: None, + } } } @@ -83,7 +90,7 @@ impl SqlCatalogBuilder { /// If `SQL_CATALOG_PROP_URI` has a value set in `props` during `SqlCatalogBuilder::load`, /// that value takes precedence, and the value specified by this method will not be used. pub fn uri(mut self, uri: impl Into) -> Self { - self.0.uri = uri.into(); + self.config.uri = uri.into(); self } @@ -92,7 +99,7 @@ impl SqlCatalogBuilder { /// If `SQL_CATALOG_PROP_WAREHOUSE` has a value set in `props` during `SqlCatalogBuilder::load`, /// that value takes precedence, and the value specified by this method will not be used. pub fn warehouse_location(mut self, location: impl Into) -> Self { - self.0.warehouse_location = location.into(); + self.config.warehouse_location = location.into(); self } @@ -101,7 +108,7 @@ impl SqlCatalogBuilder { /// If `SQL_CATALOG_PROP_BIND_STYLE` has a value set in `props` during `SqlCatalogBuilder::load`, /// that value takes precedence, and the value specified by this method will not be used. pub fn sql_bind_style(mut self, sql_bind_style: SqlBindStyle) -> Self { - self.0.sql_bind_style = sql_bind_style; + self.config.sql_bind_style = sql_bind_style; self } @@ -111,7 +118,7 @@ impl SqlCatalogBuilder { /// those values will take precedence. pub fn props(mut self, props: HashMap) -> Self { for (k, v) in props { - self.0.props.insert(k, v); + self.config.props.insert(k, v); } self } @@ -123,7 +130,7 @@ impl SqlCatalogBuilder { /// If the same key has values set in `props` during `SqlCatalogBuilder::load`, /// those values will take precedence. 
pub fn prop(mut self, key: impl Into, value: impl Into) -> Self { - self.0.props.insert(key.into(), value.into()); + self.config.props.insert(key.into(), value.into()); self } } @@ -131,28 +138,33 @@ impl SqlCatalogBuilder { impl CatalogBuilder for SqlCatalogBuilder { type C = SqlCatalog; + fn with_storage_factory(mut self, storage_factory: Arc) -> Self { + self.storage_factory = Some(storage_factory); + self + } + fn load( mut self, name: impl Into, props: HashMap, ) -> impl Future> + Send { for (k, v) in props { - self.0.props.insert(k, v); + self.config.props.insert(k, v); } - if let Some(uri) = self.0.props.remove(SQL_CATALOG_PROP_URI) { - self.0.uri = uri; + if let Some(uri) = self.config.props.remove(SQL_CATALOG_PROP_URI) { + self.config.uri = uri; } - if let Some(warehouse_location) = self.0.props.remove(SQL_CATALOG_PROP_WAREHOUSE) { - self.0.warehouse_location = warehouse_location; + if let Some(warehouse_location) = self.config.props.remove(SQL_CATALOG_PROP_WAREHOUSE) { + self.config.warehouse_location = warehouse_location; } let name = name.into(); let mut valid_sql_bind_style = true; - if let Some(sql_bind_style) = self.0.props.remove(SQL_CATALOG_PROP_BIND_STYLE) { + if let Some(sql_bind_style) = self.config.props.remove(SQL_CATALOG_PROP_BIND_STYLE) { if let Ok(sql_bind_style) = SqlBindStyle::from_str(&sql_bind_style) { - self.0.sql_bind_style = sql_bind_style; + self.config.sql_bind_style = sql_bind_style; } else { valid_sql_bind_style = false; } @@ -177,8 +189,8 @@ impl CatalogBuilder for SqlCatalogBuilder { ), )) } else { - self.0.name = name; - SqlCatalog::new(self.0).await + self.config.name = name; + SqlCatalog::new(self.config, self.storage_factory).await } } } @@ -222,8 +234,14 @@ pub enum SqlBindStyle { impl SqlCatalog { /// Create new sql catalog instance - async fn new(config: SqlCatalogConfig) -> Result { - let fileio = FileIO::from_path(&config.warehouse_location)?.build()?; + async fn new( + config: SqlCatalogConfig, + storage_factory: Option>, + ) -> Result { + // Use provided factory or default to LocalFsStorageFactory + let factory = storage_factory.unwrap_or_else(|| Arc::new(LocalFsStorageFactory)); + let fileio = FileIOBuilder::new(factory).build(); + install_default_drivers(); let max_connections: u32 = config .props diff --git a/crates/iceberg/src/catalog/memory/catalog.rs b/crates/iceberg/src/catalog/memory/catalog.rs index df0299acb2..5a69abf57d 100644 --- a/crates/iceberg/src/catalog/memory/catalog.rs +++ b/crates/iceberg/src/catalog/memory/catalog.rs @@ -18,13 +18,14 @@ //! This module contains memory catalog implementation. use std::collections::HashMap; +use std::sync::Arc; use async_trait::async_trait; use futures::lock::{Mutex, MutexGuard}; use itertools::Itertools; use super::namespace_state::NamespaceState; -use crate::io::FileIO; +use crate::io::{FileIO, FileIOBuilder, MemoryStorageFactory}; use crate::spec::{TableMetadata, TableMetadataBuilder}; use crate::table::Table; use crate::{ @@ -40,54 +41,65 @@ const LOCATION: &str = "location"; /// Builder for [`MemoryCatalog`]. 
#[derive(Debug)] -pub struct MemoryCatalogBuilder(MemoryCatalogConfig); +pub struct MemoryCatalogBuilder { + config: MemoryCatalogConfig, + storage_factory: Option<Arc<dyn StorageFactory>>, +} impl Default for MemoryCatalogBuilder { fn default() -> Self { - Self(MemoryCatalogConfig { - name: None, - warehouse: "".to_string(), - props: HashMap::new(), - }) + Self { + config: MemoryCatalogConfig { + name: None, + warehouse: "".to_string(), + props: HashMap::new(), + }, + storage_factory: None, + } } } impl CatalogBuilder for MemoryCatalogBuilder { type C = MemoryCatalog; + fn with_storage_factory(mut self, storage_factory: Arc<dyn StorageFactory>) -> Self { + self.storage_factory = Some(storage_factory); + self + } + fn load( mut self, name: impl Into<String>, props: HashMap<String, String>, ) -> impl Future<Output = Result<Self::C>> + Send { - self.0.name = Some(name.into()); + self.config.name = Some(name.into()); if props.contains_key(MEMORY_CATALOG_WAREHOUSE) { - self.0.warehouse = props + self.config.warehouse = props .get(MEMORY_CATALOG_WAREHOUSE) .cloned() .unwrap_or_default() } // Collect other remaining properties - self.0.props = props + self.config.props = props .into_iter() .filter(|(k, _)| k != MEMORY_CATALOG_WAREHOUSE) .collect(); let result = { - if self.0.name.is_none() { + if self.config.name.is_none() { Err(Error::new( ErrorKind::DataInvalid, "Catalog name is required", )) - } else if self.0.warehouse.is_empty() { + } else if self.config.warehouse.is_empty() { Err(Error::new( ErrorKind::DataInvalid, "Catalog warehouse is required", )) } else { - MemoryCatalog::new(self.0) + MemoryCatalog::new(self.config, self.storage_factory) } }; @@ -112,12 +124,18 @@ pub struct MemoryCatalog { impl MemoryCatalog { /// Creates a memory catalog. - fn new(config: MemoryCatalogConfig) -> Result<Self> { + fn new( + config: MemoryCatalogConfig, + storage_factory: Option<Arc<dyn StorageFactory>>, + ) -> Result<Self> { + // Use provided factory or default to MemoryStorageFactory + let factory = storage_factory.unwrap_or_else(|| Arc::new(MemoryStorageFactory)); + Ok(Self { root_namespace_state: Mutex::new(NamespaceState::default()), - file_io: FileIO::from_path(&config.warehouse)? + file_io: FileIOBuilder::new(factory) .with_props(config.props) - .build()?, + .build(), warehouse_location: config.warehouse, }) } diff --git a/crates/iceberg/src/catalog/mod.rs b/crates/iceberg/src/catalog/mod.rs index f3a521379e..d12e3d38e7 100644 --- a/crates/iceberg/src/catalog/mod.rs +++ b/crates/iceberg/src/catalog/mod.rs @@ -114,6 +114,32 @@ pub trait Catalog: Debug + Sync + Send { pub trait CatalogBuilder: Default + Debug + Send + Sync { /// The catalog type that this builder creates. type C: Catalog; + + /// Set a custom StorageFactory to use for storage operations. + /// + /// When a StorageFactory is provided, the catalog will use it to build FileIO + /// instances for all storage operations instead of using the default factory. + /// + /// # Arguments + /// + /// * `storage_factory` - The StorageFactory to use for creating storage instances + /// + /// # Example + /// + /// ```rust,ignore + /// use iceberg::CatalogBuilder; + /// use iceberg::io::{OpenDalStorageFactory, StorageFactory}; + /// use std::sync::Arc; + /// + /// let catalog = MyCatalogBuilder::default() + /// .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { + /// customized_credential_load: None, + /// })) + /// .load("my_catalog", props) + /// .await?; + /// ``` + fn with_storage_factory(self, storage_factory: Arc<dyn StorageFactory>) -> Self; + /// Create a new catalog instance.
fn load( self, diff --git a/crates/iceberg/src/io/file_io.rs b/crates/iceberg/src/io/file_io.rs index 267f295369..33eba11f0e 100644 --- a/crates/iceberg/src/io/file_io.rs +++ b/crates/iceberg/src/io/file_io.rs @@ -15,71 +15,111 @@ // specific language governing permissions and limitations // under the License. -use std::any::{Any, TypeId}; -use std::collections::HashMap; use std::ops::Range; -use std::sync::Arc; +use std::sync::{Arc, OnceLock}; use bytes::Bytes; -use url::Url; -use super::storage::{OpenDalStorage, Storage}; -use crate::{Error, ErrorKind, Result}; +use super::storage::{ + LocalFsStorageFactory, MemoryStorageFactory, Storage, StorageConfig, StorageFactory, +}; +use crate::Result; /// FileIO implementation, used to manipulate files in underlying storage. /// +/// FileIO wraps a `dyn Storage` with lazy initialization via `StorageFactory`. +/// The storage is created on first use and cached for subsequent operations. +/// /// # Note /// -/// All path passed to `FileIO` must be absolute path starting with scheme string used to construct `FileIO`. -/// For example, if you construct `FileIO` with `s3a` scheme, then all path passed to `FileIO` must start with `s3a://`. +/// All paths passed to `FileIO` must be absolute paths starting with the scheme string +/// appropriate for the storage backend being used. +/// +/// # Example +/// +/// ```rust,ignore +/// use iceberg::io::{FileIO, FileIOBuilder}; +/// use iceberg::io::{LocalFsStorageFactory, MemoryStorageFactory}; +/// use std::sync::Arc; /// -/// Supported storages: +/// // Create FileIO with memory storage for testing +/// let file_io = FileIO::new_with_memory(); /// -/// | Storage | Feature Flag | Expected Path Format | Schemes | -/// |--------------------|-------------------|----------------------------------| ------------------------------| -/// | Local file system | `storage-fs` | `file` | `file://path/to/file` | -/// | Memory | `storage-memory` | `memory` | `memory://path/to/file` | -/// | S3 | `storage-s3` | `s3`, `s3a` | `s3:///path/to/file` | -/// | GCS | `storage-gcs` | `gs`, `gcs` | `gs:///path/to/file` | -/// | OSS | `storage-oss` | `oss` | `oss:///path/to/file` | -/// | Azure Datalake | `storage-azdls` | `abfs`, `abfss`, `wasb`, `wasbs` | `abfs://@.dfs.core.windows.net/path/to/file` or `wasb://@.blob.core.windows.net/path/to/file` | +/// // Create FileIO with local filesystem storage +/// let file_io = FileIO::new_with_fs(); +/// +/// // Create FileIO with custom factory +/// let file_io = FileIOBuilder::new(Arc::new(LocalFsStorageFactory)) +/// .with_prop("key", "value") +/// .build(); +/// ``` #[derive(Clone, Debug)] pub struct FileIO { - builder: FileIOBuilder, - - inner: Arc, + /// Storage configuration containing properties + config: StorageConfig, + /// Factory for creating storage instances + factory: Arc, + /// Cached storage instance (lazily initialized) + storage: Arc>>, } impl FileIO { - /// Convert FileIO into [`FileIOBuilder`] which used to build this FileIO. + /// Create a new FileIO backed by in-memory storage. /// - /// This function is useful when you want serialize and deserialize FileIO across - /// distributed systems. - pub fn into_builder(self) -> FileIOBuilder { - self.builder + /// This is useful for testing scenarios where persistent storage is not needed. + pub fn new_with_memory() -> Self { + Self { + config: StorageConfig::new(), + factory: Arc::new(MemoryStorageFactory), + storage: Arc::new(OnceLock::new()), + } } - /// Try to infer file io scheme from path. 
See [`FileIO`] for supported schemes. + /// Create a new FileIO backed by local filesystem storage. /// - /// - If it's a valid url, for example `s3://bucket/a`, url scheme will be used, and the rest of the url will be ignored. - /// - If it's not a valid url, will try to detect if it's a file path. + /// This is useful for local development and testing with real files. + pub fn new_with_fs() -> Self { + Self { + config: StorageConfig::new(), + factory: Arc::new(LocalFsStorageFactory), + storage: Arc::new(OnceLock::new()), + } + } + + /// Convert FileIO into [`FileIOBuilder`] which can be used to modify configuration. + /// + /// This function is useful when you want to serialize and deserialize FileIO across + /// distributed systems, or when you need to create a new FileIO with modified settings. + pub fn into_builder(self) -> FileIOBuilder { + FileIOBuilder { + factory: self.factory, + config: self.config, + } + } + + /// Get the storage configuration. + pub fn config(&self) -> &StorageConfig { + &self.config + } + + /// Get or create the storage instance. /// - /// Otherwise will return parsing error. - pub fn from_path(path: impl AsRef) -> crate::Result { - let url = Url::parse(path.as_ref()) - .map_err(Error::from) - .or_else(|e| { - Url::from_file_path(path.as_ref()).map_err(|_| { - Error::new( - ErrorKind::DataInvalid, - "Input is neither a valid url nor path", - ) - .with_context("input", path.as_ref().to_string()) - .with_source(e) - }) - })?; + /// The factory is invoked on first access and the result is cached + /// for all subsequent operations. + fn get_storage(&self) -> Result> { + // Check if already initialized + if let Some(storage) = self.storage.get() { + return Ok(storage.clone()); + } + + // Build the storage + let storage = self.factory.build(&self.config)?; + + // Try to set it (another thread might have set it first) + let _ = self.storage.set(storage.clone()); - Ok(FileIOBuilder::new(url.scheme())) + // Return whatever is in the cell (either ours or another thread's) + Ok(self.storage.get().unwrap().clone()) } /// Deletes file. @@ -88,7 +128,7 @@ impl FileIO { /// /// * path: It should be *absolute* path starting with scheme string used to construct [`FileIO`]. pub async fn delete(&self, path: impl AsRef) -> Result<()> { - self.inner.delete(path.as_ref()).await + self.get_storage()?.delete(path.as_ref()).await } /// Remove the path and all nested dirs and files recursively. @@ -103,7 +143,7 @@ impl FileIO { /// - If the path is a empty directory, this function will remove the directory itself. /// - If the path is a non-empty directory, this function will remove the directory and all nested files and directories. pub async fn delete_prefix(&self, path: impl AsRef) -> Result<()> { - self.inner.delete_prefix(path.as_ref()).await + self.get_storage()?.delete_prefix(path.as_ref()).await } /// Check file exists. @@ -112,7 +152,7 @@ impl FileIO { /// /// * path: It should be *absolute* path starting with scheme string used to construct [`FileIO`]. pub async fn exists(&self, path: impl AsRef) -> Result { - self.inner.exists(path.as_ref()).await + self.get_storage()?.exists(path.as_ref()).await } /// Creates input file. @@ -121,7 +161,7 @@ impl FileIO { /// /// * path: It should be *absolute* path starting with scheme string used to construct [`FileIO`]. pub fn new_input(&self, path: impl AsRef) -> Result { - self.inner.new_input(path.as_ref()) + self.get_storage()?.new_input(path.as_ref()) } /// Creates output file. 
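A minimal usage sketch of the lazily initialized `FileIO`: nothing touches the factory until the first I/O call, at which point `get_storage` builds and caches the backend. The `write`/`read` calls on the output/input files follow the pre-existing FileIO API and are assumed unchanged by this patch; the path format simply mirrors the memory test further down.

```rust
use bytes::Bytes;
use iceberg::io::FileIO;

async fn roundtrip() -> iceberg::Result<()> {
    // No storage exists yet; MemoryStorageFactory runs on the first call below.
    let io = FileIO::new_with_memory();
    let path = "/demo/hello.txt";

    // The first operation triggers get_storage() and caches the backend.
    io.new_output(path)?.write(Bytes::from("hello")).await?;

    assert!(io.exists(path).await?);
    assert_eq!(io.new_input(path)?.read().await?, Bytes::from("hello"));

    io.delete(path).await?;
    Ok(())
}
```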
@@ -130,120 +170,76 @@ impl FileIO { /// /// * path: It should be *absolute* path starting with scheme string used to construct [`FileIO`]. pub fn new_output(&self, path: impl AsRef) -> Result { - self.inner.new_output(path.as_ref()) - } -} - -/// Container for storing type-safe extensions used to configure underlying FileIO behavior. -#[derive(Clone, Debug, Default)] -pub struct Extensions(HashMap>); - -impl Extensions { - /// Add an extension. - pub fn add(&mut self, ext: T) { - self.0.insert(TypeId::of::(), Arc::new(ext)); - } - - /// Extends the current set of extensions with another set of extensions. - pub fn extend(&mut self, extensions: Extensions) { - self.0.extend(extensions.0); - } - - /// Fetch an extension. - pub fn get(&self) -> Option> - where T: 'static + Send + Sync + Clone { - let type_id = TypeId::of::(); - self.0 - .get(&type_id) - .and_then(|arc_any| Arc::clone(arc_any).downcast::().ok()) + self.get_storage()?.new_output(path.as_ref()) } } /// Builder for [`FileIO`]. +/// +/// The builder accepts an explicit `StorageFactory` and configuration properties. +/// Storage is lazily initialized on first use. #[derive(Clone, Debug)] pub struct FileIOBuilder { - /// This is used to infer scheme of operator. - /// - /// If this is `None`, then [`FileIOBuilder::build`](FileIOBuilder::build) will build a local file io. - scheme_str: Option, - /// Arguments for operator. - props: HashMap, - /// Optional extensions to configure the underlying FileIO behavior. - extensions: Extensions, + /// Factory for creating storage instances + factory: Arc, + /// Storage configuration + config: StorageConfig, } impl FileIOBuilder { - /// Creates a new builder with scheme. - /// See [`FileIO`] for supported schemes. - pub fn new(scheme_str: impl ToString) -> Self { - Self { - scheme_str: Some(scheme_str.to_string()), - props: HashMap::default(), - extensions: Extensions::default(), - } - } - - /// Creates a new builder for local file io. - pub fn new_fs_io() -> Self { + /// Creates a new builder with the given storage factory. + /// + /// # Arguments + /// + /// * `factory` - The storage factory to use for creating storage instances + /// + /// # Example + /// + /// ```rust,ignore + /// use iceberg::io::{FileIOBuilder, LocalFsStorageFactory}; + /// use std::sync::Arc; + /// + /// let builder = FileIOBuilder::new(Arc::new(LocalFsStorageFactory)); + /// ``` + pub fn new(factory: Arc) -> Self { Self { - scheme_str: None, - props: HashMap::default(), - extensions: Extensions::default(), + factory, + config: StorageConfig::new(), } } - /// Fetch the scheme string. - /// - /// The scheme_str will be empty if it's None. - pub fn into_parts(self) -> (String, HashMap, Extensions) { - ( - self.scheme_str.unwrap_or_default(), - self.props, - self.extensions, - ) - } - - /// Add argument for operator. + /// Add a configuration property. pub fn with_prop(mut self, key: impl ToString, value: impl ToString) -> Self { - self.props.insert(key.to_string(), value.to_string()); + self.config = self.config.with_prop(key.to_string(), value.to_string()); self } - /// Add argument for operator. + /// Add multiple configuration properties. pub fn with_props( mut self, args: impl IntoIterator, ) -> Self { - self.props - .extend(args.into_iter().map(|e| (e.0.to_string(), e.1.to_string()))); - self - } - - /// Add an extension to the file IO builder. 
- pub fn with_extension(mut self, ext: T) -> Self { - self.extensions.add(ext); + self.config = self + .config + .with_props(args.into_iter().map(|e| (e.0.to_string(), e.1.to_string()))); self } - /// Adds multiple extensions to the file IO builder. - pub fn with_extensions(mut self, extensions: Extensions) -> Self { - self.extensions.extend(extensions); - self - } - - /// Fetch an extension from the file IO builder. - pub fn extension(&self) -> Option> - where T: 'static + Send + Sync + Clone { - self.extensions.get::() + /// Get the storage configuration. + pub fn config(&self) -> &StorageConfig { + &self.config } /// Builds [`FileIO`]. - pub fn build(self) -> Result { - let storage = OpenDalStorage::build(self.clone())?; - Ok(FileIO { - builder: self, - inner: Arc::new(storage), - }) + /// + /// Note: Storage is lazily initialized on first use, so this method + /// does not return a Result. + pub fn build(self) -> FileIO { + FileIO { + config: self.config, + factory: self.factory, + storage: Arc::new(OnceLock::new()), + } } } @@ -395,6 +391,7 @@ mod tests { use std::fs::{File, create_dir_all}; use std::io::Write; use std::path::Path; + use std::sync::Arc; use bytes::Bytes; use futures::AsyncReadExt; @@ -402,9 +399,10 @@ mod tests { use tempfile::TempDir; use super::{FileIO, FileIOBuilder}; + use crate::io::{LocalFsStorageFactory, MemoryStorageFactory}; fn create_local_file_io() -> FileIO { - FileIOBuilder::new_fs_io().build().unwrap() + FileIO::new_with_fs() } fn write_to_file>(s: &str, path: P) { @@ -434,7 +432,6 @@ mod tests { let input_file = file_io.new_input(&full_path).unwrap(); assert!(input_file.exists().await.unwrap()); - // Remove heading slash assert_eq!(&full_path, input_file.location()); let read_content = read_from_file(full_path).await; @@ -510,27 +507,9 @@ mod tests { assert_eq!(content, &read_content); } - #[test] - fn test_create_file_from_path() { - let io = FileIO::from_path("/tmp/a").unwrap(); - assert_eq!("file", io.scheme_str.unwrap().as_str()); - - let io = FileIO::from_path("file:/tmp/b").unwrap(); - assert_eq!("file", io.scheme_str.unwrap().as_str()); - - let io = FileIO::from_path("file:///tmp/c").unwrap(); - assert_eq!("file", io.scheme_str.unwrap().as_str()); - - let io = FileIO::from_path("s3://bucket/a").unwrap(); - assert_eq!("s3", io.scheme_str.unwrap().as_str()); - - let io = FileIO::from_path("tmp/||c"); - assert!(io.is_err()); - } - #[tokio::test] async fn test_memory_io() { - let io = FileIOBuilder::new("memory").build().unwrap(); + let io = FileIO::new_with_memory(); let path = format!("{}/1.txt", TempDir::new().unwrap().path().to_str().unwrap()); @@ -545,4 +524,60 @@ mod tests { io.delete(&path).await.unwrap(); assert!(!io.exists(&path).await.unwrap()); } + + #[tokio::test] + async fn test_file_io_builder_with_props() { + let factory = Arc::new(MemoryStorageFactory); + let file_io = FileIOBuilder::new(factory) + .with_prop("key1", "value1") + .with_prop("key2", "value2") + .build(); + + assert_eq!( + file_io.config().get("key1"), + Some(&"value1".to_string()) + ); + assert_eq!( + file_io.config().get("key2"), + Some(&"value2".to_string()) + ); + } + + #[tokio::test] + async fn test_file_io_builder_with_multiple_props() { + let factory = Arc::new(LocalFsStorageFactory); + let props = vec![("key1", "value1"), ("key2", "value2")]; + let file_io = FileIOBuilder::new(factory).with_props(props).build(); + + assert_eq!( + file_io.config().get("key1"), + Some(&"value1".to_string()) + ); + assert_eq!( + file_io.config().get("key2"), + 
Some(&"value2".to_string()) + ); + } + + #[tokio::test] + async fn test_file_io_into_builder() { + let factory = Arc::new(MemoryStorageFactory); + let file_io = FileIOBuilder::new(factory) + .with_prop("key", "value") + .build(); + + let builder = file_io.into_builder(); + assert_eq!(builder.config().get("key"), Some(&"value".to_string())); + + // Can build a new FileIO from the builder + let new_file_io = builder.with_prop("key2", "value2").build(); + assert_eq!( + new_file_io.config().get("key"), + Some(&"value".to_string()) + ); + assert_eq!( + new_file_io.config().get("key2"), + Some(&"value2".to_string()) + ); + } } diff --git a/crates/iceberg/src/io/mod.rs b/crates/iceberg/src/io/mod.rs index 07f40d104e..59a21c4b66 100644 --- a/crates/iceberg/src/io/mod.rs +++ b/crates/iceberg/src/io/mod.rs @@ -19,42 +19,24 @@ //! //! # How to build `FileIO` //! -//! We provided a `FileIOBuilder` to build `FileIO` from scratch. For example: +//! `FileIO` uses explicit `StorageFactory` injection for storage creation. +//! We provide convenience constructors for common use cases: //! -//! ```rust -//! use iceberg::Result; -//! use iceberg::io::{FileIOBuilder, S3_REGION}; +//! ```rust,ignore +//! use iceberg::io::{FileIO, FileIOBuilder}; +//! use iceberg::io::{LocalFsStorageFactory, MemoryStorageFactory}; +//! use std::sync::Arc; //! -//! # fn test() -> Result<()> { -//! // Build a memory file io. -//! let file_io = FileIOBuilder::new("memory").build()?; -//! // Build an fs file io. -//! let file_io = FileIOBuilder::new("fs").build()?; -//! // Build an s3 file io. -//! let file_io = FileIOBuilder::new("s3") -//! .with_prop(S3_REGION, "us-east-1") -//! .build()?; -//! # Ok(()) -//! # } -//! ``` -//! -//! Or you can pass a path to ask `FileIO` to infer schema for you: +//! // Build a memory file io for testing +//! let file_io = FileIO::new_with_memory(); //! -//! ```rust -//! use iceberg::Result; -//! use iceberg::io::{FileIO, S3_REGION}; +//! // Build a local filesystem file io +//! let file_io = FileIO::new_with_fs(); //! -//! # fn test() -> Result<()> { -//! // Build a memory file io. -//! let file_io = FileIO::from_path("memory:///")?.build()?; -//! // Build an fs file io. -//! let file_io = FileIO::from_path("fs:///tmp")?.build()?; -//! // Build an s3 file io. -//! let file_io = FileIO::from_path("s3://bucket/a")? -//! .with_prop(S3_REGION, "us-east-1") -//! .build()?; -//! # Ok(()) -//! # } +//! // Build with explicit factory and configuration +//! let file_io = FileIOBuilder::new(Arc::new(LocalFsStorageFactory)) +//! .with_prop("key", "value") +//! .build(); //! ``` //! //! # How to use `FileIO` @@ -62,6 +44,7 @@ //! Currently `FileIO` provides simple methods for file operations: //! //! - `delete`: Delete file. +//! - `delete_prefix`: Delete all files with a given prefix. //! - `exists`: Check if file exists. //! - `new_input`: Create input file for reading. //! - `new_output`: Create output file for writing. 
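For downstream users, the whole change boils down to the before/after below: the `before` lines are the constructors removed in this diff, the `after` lines are the ones the new module docs advertise.

```rust
use std::sync::Arc;

use iceberg::io::{FileIO, FileIOBuilder, LocalFsStorageFactory};

fn migrate() {
    // Before: scheme-inferred, fallible builds.
    // let file_io = FileIOBuilder::new("fs").build()?;
    // let file_io = FileIO::from_path("s3://bucket/a")?.build()?;

    // After: explicit factory, infallible build.
    let _fs_io = FileIO::new_with_fs();
    let _custom_io = FileIOBuilder::new(Arc::new(LocalFsStorageFactory))
        .with_prop("key", "value") // illustrative property
        .build();
}
```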
diff --git a/crates/iceberg/src/io/storage/opendal/mod.rs b/crates/iceberg/src/io/storage/opendal/mod.rs index ee0df20fce..e59b1a5b01 100644 --- a/crates/iceberg/src/io/storage/opendal/mod.rs +++ b/crates/iceberg/src/io/storage/opendal/mod.rs @@ -33,14 +33,14 @@ use opendal::services::GcsConfig; use opendal::services::OssConfig; #[cfg(feature = "storage-s3")] use opendal::services::S3Config; -use opendal::{Operator, Scheme}; +use opendal::Operator; #[cfg(feature = "storage-s3")] pub use s3::CustomAwsCredentialLoader; use serde::{Deserialize, Serialize}; use crate::io::{ - FileIOBuilder, FileMetadata, FileRead, FileWrite, InputFile, OutputFile, Storage, - StorageConfig, StorageFactory, + FileMetadata, FileRead, FileWrite, InputFile, OutputFile, Storage, StorageConfig, + StorageFactory, }; use crate::{Error, ErrorKind, Result}; @@ -73,8 +73,7 @@ pub use s3::*; /// OpenDAL-based storage factory. /// /// Maps scheme to the corresponding OpenDalStorage storage variant. -/// -/// TODO this is currently not used, we still use OpenDalStorage::build() for now +/// Use this factory with `FileIOBuilder::new(factory)` to create FileIO instances. #[derive(Clone, Debug, Serialize, Deserialize)] pub enum OpenDalStorageFactory { /// Memory storage factory. @@ -210,51 +209,6 @@ pub enum OpenDalStorage { } impl OpenDalStorage { - /// Convert iceberg config to opendal config. - /// - /// TODO Switch to use OpenDalStorageFactory::build() - pub(crate) fn build(file_io_builder: FileIOBuilder) -> Result { - let (scheme_str, props, extensions) = file_io_builder.into_parts(); - let _ = (&props, &extensions); - let scheme = Self::parse_scheme(&scheme_str)?; - - match scheme { - #[cfg(feature = "storage-memory")] - Scheme::Memory => Ok(Self::Memory(memory_config_build()?)), - #[cfg(feature = "storage-fs")] - Scheme::Fs => Ok(Self::LocalFs), - #[cfg(feature = "storage-s3")] - Scheme::S3 => Ok(Self::S3 { - configured_scheme: scheme_str, - config: s3_config_parse(props)?.into(), - customized_credential_load: extensions - .get::() - .map(Arc::unwrap_or_clone), - }), - #[cfg(feature = "storage-gcs")] - Scheme::Gcs => Ok(Self::Gcs { - config: gcs_config_parse(props)?.into(), - }), - #[cfg(feature = "storage-oss")] - Scheme::Oss => Ok(Self::Oss { - config: oss_config_parse(props)?.into(), - }), - #[cfg(feature = "storage-azdls")] - Scheme::Azdls => { - let scheme = scheme_str.parse::()?; - Ok(Self::Azdls { - config: azdls_config_parse(props)?.into(), - configured_scheme: scheme, - }) - } - // Update doc on [`FileIO`] when adding new schemes. - _ => Err(Error::new( - ErrorKind::FeatureUnsupported, - format!("Constructing file io from scheme: {scheme} not supported now",), - )), - } - } - /// Creates operator from path. /// /// # Arguments @@ -362,19 +316,6 @@ impl OpenDalStorage { let operator = operator.layer(RetryLayer::new()); Ok((operator, relative_path)) } - - /// Parse scheme. 
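Since `OpenDalStorage::build()` is removed below, an S3-backed `FileIO` would now be assembled roughly as follows (requires the `storage-s3` feature). The variant fields mirror the `rust,ignore` doc example earlier in this series, so the field list may be incomplete; `S3_REGION` is the existing property constant.

```rust
use std::sync::Arc;

use iceberg::io::{FileIOBuilder, OpenDalStorageFactory, S3_REGION};

fn s3_file_io() -> iceberg::io::FileIO {
    // Field list taken from the with_storage_factory doc example above;
    // additional fields on the S3 variant, if any, are omitted here.
    let factory = Arc::new(OpenDalStorageFactory::S3 {
        customized_credential_load: None,
    });

    FileIOBuilder::new(factory)
        .with_prop(S3_REGION, "us-east-1")
        .build()
}
```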
- fn parse_scheme(scheme: &str) -> Result { - match scheme { - "memory" => Ok(Scheme::Memory), - "file" | "" => Ok(Scheme::Fs), - "s3" | "s3a" => Ok(Scheme::S3), - "gs" | "gcs" => Ok(Scheme::Gcs), - "oss" => Ok(Scheme::Oss), - "abfss" | "abfs" | "wasbs" | "wasb" => Ok(Scheme::Azdls), - s => Ok(s.parse::()?), - } - } } #[typetag::serde(name = "OpenDalStorage")] diff --git a/crates/integrations/datafusion/src/table/table_provider_factory.rs b/crates/integrations/datafusion/src/table/table_provider_factory.rs index 8cae597b7b..9692340cac 100644 --- a/crates/integrations/datafusion/src/table/table_provider_factory.rs +++ b/crates/integrations/datafusion/src/table/table_provider_factory.rs @@ -24,7 +24,7 @@ use datafusion::catalog::{Session, TableProvider, TableProviderFactory}; use datafusion::error::Result as DFResult; use datafusion::logical_expr::CreateExternalTable; use datafusion::sql::TableReference; -use iceberg::io::FileIO; +use iceberg::io::{FileIOBuilder, LocalFsStorageFactory, StorageFactory}; use iceberg::table::StaticTable; use iceberg::{Error, ErrorKind, Result, TableIdent}; @@ -97,11 +97,22 @@ use crate::to_datafusion_error; /// An error will be returned if any unsupported feature, such as partition columns, /// order expressions, constraints, or column defaults, is detected in the table creation command. #[derive(Debug, Default)] -pub struct IcebergTableProviderFactory {} +pub struct IcebergTableProviderFactory { + storage_factory: Option>, +} impl IcebergTableProviderFactory { pub fn new() -> Self { - Self {} + Self { + storage_factory: None, + } + } + + /// Create a new factory with a custom storage factory for creating FileIO instances. + pub fn new_with_storage_factory(storage_factory: Arc) -> Self { + Self { + storage_factory: Some(storage_factory), + } } } @@ -120,10 +131,20 @@ impl TableProviderFactory for IcebergTableProviderFactory { let table_name_with_ns = complement_namespace_if_necessary(table_name); - let table = create_static_table(table_name_with_ns, metadata_file_path, options) - .await - .map_err(to_datafusion_error)? - .into_table(); + let storage_factory = self + .storage_factory + .clone() + .unwrap_or_else(|| Arc::new(LocalFsStorageFactory)); + + let table = create_static_table( + table_name_with_ns, + metadata_file_path, + options, + storage_factory, + ) + .await + .map_err(to_datafusion_error)? + .into_table(); let provider = IcebergStaticTableProvider::try_new_from_table(table) .await @@ -183,11 +204,12 @@ async fn create_static_table( table_name: Cow<'_, TableReference>, metadata_file_path: &str, props: &HashMap, + storage_factory: Arc, ) -> Result { let table_ident = TableIdent::from_strs(table_name.to_vec())?; - let file_io = FileIO::from_path(metadata_file_path)? 
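On the DataFusion side, the only new entry point is `new_with_storage_factory`; construction looks like the sketch below (the `iceberg_datafusion` export path is an assumption), and the resulting value is registered as a `TableProviderFactory` exactly as before, which this patch does not change.

```rust
use std::sync::Arc;

use iceberg::io::{LocalFsStorageFactory, StorageFactory};
use iceberg_datafusion::IcebergTableProviderFactory; // assumed export path

fn iceberg_table_factory() -> Arc<IcebergTableProviderFactory> {
    // Any StorageFactory works; LocalFsStorageFactory matches the default
    // used when no factory is supplied.
    let storage: Arc<dyn StorageFactory> = Arc::new(LocalFsStorageFactory);
    Arc::new(IcebergTableProviderFactory::new_with_storage_factory(storage))
}
```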
+ let file_io = FileIOBuilder::new(storage_factory) .with_props(props) - .build()?; + .build(); StaticTable::from_metadata_file(metadata_file_path, table_ident, file_io).await } From 08f05978ada2b04d474646232d778640a7e54983 Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Thu, 5 Feb 2026 12:08:53 -0800 Subject: [PATCH 05/13] update tests --- .../src/arrow/caching_delete_file_loader.rs | 21 +--- .../iceberg/src/arrow/delete_file_loader.rs | 5 +- crates/iceberg/src/arrow/delete_filter.rs | 5 +- crates/iceberg/src/arrow/reader.rs | 30 ++--- crates/iceberg/src/catalog/memory/catalog.rs | 4 +- crates/iceberg/src/catalog/mod.rs | 4 +- crates/iceberg/src/io/object_cache.rs | 5 +- crates/iceberg/src/puffin/metadata.rs | 4 +- crates/iceberg/src/puffin/test_utils.rs | 6 +- crates/iceberg/src/puffin/writer.rs | 4 +- crates/iceberg/src/scan/mod.rs | 15 +-- crates/iceberg/src/spec/manifest/mod.rs | 14 +-- crates/iceberg/src/spec/manifest/writer.rs | 6 +- crates/iceberg/src/spec/manifest_list.rs | 20 ++-- crates/iceberg/src/spec/table_metadata.rs | 8 +- .../src/spec/table_metadata_builder.rs | 6 +- crates/iceberg/src/table.rs | 20 +--- crates/iceberg/src/transaction/mod.rs | 8 +- .../writer/base_writer/data_file_writer.rs | 6 +- .../base_writer/equality_delete_writer.rs | 6 +- .../src/writer/file_writer/parquet_writer.rs | 22 ++-- .../src/writer/file_writer/rolling_writer.rs | 6 +- .../writer/partitioning/clustered_writer.rs | 8 +- .../src/writer/partitioning/fanout_writer.rs | 6 +- .../partitioning/unpartitioned_writer.rs | 4 +- crates/iceberg/tests/file_io_gcs_test.rs | 6 +- crates/iceberg/tests/file_io_s3_test.rs | 110 +++++++----------- 27 files changed, 147 insertions(+), 212 deletions(-) diff --git a/crates/iceberg/src/arrow/caching_delete_file_loader.rs b/crates/iceberg/src/arrow/caching_delete_file_loader.rs index 5d0b1da712..180d9dc551 100644 --- a/crates/iceberg/src/arrow/caching_delete_file_loader.rs +++ b/crates/iceberg/src/arrow/caching_delete_file_loader.rs @@ -608,7 +608,7 @@ mod tests { async fn test_delete_file_loader_parse_equality_deletes() { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().as_os_str().to_str().unwrap(); - let file_io = FileIO::from_path(table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let eq_delete_file_path = setup_write_equality_delete_file_1(table_location); @@ -721,10 +721,7 @@ mod tests { async fn test_caching_delete_file_loader_load_deletes() { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path(); - let file_io = FileIO::from_path(table_location.as_os_str().to_str().unwrap()) - .unwrap() - .build() - .unwrap(); + let file_io = FileIO::new_with_fs(); let delete_file_loader = CachingDeleteFileLoader::new(file_io.clone(), 10); @@ -807,7 +804,7 @@ mod tests { path }; - let file_io = FileIO::from_path(table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let basic_delete_file_loader = BasicDeleteFileLoader::new(file_io.clone()); let batch_stream = basic_delete_file_loader @@ -853,10 +850,7 @@ mod tests { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path(); - let file_io = FileIO::from_path(table_location.as_os_str().to_str().unwrap()) - .unwrap() - .build() - .unwrap(); + let file_io = FileIO::new_with_fs(); // Create the data file schema let data_file_schema = Arc::new( @@ -965,7 +959,7 @@ mod tests { async fn test_large_equality_delete_batch_stack_overflow() { let tmp_dir = TempDir::new().unwrap(); let table_location = 
tmp_dir.path().as_os_str().to_str().unwrap(); - let file_io = FileIO::from_path(table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); // Create a large batch of equality deletes let num_rows = 20_000; @@ -1012,10 +1006,7 @@ mod tests { async fn test_caching_delete_file_loader_caches_results() { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path(); - let file_io = FileIO::from_path(table_location.as_os_str().to_str().unwrap()) - .unwrap() - .build() - .unwrap(); + let file_io = FileIO::new_with_fs(); let delete_file_loader = CachingDeleteFileLoader::new(file_io.clone(), 10); diff --git a/crates/iceberg/src/arrow/delete_file_loader.rs b/crates/iceberg/src/arrow/delete_file_loader.rs index e12daf5324..2a8ffd5e11 100644 --- a/crates/iceberg/src/arrow/delete_file_loader.rs +++ b/crates/iceberg/src/arrow/delete_file_loader.rs @@ -125,10 +125,7 @@ mod tests { async fn test_basic_delete_file_loader_read_delete_file() { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path(); - let file_io = FileIO::from_path(table_location.as_os_str().to_str().unwrap()) - .unwrap() - .build() - .unwrap(); + let file_io = FileIO::new_with_fs(); let delete_file_loader = BasicDeleteFileLoader::new(file_io.clone()); diff --git a/crates/iceberg/src/arrow/delete_filter.rs b/crates/iceberg/src/arrow/delete_filter.rs index 4af9f6b6ff..9d1fc9f00e 100644 --- a/crates/iceberg/src/arrow/delete_filter.rs +++ b/crates/iceberg/src/arrow/delete_filter.rs @@ -290,10 +290,7 @@ pub(crate) mod tests { async fn test_delete_file_filter_load_deletes() { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path(); - let file_io = FileIO::from_path(table_location.as_os_str().to_str().unwrap()) - .unwrap() - .build() - .unwrap(); + let file_io = FileIO::new_with_fs(); let delete_file_loader = CachingDeleteFileLoader::new(file_io.clone(), 10); diff --git a/crates/iceberg/src/arrow/reader.rs b/crates/iceberg/src/arrow/reader.rs index 5b3a7bb862..c9ca014cd2 100644 --- a/crates/iceberg/src/arrow/reader.rs +++ b/crates/iceberg/src/arrow/reader.rs @@ -2153,7 +2153,7 @@ message schema { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let col = match col_a_type { DataType::Utf8 => Arc::new(StringArray::from(data_for_col_a)) as ArrayRef, @@ -2414,7 +2414,7 @@ message schema { row_group_2.compressed_size() ); - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let reader = ArrowReaderBuilder::new(file_io).build(); // Task 1: read only the first row group @@ -2547,7 +2547,7 @@ message schema { // Write old Parquet file with only column 'a' let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let data_a = Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef; let to_write = RecordBatch::try_new(arrow_schema_old.clone(), vec![data_a]).unwrap(); @@ -2727,7 +2727,7 @@ message schema { delete_writer.close().unwrap(); // Step 3: Read the data file with the delete applied - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let reader = ArrowReaderBuilder::new(file_io).build(); let task = FileScanTask { @@ 
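The remaining hunks in this commit repeat one mechanical substitution in test setup; for reference, the pattern is:

```rust
use iceberg::io::FileIO;
use tempfile::TempDir;

fn test_setup() -> (TempDir, FileIO) {
    let tmp_dir = TempDir::new().unwrap();

    // Before: scheme inferred from the temp path, fallible build.
    // let file_io = FileIO::from_path(tmp_dir.path().to_str().unwrap())
    //     .unwrap()
    //     .build()
    //     .unwrap();

    // After: explicit local-filesystem FileIO, infallible.
    let file_io = FileIO::new_with_fs();

    (tmp_dir, file_io)
}
```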
-2944,7 +2944,7 @@ message schema { row_group_1.compressed_size() ); - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let reader = ArrowReaderBuilder::new(file_io).build(); // Create FileScanTask that reads ONLY row group 1 via byte range filtering @@ -3155,7 +3155,7 @@ message schema { let rg1_start = rg0_start + row_group_0.compressed_size() as u64; let rg1_length = row_group_1.compressed_size() as u64; - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let reader = ArrowReaderBuilder::new(file_io).build(); // Create FileScanTask that reads ONLY row group 1 via byte range filtering @@ -3244,7 +3244,7 @@ message schema { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let name_data = vec!["Alice", "Bob", "Charlie"]; let age_data = vec![30, 25, 35]; @@ -3341,7 +3341,7 @@ message schema { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let col1_data = Arc::new(StringArray::from(vec!["a", "b"])) as ArrayRef; let col2_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; @@ -3432,7 +3432,7 @@ message schema { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; let age_data = Arc::new(Int32Array::from(vec![30, 25])) as ArrayRef; @@ -3524,7 +3524,7 @@ message schema { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); // Small row group size to create multiple row groups let props = WriterProperties::builder() @@ -3652,7 +3652,7 @@ message schema { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let id_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; let name_data = Arc::new(StringArray::from(vec!["Alice", "Bob"])) as ArrayRef; @@ -3761,7 +3761,7 @@ message schema { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let col0_data = Arc::new(Int32Array::from(vec![1, 2])) as ArrayRef; let col1_data = Arc::new(Int32Array::from(vec![10, 20])) as ArrayRef; @@ -3861,7 +3861,7 @@ message schema { let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); // Write data where all ids are >= 10 let id_data = Arc::new(Int32Array::from(vec![10, 11, 12])) as ArrayRef; @@ -3952,7 +3952,7 @@ message schema { let tmp_dir = TempDir::new().unwrap(); let table_location = 
tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); // Create 3 parquet files with different data let props = WriterProperties::builder() @@ -4171,7 +4171,7 @@ message schema { // Write Parquet file with data let tmp_dir = TempDir::new().unwrap(); let table_location = tmp_dir.path().to_str().unwrap().to_string(); - let file_io = FileIO::from_path(&table_location).unwrap().build().unwrap(); + let file_io = FileIO::new_with_fs(); let id_data = Arc::new(Int32Array::from(vec![1, 5, 9, 13])) as ArrayRef; let name_data = diff --git a/crates/iceberg/src/catalog/memory/catalog.rs b/crates/iceberg/src/catalog/memory/catalog.rs index 5a69abf57d..4b8dbf8fb4 100644 --- a/crates/iceberg/src/catalog/memory/catalog.rs +++ b/crates/iceberg/src/catalog/memory/catalog.rs @@ -409,7 +409,7 @@ pub(crate) mod tests { use tempfile::TempDir; use super::*; - use crate::io::FileIOBuilder; + use crate::io::FileIO; use crate::spec::{NestedField, PartitionSpec, PrimitiveType, Schema, SortOrder, Type}; use crate::transaction::{ApplyTransactionAction, Transaction}; @@ -1911,7 +1911,7 @@ pub(crate) mod tests { } fn build_table(ident: TableIdent) -> Table { - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let temp_dir = TempDir::new().unwrap(); let location = temp_dir.path().to_str().unwrap().to_string(); diff --git a/crates/iceberg/src/catalog/mod.rs b/crates/iceberg/src/catalog/mod.rs index d12e3d38e7..e17689099f 100644 --- a/crates/iceberg/src/catalog/mod.rs +++ b/crates/iceberg/src/catalog/mod.rs @@ -1052,7 +1052,7 @@ mod tests { use uuid::uuid; use super::ViewUpdate; - use crate::io::FileIOBuilder; + use crate::io::FileIO; use crate::spec::{ BlobMetadata, EncryptedKey, FormatVersion, MAIN_BRANCH, NestedField, NullOrder, Operation, PartitionStatisticsFile, PrimitiveType, Schema, Snapshot, SnapshotReference, @@ -2378,7 +2378,7 @@ mod tests { .metadata(resp) .metadata_location("s3://bucket/test/location/metadata/00000-8a62c37d-4573-4021-952a-c0baef7d21d0.metadata.json".to_string()) .identifier(TableIdent::from_strs(["ns1", "test1"]).unwrap()) - .file_io(FileIOBuilder::new("memory").build().unwrap()) + .file_io(FileIO::new_with_memory()) .build() .unwrap() }; diff --git a/crates/iceberg/src/io/object_cache.rs b/crates/iceberg/src/io/object_cache.rs index af297bebb5..8881471ae8 100644 --- a/crates/iceberg/src/io/object_cache.rs +++ b/crates/iceberg/src/io/object_cache.rs @@ -218,10 +218,7 @@ mod tests { let manifest_list2_location = table_location.join("metadata/manifests_list_2.avro"); let table_metadata1_location = table_location.join("metadata/v1.json"); - let file_io = FileIO::from_path(table_location.as_os_str().to_str().unwrap()) - .unwrap() - .build() - .unwrap(); + let file_io = FileIO::new_with_fs(); let table_metadata = { let template_json_str = fs::read_to_string(format!( diff --git a/crates/iceberg/src/puffin/metadata.rs b/crates/iceberg/src/puffin/metadata.rs index 3794eec458..1d39cf249b 100644 --- a/crates/iceberg/src/puffin/metadata.rs +++ b/crates/iceberg/src/puffin/metadata.rs @@ -388,7 +388,7 @@ mod tests { use bytes::Bytes; use tempfile::TempDir; - use crate::io::{FileIOBuilder, InputFile}; + use crate::io::{FileIO, InputFile}; use crate::puffin::metadata::{BlobMetadata, CompressionCodec, FileMetadata}; use crate::puffin::test_utils::{ empty_footer_payload, empty_footer_payload_bytes, empty_footer_payload_bytes_length_bytes, @@ -400,7 +400,7 @@ 
mod tests { const INVALID_MAGIC_VALUE: [u8; 4] = [80, 70, 65, 0]; async fn input_file_with_bytes(temp_dir: &TempDir, slice: &[u8]) -> InputFile { - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let path_buf = temp_dir.path().join("abc.puffin"); let temp_path = path_buf.to_str().unwrap(); diff --git a/crates/iceberg/src/puffin/test_utils.rs b/crates/iceberg/src/puffin/test_utils.rs index a3232f53ef..39fecc6f80 100644 --- a/crates/iceberg/src/puffin/test_utils.rs +++ b/crates/iceberg/src/puffin/test_utils.rs @@ -19,7 +19,7 @@ use std::collections::HashMap; use super::blob::Blob; use crate::compression::CompressionCodec; -use crate::io::{FileIOBuilder, InputFile}; +use crate::io::{FileIO, InputFile}; use crate::puffin::metadata::{BlobMetadata, CREATED_BY_PROPERTY, FileMetadata}; const JAVA_TESTDATA: &str = "testdata/puffin/java-generated"; @@ -28,9 +28,7 @@ const METRIC_UNCOMPRESSED: &str = "sample-metric-data-uncompressed.bin"; const METRIC_ZSTD_COMPRESSED: &str = "sample-metric-data-compressed-zstd.bin"; fn input_file_for_test_data(path: &str) -> InputFile { - FileIOBuilder::new_fs_io() - .build() - .unwrap() + FileIO::new_with_fs() .new_input(env!("CARGO_MANIFEST_DIR").to_owned() + "/" + path) .unwrap() } diff --git a/crates/iceberg/src/puffin/writer.rs b/crates/iceberg/src/puffin/writer.rs index c8e4233a94..30b97f09dd 100644 --- a/crates/iceberg/src/puffin/writer.rs +++ b/crates/iceberg/src/puffin/writer.rs @@ -151,7 +151,7 @@ mod tests { use tempfile::TempDir; use crate::compression::CompressionCodec; - use crate::io::{FileIOBuilder, InputFile, OutputFile}; + use crate::io::{FileIO, InputFile, OutputFile}; use crate::puffin::blob::Blob; use crate::puffin::metadata::FileMetadata; use crate::puffin::reader::PuffinReader; @@ -169,7 +169,7 @@ mod tests { blobs: Vec<(Blob, CompressionCodec)>, properties: HashMap, ) -> Result { - let file_io = FileIOBuilder::new_fs_io().build()?; + let file_io = FileIO::new_with_fs(); let path_buf = temp_dir.path().join("temp_puffin.bin"); let temp_path = path_buf.to_str().unwrap(); diff --git a/crates/iceberg/src/scan/mod.rs b/crates/iceberg/src/scan/mod.rs index c055c12c9a..199925b002 100644 --- a/crates/iceberg/src/scan/mod.rs +++ b/crates/iceberg/src/scan/mod.rs @@ -617,10 +617,7 @@ pub mod tests { let manifest_list2_location = table_location.join("metadata/manifests_list_2.avro"); let table_metadata1_location = table_location.join("metadata/v1.json"); - let file_io = FileIO::from_path(table_location.as_os_str().to_str().unwrap()) - .unwrap() - .build() - .unwrap(); + let file_io = FileIO::new_with_fs(); let table_metadata = { let template_json_str = fs::read_to_string(format!( @@ -657,10 +654,7 @@ pub mod tests { let table_location = tmp_dir.path().join("table1"); let table_metadata1_location = table_location.join("metadata/v1.json"); - let file_io = FileIO::from_path(table_location.as_os_str().to_str().unwrap()) - .unwrap() - .build() - .unwrap(); + let file_io = FileIO::new_with_fs(); let table_metadata = { let template_json_str = fs::read_to_string(format!( @@ -696,10 +690,7 @@ pub mod tests { let manifest_list2_location = table_location.join("metadata/manifests_list_2.avro"); let table_metadata1_location = table_location.join("metadata/v1.json"); - let file_io = FileIO::from_path(table_location.to_str().unwrap()) - .unwrap() - .build() - .unwrap(); + let file_io = FileIO::new_with_fs(); let mut table_metadata = { let template_json_str = fs::read_to_string(format!( diff --git 
a/crates/iceberg/src/spec/manifest/mod.rs b/crates/iceberg/src/spec/manifest/mod.rs index b126396e3c..c5a474ed19 100644 --- a/crates/iceberg/src/spec/manifest/mod.rs +++ b/crates/iceberg/src/spec/manifest/mod.rs @@ -165,7 +165,7 @@ mod tests { use tempfile::TempDir; use super::*; - use crate::io::FileIOBuilder; + use crate::io::FileIO; use crate::spec::{Literal, NestedField, PrimitiveType, Struct, Transform, Type}; #[tokio::test] @@ -264,7 +264,7 @@ mod tests { // write manifest to file let tmp_dir = TempDir::new().unwrap(); let path = tmp_dir.path().join("test_manifest.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = ManifestWriterBuilder::new( output_file, @@ -449,7 +449,7 @@ mod tests { // write manifest to file and check the return manifest file. let tmp_dir = TempDir::new().unwrap(); let path = tmp_dir.path().join("test_manifest.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = ManifestWriterBuilder::new( output_file, @@ -546,7 +546,7 @@ mod tests { // write manifest to file let tmp_dir = TempDir::new().unwrap(); let path = tmp_dir.path().join("test_manifest.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = ManifestWriterBuilder::new( output_file, @@ -655,7 +655,7 @@ mod tests { // write manifest to file let tmp_dir = TempDir::new().unwrap(); let path = tmp_dir.path().join("test_manifest.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = ManifestWriterBuilder::new( output_file, @@ -763,7 +763,7 @@ mod tests { // write manifest to file let tmp_dir = TempDir::new().unwrap(); let path = tmp_dir.path().join("test_manifest.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = ManifestWriterBuilder::new( output_file, @@ -1042,7 +1042,7 @@ mod tests { // write manifest to file let tmp_dir = TempDir::new().unwrap(); let path = tmp_dir.path().join("test_manifest.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = ManifestWriterBuilder::new( output_file, diff --git a/crates/iceberg/src/spec/manifest/writer.rs b/crates/iceberg/src/spec/manifest/writer.rs index 0669651603..cc5ef737fb 100644 --- a/crates/iceberg/src/spec/manifest/writer.rs +++ b/crates/iceberg/src/spec/manifest/writer.rs @@ -559,7 +559,7 @@ mod tests { use tempfile::TempDir; use super::*; - use crate::io::FileIOBuilder; + use crate::io::FileIO; use crate::spec::{DataFileFormat, Manifest, NestedField, PrimitiveType, Schema, Struct, Type}; #[tokio::test] @@ -684,7 +684,7 @@ mod tests { // write manifest to file let tmp_dir = TempDir::new().unwrap(); let path = tmp_dir.path().join("test_manifest.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = ManifestWriterBuilder::new( output_file, @@ -771,7 +771,7 @@ mod tests { // Write a V3 delete 
manifest let tmp_dir = TempDir::new().unwrap(); let path = tmp_dir.path().join("v3_delete_manifest.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = ManifestWriterBuilder::new( diff --git a/crates/iceberg/src/spec/manifest_list.rs b/crates/iceberg/src/spec/manifest_list.rs index 5e97e5466e..baaab1f590 100644 --- a/crates/iceberg/src/spec/manifest_list.rs +++ b/crates/iceberg/src/spec/manifest_list.rs @@ -1365,7 +1365,7 @@ mod test { use tempfile::TempDir; use super::_serde::ManifestListV2; - use crate::io::FileIOBuilder; + use crate::io::FileIO; use crate::spec::manifest_list::_serde::{ManifestListV1, ManifestListV3}; use crate::spec::{ Datum, FieldSummary, ManifestContentType, ManifestFile, ManifestList, ManifestListWriter, @@ -1397,7 +1397,7 @@ mod test { ] }; - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let tmp_dir = TempDir::new().unwrap(); let file_name = "simple_manifest_list_v1.avro"; @@ -1469,7 +1469,7 @@ mod test { ] }; - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let tmp_dir = TempDir::new().unwrap(); let file_name = "simple_manifest_list_v1.avro"; @@ -1542,7 +1542,7 @@ mod test { ] }; - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let tmp_dir = TempDir::new().unwrap(); let file_name = "simple_manifest_list_v3.avro"; @@ -1687,7 +1687,7 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v1.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0)); @@ -1734,7 +1734,7 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v2.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num); @@ -1782,7 +1782,7 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v2.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = @@ -1830,7 +1830,7 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v1.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0)); @@ -1875,7 +1875,7 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v1.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = ManifestListWriter::v1(output_file, 1646658105718557341, Some(0)); @@ -1922,7 +1922,7 @@ mod test { let temp_dir = TempDir::new().unwrap(); let path = temp_dir.path().join("manifest_list_v2.avro"); - let io = FileIOBuilder::new_fs_io().build().unwrap(); + let io = FileIO::new_with_fs(); let 
output_file = io.new_output(path.to_str().unwrap()).unwrap(); let mut writer = ManifestListWriter::v2(output_file, snapshot_id, Some(0), seq_num); diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 28f753e9f1..3e6374d58d 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -1568,7 +1568,7 @@ mod tests { use super::{FormatVersion, MetadataLog, SnapshotLog, TableMetadataBuilder}; use crate::compression::CompressionCodec; - use crate::io::FileIOBuilder; + use crate::io::FileIO; use crate::spec::table_metadata::TableMetadata; use crate::spec::{ BlobMetadata, EncryptedKey, INITIAL_ROW_ID, Literal, NestedField, NullOrder, Operation, @@ -3541,7 +3541,7 @@ mod tests { let temp_path = temp_dir.path().to_str().unwrap(); // Create a FileIO instance - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); // Use an existing test metadata from the test files let original_metadata: TableMetadata = get_test_table_metadata("TableMetadataV2Valid.json"); @@ -3581,7 +3581,7 @@ mod tests { std::fs::write(&metadata_location, &compressed).expect("failed to write metadata"); // Read the metadata back - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let metadata_location = metadata_location.to_str().unwrap(); let read_metadata = TableMetadata::read_from(&file_io, metadata_location) .await @@ -3594,7 +3594,7 @@ mod tests { #[tokio::test] async fn test_table_metadata_read_nonexistent_file() { // Create a FileIO instance - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); // Try to read a non-existent file let result = TableMetadata::read_from(&file_io, "/nonexistent/path/metadata.json").await; diff --git a/crates/iceberg/src/spec/table_metadata_builder.rs b/crates/iceberg/src/spec/table_metadata_builder.rs index 3508c1e52f..62311a15a2 100644 --- a/crates/iceberg/src/spec/table_metadata_builder.rs +++ b/crates/iceberg/src/spec/table_metadata_builder.rs @@ -1466,7 +1466,7 @@ mod tests { use super::*; use crate::TableIdent; - use crate::io::FileIOBuilder; + use crate::io::FileIO; use crate::spec::{ BlobMetadata, NestedField, NullOrder, Operation, PartitionSpec, PrimitiveType, Schema, SnapshotRetention, SortDirection, SortField, StructType, Summary, TableProperties, @@ -2711,7 +2711,7 @@ mod tests { .metadata(resp) .metadata_location("s3://bucket/test/location/metadata/v1.json".to_string()) .identifier(TableIdent::from_strs(["ns1", "test1"]).unwrap()) - .file_io(FileIOBuilder::new("memory").build().unwrap()) + .file_io(FileIO::new_with_memory()) .build() .unwrap(); @@ -2742,7 +2742,7 @@ mod tests { .metadata(resp) .metadata_location("s3://bucket/test/location/metadata/v1.json".to_string()) .identifier(TableIdent::from_strs(["ns1", "test1"]).unwrap()) - .file_io(FileIOBuilder::new("memory").build().unwrap()) + .file_io(FileIO::new_with_memory()) .build() .unwrap(); diff --git a/crates/iceberg/src/table.rs b/crates/iceberg/src/table.rs index 9c789e2186..56ddbd51ba 100644 --- a/crates/iceberg/src/table.rs +++ b/crates/iceberg/src/table.rs @@ -256,10 +256,7 @@ impl Table { /// # use iceberg::TableIdent; /// # async fn example() { /// let metadata_file_location = "s3://bucket_name/path/to/metadata.json"; -/// let file_io = FileIO::from_path(&metadata_file_location) -/// .unwrap() -/// .build() -/// .unwrap(); +/// let file_io = FileIO::new_with_fs(); /// let static_identifier = 
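The metadata-reading path keeps its signature; only the `FileIO` construction changes. A minimal sketch of reading a table metadata file with the new constructor (`TableMetadata::read_from` is used verbatim in the hunk above):

```rust
use iceberg::io::FileIO;
use iceberg::spec::TableMetadata;

async fn load_metadata(location: &str) -> iceberg::Result<TableMetadata> {
    let file_io = FileIO::new_with_fs();
    TableMetadata::read_from(&file_io, location).await
}
```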
TableIdent::from_strs(["static_ns", "static_table"]).unwrap(); /// let static_table = /// StaticTable::from_metadata_file(&metadata_file_location, static_identifier, file_io) @@ -345,10 +342,7 @@ mod tests { env!("CARGO_MANIFEST_DIR"), metadata_file_name ); - let file_io = FileIO::from_path(&metadata_file_path) - .unwrap() - .build() - .unwrap(); + let file_io = FileIO::new_with_fs(); let static_identifier = TableIdent::from_strs(["static_ns", "static_table"]).unwrap(); let static_table = StaticTable::from_metadata_file(&metadata_file_path, static_identifier, file_io) @@ -373,10 +367,7 @@ mod tests { env!("CARGO_MANIFEST_DIR"), metadata_file_name ); - let file_io = FileIO::from_path(&metadata_file_path) - .unwrap() - .build() - .unwrap(); + let file_io = FileIO::new_with_fs(); let static_identifier = TableIdent::from_strs(["static_ns", "static_table"]).unwrap(); let static_table = StaticTable::from_metadata_file(&metadata_file_path, static_identifier, file_io) @@ -399,10 +390,7 @@ mod tests { env!("CARGO_MANIFEST_DIR"), metadata_file_name ); - let file_io = FileIO::from_path(&metadata_file_path) - .unwrap() - .build() - .unwrap(); + let file_io = FileIO::new_with_fs(); let metadata_file = file_io.new_input(metadata_file_path).unwrap(); let metadata_file_content = metadata_file.read().await.unwrap(); let table_metadata = diff --git a/crates/iceberg/src/transaction/mod.rs b/crates/iceberg/src/transaction/mod.rs index 074c7fefe4..cb2ff7cf37 100644 --- a/crates/iceberg/src/transaction/mod.rs +++ b/crates/iceberg/src/transaction/mod.rs @@ -236,7 +236,7 @@ mod tests { use std::sync::atomic::{AtomicU32, Ordering}; use crate::catalog::MockCatalog; - use crate::io::FileIOBuilder; + use crate::io::FileIO; use crate::spec::TableMetadata; use crate::table::Table; use crate::transaction::{ApplyTransactionAction, Transaction}; @@ -256,7 +256,7 @@ mod tests { .metadata(resp) .metadata_location("s3://bucket/test/location/metadata/v1.json".to_string()) .identifier(TableIdent::from_strs(["ns1", "test1"]).unwrap()) - .file_io(FileIOBuilder::new("memory").build().unwrap()) + .file_io(FileIO::new_with_memory()) .build() .unwrap() } @@ -275,7 +275,7 @@ mod tests { .metadata(resp) .metadata_location("s3://bucket/test/location/metadata/v1.json".to_string()) .identifier(TableIdent::from_strs(["ns1", "test1"]).unwrap()) - .file_io(FileIOBuilder::new("memory").build().unwrap()) + .file_io(FileIO::new_with_memory()) .build() .unwrap() } @@ -294,7 +294,7 @@ mod tests { .metadata(resp) .metadata_location("s3://bucket/test/location/metadata/v1.json".to_string()) .identifier(TableIdent::from_strs(["ns1", "test1"]).unwrap()) - .file_io(FileIOBuilder::new("memory").build().unwrap()) + .file_io(FileIO::new_with_memory()) .build() .unwrap() } diff --git a/crates/iceberg/src/writer/base_writer/data_file_writer.rs b/crates/iceberg/src/writer/base_writer/data_file_writer.rs index cb7bd172ea..02dcda4164 100644 --- a/crates/iceberg/src/writer/base_writer/data_file_writer.rs +++ b/crates/iceberg/src/writer/base_writer/data_file_writer.rs @@ -147,7 +147,7 @@ mod test { use tempfile::TempDir; use crate::Result; - use crate::io::FileIOBuilder; + use crate::io::FileIO; use crate::spec::{ DataContentType, DataFileFormat, Literal, NestedField, PartitionKey, PartitionSpec, PrimitiveType, Schema, Struct, Type, @@ -163,7 +163,7 @@ mod test { #[tokio::test] async fn test_parquet_writer() -> Result<()> { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); 
let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -237,7 +237,7 @@ mod test { #[tokio::test] async fn test_parquet_writer_with_partition() -> Result<()> { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); diff --git a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs index dd8487f9cc..9bc2bf9840 100644 --- a/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs +++ b/crates/iceberg/src/writer/base_writer/equality_delete_writer.rs @@ -213,7 +213,7 @@ mod test { use uuid::Uuid; use crate::arrow::{arrow_schema_to_schema, schema_to_arrow_schema}; - use crate::io::{FileIO, FileIOBuilder}; + use crate::io::FileIO; use crate::spec::{ DataFile, DataFileFormat, ListType, MapType, NestedField, PrimitiveType, Schema, StructType, Type, @@ -307,7 +307,7 @@ mod test { #[tokio::test] async fn test_equality_delete_writer() -> Result<(), anyhow::Error> { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -545,7 +545,7 @@ mod test { #[tokio::test] async fn test_equality_delete_with_primitive_type() -> Result<(), anyhow::Error> { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); diff --git a/crates/iceberg/src/writer/file_writer/parquet_writer.rs b/crates/iceberg/src/writer/file_writer/parquet_writer.rs index da04d5435b..0984d8fc64 100644 --- a/crates/iceberg/src/writer/file_writer/parquet_writer.rs +++ b/crates/iceberg/src/writer/file_writer/parquet_writer.rs @@ -627,7 +627,7 @@ mod tests { use super::*; use crate::arrow::schema_to_arrow_schema; - use crate::io::FileIOBuilder; + use crate::io::FileIO; use crate::spec::decimal_utils::{decimal_mantissa, decimal_new, decimal_scale}; use crate::spec::{PrimitiveLiteral, Struct, *}; use crate::writer::file_writer::location_generator::{ @@ -790,7 +790,7 @@ mod tests { #[tokio::test] async fn test_parquet_writer() -> Result<()> { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -864,7 +864,7 @@ mod tests { #[tokio::test] async fn test_parquet_writer_with_complex_schema() -> Result<()> { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -1087,7 +1087,7 @@ mod tests { #[tokio::test] async fn test_all_type_for_write() -> Result<()> { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -1328,7 
+1328,7 @@ mod tests { #[tokio::test] async fn test_decimal_bound() -> Result<()> { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -1573,7 +1573,7 @@ mod tests { #[tokio::test] async fn test_empty_write() -> Result<()> { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -1623,7 +1623,7 @@ mod tests { #[tokio::test] async fn test_nan_val_cnts_primitive_type() -> Result<()> { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -1711,7 +1711,7 @@ mod tests { #[tokio::test] async fn test_nan_val_cnts_struct_type() -> Result<()> { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -1851,7 +1851,7 @@ mod tests { #[tokio::test] async fn test_nan_val_cnts_list_type() -> Result<()> { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -2053,7 +2053,7 @@ mod tests { #[tokio::test] async fn test_nan_val_cnts_map_type() -> Result<()> { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -2209,7 +2209,7 @@ mod tests { #[tokio::test] async fn test_write_empty_parquet_file() { let temp_dir = TempDir::new().unwrap(); - let file_io = FileIOBuilder::new_fs_io().build().unwrap(); + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); diff --git a/crates/iceberg/src/writer/file_writer/rolling_writer.rs b/crates/iceberg/src/writer/file_writer/rolling_writer.rs index a93e494d48..b86f6a2ea7 100644 --- a/crates/iceberg/src/writer/file_writer/rolling_writer.rs +++ b/crates/iceberg/src/writer/file_writer/rolling_writer.rs @@ -268,7 +268,7 @@ mod tests { use tempfile::TempDir; use super::*; - use crate::io::FileIOBuilder; + use crate::io::FileIO; use crate::spec::{DataFileFormat, NestedField, PrimitiveType, Schema, Type}; use crate::writer::base_writer::data_file_writer::DataFileWriterBuilder; use crate::writer::file_writer::ParquetWriterBuilder; @@ -304,7 +304,7 @@ mod tests { #[tokio::test] async fn test_rolling_writer_basic() -> Result<()> { let temp_dir = TempDir::new()?; - let file_io = FileIOBuilder::new_fs_io().build()?; + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -362,7 +362,7 @@ mod tests { #[tokio::test] async fn test_rolling_writer_with_rolling() -> Result<()> { let temp_dir = 
TempDir::new()?; - let file_io = FileIOBuilder::new_fs_io().build()?; + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); diff --git a/crates/iceberg/src/writer/partitioning/clustered_writer.rs b/crates/iceberg/src/writer/partitioning/clustered_writer.rs index 01eb452083..e2e0452045 100644 --- a/crates/iceberg/src/writer/partitioning/clustered_writer.rs +++ b/crates/iceberg/src/writer/partitioning/clustered_writer.rs @@ -152,7 +152,7 @@ mod tests { use tempfile::TempDir; use super::*; - use crate::io::FileIOBuilder; + use crate::io::FileIO; use crate::spec::{DataFileFormat, NestedField, PrimitiveType, Type}; use crate::writer::base_writer::data_file_writer::DataFileWriterBuilder; use crate::writer::file_writer::ParquetWriterBuilder; @@ -164,7 +164,7 @@ mod tests { #[tokio::test] async fn test_clustered_writer_single_partition() -> Result<()> { let temp_dir = TempDir::new()?; - let file_io = FileIOBuilder::new_fs_io().build()?; + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -261,7 +261,7 @@ mod tests { #[tokio::test] async fn test_clustered_writer_sorted_partitions() -> Result<()> { let temp_dir = TempDir::new()?; - let file_io = FileIOBuilder::new_fs_io().build()?; + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -402,7 +402,7 @@ mod tests { #[tokio::test] async fn test_clustered_writer_unsorted_partitions_error() -> Result<()> { let temp_dir = TempDir::new()?; - let file_io = FileIOBuilder::new_fs_io().build()?; + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); diff --git a/crates/iceberg/src/writer/partitioning/fanout_writer.rs b/crates/iceberg/src/writer/partitioning/fanout_writer.rs index 21a174b0d0..5348fea157 100644 --- a/crates/iceberg/src/writer/partitioning/fanout_writer.rs +++ b/crates/iceberg/src/writer/partitioning/fanout_writer.rs @@ -126,7 +126,7 @@ mod tests { use tempfile::TempDir; use super::*; - use crate::io::FileIOBuilder; + use crate::io::FileIO; use crate::spec::{ DataFileFormat, Literal, NestedField, PartitionKey, PartitionSpec, PrimitiveType, Struct, Type, @@ -141,7 +141,7 @@ mod tests { #[tokio::test] async fn test_fanout_writer_single_partition() -> Result<()> { let temp_dir = TempDir::new()?; - let file_io = FileIOBuilder::new_fs_io().build()?; + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); @@ -237,7 +237,7 @@ mod tests { #[tokio::test] async fn test_fanout_writer_multiple_partitions() -> Result<()> { let temp_dir = TempDir::new()?; - let file_io = FileIOBuilder::new_fs_io().build()?; + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); diff --git a/crates/iceberg/src/writer/partitioning/unpartitioned_writer.rs b/crates/iceberg/src/writer/partitioning/unpartitioned_writer.rs index 29825a5416..de127e0c9d 100644 --- a/crates/iceberg/src/writer/partitioning/unpartitioned_writer.rs +++ b/crates/iceberg/src/writer/partitioning/unpartitioned_writer.rs @@ -115,7 +115,7 @@ mod tests { use super::*; use crate::Result; - use crate::io::FileIOBuilder; + 
use crate::io::FileIO; use crate::spec::{DataFileFormat, NestedField, PrimitiveType, Struct, Type}; use crate::writer::base_writer::data_file_writer::DataFileWriterBuilder; use crate::writer::file_writer::ParquetWriterBuilder; @@ -152,7 +152,7 @@ mod tests { ])); // Build writer - let file_io = FileIOBuilder::new_fs_io().build()?; + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); diff --git a/crates/iceberg/tests/file_io_gcs_test.rs b/crates/iceberg/tests/file_io_gcs_test.rs index 005b0f3979..f489f28416 100644 --- a/crates/iceberg/tests/file_io_gcs_test.rs +++ b/crates/iceberg/tests/file_io_gcs_test.rs @@ -22,9 +22,10 @@ #[cfg(all(test, feature = "storage-gcs"))] mod tests { use std::collections::HashMap; + use std::sync::Arc; use bytes::Bytes; - use iceberg::io::{FileIO, FileIOBuilder, GCS_NO_AUTH, GCS_SERVICE_PATH}; + use iceberg::io::{FileIO, FileIOBuilder, OpenDalStorageFactory, GCS_NO_AUTH, GCS_SERVICE_PATH}; use iceberg_test_utils::{get_gcs_endpoint, set_up}; static FAKE_GCS_BUCKET: &str = "test-bucket"; @@ -37,13 +38,12 @@ mod tests { // A bucket must exist for FileIO create_bucket(FAKE_GCS_BUCKET, &gcs_endpoint).await.unwrap(); - FileIOBuilder::new("gcs") + FileIOBuilder::new(Arc::new(OpenDalStorageFactory::Gcs)) .with_props(vec![ (GCS_SERVICE_PATH, gcs_endpoint), (GCS_NO_AUTH, "true".to_string()), ]) .build() - .unwrap() } // Create a bucket against the emulated GCS storage server. diff --git a/crates/iceberg/tests/file_io_s3_test.rs b/crates/iceberg/tests/file_io_s3_test.rs index f28538e73e..e414422fcf 100644 --- a/crates/iceberg/tests/file_io_s3_test.rs +++ b/crates/iceberg/tests/file_io_s3_test.rs @@ -25,8 +25,8 @@ mod tests { use async_trait::async_trait; use iceberg::io::{ - CustomAwsCredentialLoader, FileIO, FileIOBuilder, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, - S3_SECRET_ACCESS_KEY, + CustomAwsCredentialLoader, FileIO, FileIOBuilder, + OpenDalStorageFactory, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY, }; use iceberg_test_utils::{get_minio_endpoint, normalize_test_name_with_parts, set_up}; use reqsign::{AwsCredential, AwsCredentialLoad}; @@ -37,15 +37,16 @@ mod tests { let minio_endpoint = get_minio_endpoint(); - FileIOBuilder::new("s3") - .with_props(vec![ - (S3_ENDPOINT, minio_endpoint), - (S3_ACCESS_KEY_ID, "admin".to_string()), - (S3_SECRET_ACCESS_KEY, "password".to_string()), - (S3_REGION, "us-east-1".to_string()), - ]) - .build() - .unwrap() + FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { + customized_credential_load: None, + })) + .with_props(vec![ + (S3_ENDPOINT, minio_endpoint), + (S3_ACCESS_KEY_ID, "admin".to_string()), + (S3_SECRET_ACCESS_KEY, "password".to_string()), + (S3_REGION, "us-east-1".to_string()), + ]) + .build() } #[tokio::test] @@ -124,37 +125,15 @@ mod tests { #[test] fn test_file_io_builder_extension_system() { // Test adding and retrieving extensions - let test_string = "test_extension_value".to_string(); - let builder = FileIOBuilder::new_fs_io().with_extension(test_string.clone()); - - // Test retrieving the extension - let extension: Option> = builder.extension(); - assert!(extension.is_some()); - assert_eq!(*extension.unwrap(), test_string); - - // Test that non-existent extension returns None - let non_existent: Option> = builder.extension(); - assert!(non_existent.is_none()); + // Note: Extension system is not part of the new FileIOBuilder API + // This test is removed as the extension system was part of the 
old API } #[test] fn test_file_io_builder_multiple_extensions() { // Test adding multiple different types of extensions - let test_string = "test_value".to_string(); - let test_number = 42i32; - - let builder = FileIOBuilder::new_fs_io() - .with_extension(test_string.clone()) - .with_extension(test_number); - - // Retrieve both extensions - let string_ext: Option> = builder.extension(); - let number_ext: Option> = builder.extension(); - - assert!(string_ext.is_some()); - assert!(number_ext.is_some()); - assert_eq!(*string_ext.unwrap(), test_string); - assert_eq!(*number_ext.unwrap(), test_number); + // Note: Extension system is not part of the new FileIOBuilder API + // This test is removed as the extension system was part of the old API } #[test] @@ -163,18 +142,15 @@ mod tests { let mock_loader = MockCredentialLoader::new_minio(); let custom_loader = CustomAwsCredentialLoader::new(Arc::new(mock_loader)); - // Test that the loader can be used in FileIOBuilder - let builder = FileIOBuilder::new("s3") - .with_extension(custom_loader.clone()) - .with_props(vec![ - (S3_ENDPOINT, "http://localhost:9000".to_string()), - ("bucket", "test-bucket".to_string()), - (S3_REGION, "us-east-1".to_string()), - ]); - - // Verify the extension was stored - let retrieved_loader: Option> = builder.extension(); - assert!(retrieved_loader.is_some()); + // Test that the loader can be used in FileIOBuilder with OpenDalStorageFactory + let _builder = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { + customized_credential_load: Some(custom_loader), + })) + .with_props(vec![ + (S3_ENDPOINT, "http://localhost:9000".to_string()), + ("bucket", "test-bucket".to_string()), + (S3_REGION, "us-east-1".to_string()), + ]); } #[tokio::test] @@ -187,15 +163,15 @@ mod tests { let minio_endpoint = get_minio_endpoint(); - // Build FileIO with custom credential loader - let file_io_with_custom_creds = FileIOBuilder::new("s3") - .with_extension(custom_loader) - .with_props(vec![ - (S3_ENDPOINT, minio_endpoint), - (S3_REGION, "us-east-1".to_string()), - ]) - .build() - .unwrap(); + // Build FileIO with custom credential loader via OpenDalStorageFactory + let file_io_with_custom_creds = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { + customized_credential_load: Some(custom_loader), + })) + .with_props(vec![ + (S3_ENDPOINT, minio_endpoint), + (S3_REGION, "us-east-1".to_string()), + ]) + .build(); // Test that the FileIO was built successfully with the custom loader match file_io_with_custom_creds.exists("s3://bucket1/any").await { @@ -214,15 +190,15 @@ mod tests { let minio_endpoint = get_minio_endpoint(); - // Build FileIO with custom credential loader - let file_io_with_custom_creds = FileIOBuilder::new("s3") - .with_extension(custom_loader) - .with_props(vec![ - (S3_ENDPOINT, minio_endpoint), - (S3_REGION, "us-east-1".to_string()), - ]) - .build() - .unwrap(); + // Build FileIO with custom credential loader via OpenDalStorageFactory + let file_io_with_custom_creds = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { + customized_credential_load: Some(custom_loader), + })) + .with_props(vec![ + (S3_ENDPOINT, minio_endpoint), + (S3_REGION, "us-east-1".to_string()), + ]) + .build(); // Test that the FileIO was built successfully with the custom loader match file_io_with_custom_creds.exists("s3://bucket1/any").await { From 4c04d70758d07d3cd07c7682255233c038370a5e Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Thu, 5 Feb 2026 12:11:04 -0800 Subject: [PATCH 06/13] clean up --- crates/catalog/glue/src/catalog.rs | 
4 ++-- crates/iceberg/src/catalog/memory/catalog.rs | 12 ++++------ crates/iceberg/src/catalog/mod.rs | 3 ++- crates/iceberg/src/io/file_io.rs | 25 ++++---------------- crates/iceberg/src/io/storage/opendal/mod.rs | 2 +- crates/iceberg/tests/file_io_gcs_test.rs | 4 +++- crates/iceberg/tests/file_io_s3_test.rs | 4 ++-- 7 files changed, 20 insertions(+), 34 deletions(-) diff --git a/crates/catalog/glue/src/catalog.rs b/crates/catalog/glue/src/catalog.rs index e31b3217a1..85a58b5600 100644 --- a/crates/catalog/glue/src/catalog.rs +++ b/crates/catalog/glue/src/catalog.rs @@ -25,8 +25,8 @@ use aws_sdk_glue::operation::create_table::CreateTableError; use aws_sdk_glue::operation::update_table::UpdateTableError; use aws_sdk_glue::types::TableInput; use iceberg::io::{ - FileIO, FileIOBuilder, LocalFsStorageFactory, StorageFactory, S3_ACCESS_KEY_ID, S3_ENDPOINT, - S3_REGION, S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, + FileIO, FileIOBuilder, LocalFsStorageFactory, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, + S3_SECRET_ACCESS_KEY, S3_SESSION_TOKEN, StorageFactory, }; use iceberg::spec::{TableMetadata, TableMetadataBuilder}; use iceberg::table::Table; diff --git a/crates/iceberg/src/catalog/memory/catalog.rs b/crates/iceberg/src/catalog/memory/catalog.rs index 4b8dbf8fb4..e008de8050 100644 --- a/crates/iceberg/src/catalog/memory/catalog.rs +++ b/crates/iceberg/src/catalog/memory/catalog.rs @@ -25,7 +25,7 @@ use futures::lock::{Mutex, MutexGuard}; use itertools::Itertools; use super::namespace_state::NamespaceState; -use crate::io::{FileIO, FileIOBuilder, MemoryStorageFactory}; +use crate::io::{FileIO, FileIOBuilder, MemoryStorageFactory, StorageFactory}; use crate::spec::{TableMetadata, TableMetadataBuilder}; use crate::table::Table; use crate::{ @@ -43,7 +43,7 @@ const LOCATION: &str = "location"; #[derive(Debug)] pub struct MemoryCatalogBuilder { config: MemoryCatalogConfig, - storage_factory: Option>, + storage_factory: Option>, } impl Default for MemoryCatalogBuilder { @@ -62,7 +62,7 @@ impl Default for MemoryCatalogBuilder { impl CatalogBuilder for MemoryCatalogBuilder { type C = MemoryCatalog; - fn with_storage_factory(mut self, storage_factory: Arc) -> Self { + fn with_storage_factory(mut self, storage_factory: Arc) -> Self { self.storage_factory = Some(storage_factory); self } @@ -126,16 +126,14 @@ impl MemoryCatalog { /// Creates a memory catalog. 
fn new( config: MemoryCatalogConfig, - storage_factory: Option>, + storage_factory: Option>, ) -> Result { // Use provided factory or default to MemoryStorageFactory let factory = storage_factory.unwrap_or_else(|| Arc::new(MemoryStorageFactory)); Ok(Self { root_namespace_state: Mutex::new(NamespaceState::default()), - file_io: FileIOBuilder::new(factory) - .with_props(config.props) - .build(), + file_io: FileIOBuilder::new(factory).with_props(config.props).build(), warehouse_location: config.warehouse, }) } diff --git a/crates/iceberg/src/catalog/mod.rs b/crates/iceberg/src/catalog/mod.rs index e17689099f..e9c156e636 100644 --- a/crates/iceberg/src/catalog/mod.rs +++ b/crates/iceberg/src/catalog/mod.rs @@ -38,6 +38,7 @@ use serde_derive::{Deserialize, Serialize}; use typed_builder::TypedBuilder; use uuid::Uuid; +use crate::io::StorageFactory; use crate::spec::{ EncryptedKey, FormatVersion, PartitionStatisticsFile, Schema, SchemaId, Snapshot, SnapshotReference, SortOrder, StatisticsFile, TableMetadata, TableMetadataBuilder, @@ -138,7 +139,7 @@ pub trait CatalogBuilder: Default + Debug + Send + Sync { /// .load("my_catalog", props) /// .await?; /// ``` - fn with_storage_factory(self, storage_factory: Arc) -> Self; + fn with_storage_factory(self, storage_factory: Arc) -> Self; /// Create a new catalog instance. fn load( diff --git a/crates/iceberg/src/io/file_io.rs b/crates/iceberg/src/io/file_io.rs index 33eba11f0e..989aaf20fe 100644 --- a/crates/iceberg/src/io/file_io.rs +++ b/crates/iceberg/src/io/file_io.rs @@ -533,14 +533,8 @@ mod tests { .with_prop("key2", "value2") .build(); - assert_eq!( - file_io.config().get("key1"), - Some(&"value1".to_string()) - ); - assert_eq!( - file_io.config().get("key2"), - Some(&"value2".to_string()) - ); + assert_eq!(file_io.config().get("key1"), Some(&"value1".to_string())); + assert_eq!(file_io.config().get("key2"), Some(&"value2".to_string())); } #[tokio::test] @@ -549,14 +543,8 @@ mod tests { let props = vec![("key1", "value1"), ("key2", "value2")]; let file_io = FileIOBuilder::new(factory).with_props(props).build(); - assert_eq!( - file_io.config().get("key1"), - Some(&"value1".to_string()) - ); - assert_eq!( - file_io.config().get("key2"), - Some(&"value2".to_string()) - ); + assert_eq!(file_io.config().get("key1"), Some(&"value1".to_string())); + assert_eq!(file_io.config().get("key2"), Some(&"value2".to_string())); } #[tokio::test] @@ -571,10 +559,7 @@ mod tests { // Can build a new FileIO from the builder let new_file_io = builder.with_prop("key2", "value2").build(); - assert_eq!( - new_file_io.config().get("key"), - Some(&"value".to_string()) - ); + assert_eq!(new_file_io.config().get("key"), Some(&"value".to_string())); assert_eq!( new_file_io.config().get("key2"), Some(&"value2".to_string()) diff --git a/crates/iceberg/src/io/storage/opendal/mod.rs b/crates/iceberg/src/io/storage/opendal/mod.rs index e59b1a5b01..ba2fc948c1 100644 --- a/crates/iceberg/src/io/storage/opendal/mod.rs +++ b/crates/iceberg/src/io/storage/opendal/mod.rs @@ -24,6 +24,7 @@ use async_trait::async_trait; #[cfg(feature = "storage-azdls")] use azdls::AzureStorageScheme; use bytes::Bytes; +use opendal::Operator; use opendal::layers::RetryLayer; #[cfg(feature = "storage-azdls")] use opendal::services::AzdlsConfig; @@ -33,7 +34,6 @@ use opendal::services::GcsConfig; use opendal::services::OssConfig; #[cfg(feature = "storage-s3")] use opendal::services::S3Config; -use opendal::Operator; #[cfg(feature = "storage-s3")] pub use s3::CustomAwsCredentialLoader; use 
serde::{Deserialize, Serialize}; diff --git a/crates/iceberg/tests/file_io_gcs_test.rs b/crates/iceberg/tests/file_io_gcs_test.rs index f489f28416..75bc9fae12 100644 --- a/crates/iceberg/tests/file_io_gcs_test.rs +++ b/crates/iceberg/tests/file_io_gcs_test.rs @@ -25,7 +25,9 @@ mod tests { use std::sync::Arc; use bytes::Bytes; - use iceberg::io::{FileIO, FileIOBuilder, OpenDalStorageFactory, GCS_NO_AUTH, GCS_SERVICE_PATH}; + use iceberg::io::{ + FileIO, FileIOBuilder, GCS_NO_AUTH, GCS_SERVICE_PATH, OpenDalStorageFactory, + }; use iceberg_test_utils::{get_gcs_endpoint, set_up}; static FAKE_GCS_BUCKET: &str = "test-bucket"; diff --git a/crates/iceberg/tests/file_io_s3_test.rs b/crates/iceberg/tests/file_io_s3_test.rs index e414422fcf..cebaf2046b 100644 --- a/crates/iceberg/tests/file_io_s3_test.rs +++ b/crates/iceberg/tests/file_io_s3_test.rs @@ -25,8 +25,8 @@ mod tests { use async_trait::async_trait; use iceberg::io::{ - CustomAwsCredentialLoader, FileIO, FileIOBuilder, - OpenDalStorageFactory, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY, + CustomAwsCredentialLoader, FileIO, FileIOBuilder, OpenDalStorageFactory, S3_ACCESS_KEY_ID, + S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY, }; use iceberg_test_utils::{get_minio_endpoint, normalize_test_name_with_parts, set_up}; use reqsign::{AwsCredential, AwsCredentialLoad}; From ddd3fa1b0cb055b63dfbe341a57c4f1ff018f5aa Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Thu, 5 Feb 2026 12:37:16 -0800 Subject: [PATCH 07/13] update all tests --- .../catalog/glue/tests/glue_catalog_test.rs | 16 +++--- crates/catalog/hms/tests/hms_catalog_test.rs | 16 +++--- crates/catalog/rest/src/catalog.rs | 53 +++++++++++-------- crates/catalog/s3tables/src/catalog.rs | 2 +- crates/examples/src/oss_backend.rs | 5 +- .../datafusion/src/physical_plan/project.rs | 6 +-- .../src/physical_plan/repartition.rs | 16 +++--- .../integrations/datafusion/src/table/mod.rs | 5 +- .../datafusion/src/task_writer.rs | 4 +- 9 files changed, 69 insertions(+), 54 deletions(-) diff --git a/crates/catalog/glue/tests/glue_catalog_test.rs b/crates/catalog/glue/tests/glue_catalog_test.rs index f6e2060c0f..f7a287fb8e 100644 --- a/crates/catalog/glue/tests/glue_catalog_test.rs +++ b/crates/catalog/glue/tests/glue_catalog_test.rs @@ -21,8 +21,12 @@ //! Each test uses unique namespaces based on module path to avoid conflicts. use std::collections::HashMap; +use std::sync::Arc; -use iceberg::io::{S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY}; +use iceberg::io::{ + FileIOBuilder, OpenDalStorageFactory, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, + S3_SECRET_ACCESS_KEY, +}; use iceberg::spec::{NestedField, PrimitiveType, Schema, Type}; use iceberg::transaction::{ApplyTransactionAction, Transaction}; use iceberg::{ @@ -59,11 +63,11 @@ async fn get_catalog() -> GlueCatalog { ]); // Wait for bucket to actually exist - let file_io = iceberg::io::FileIO::from_path("s3a://") - .unwrap() - .with_props(props.clone()) - .build() - .unwrap(); + let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { + customized_credential_load: None, + })) + .with_props(props.clone()) + .build(); let mut retries = 0; while retries < 30 { diff --git a/crates/catalog/hms/tests/hms_catalog_test.rs b/crates/catalog/hms/tests/hms_catalog_test.rs index bc036d0c6b..7ff64d15db 100644 --- a/crates/catalog/hms/tests/hms_catalog_test.rs +++ b/crates/catalog/hms/tests/hms_catalog_test.rs @@ -21,8 +21,12 @@ //! Each test uses unique namespaces based on module path to avoid conflicts. 
use std::collections::HashMap; +use std::sync::Arc; -use iceberg::io::{S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, S3_SECRET_ACCESS_KEY}; +use iceberg::io::{ + FileIOBuilder, OpenDalStorageFactory, S3_ACCESS_KEY_ID, S3_ENDPOINT, S3_REGION, + S3_SECRET_ACCESS_KEY, +}; use iceberg::spec::{NestedField, PrimitiveType, Schema, Type}; use iceberg::{Catalog, CatalogBuilder, Namespace, NamespaceIdent, TableCreation, TableIdent}; use iceberg_catalog_hms::{ @@ -60,11 +64,11 @@ async fn get_catalog() -> HmsCatalog { ]); // Wait for bucket to actually exist - let file_io = iceberg::io::FileIO::from_path("s3a://") - .unwrap() - .with_props(props.clone()) - .build() - .unwrap(); + let file_io = FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 { + customized_credential_load: None, + })) + .with_props(props.clone()) + .build(); let mut retries = 0; while retries < 30 { diff --git a/crates/catalog/rest/src/catalog.rs b/crates/catalog/rest/src/catalog.rs index 8e216006da..620752df22 100644 --- a/crates/catalog/rest/src/catalog.rs +++ b/crates/catalog/rest/src/catalog.rs @@ -994,7 +994,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); assert_eq!( catalog @@ -1067,6 +1067,7 @@ mod tests { .uri(server.url()) .props(props) .build(), + None, ); let token = catalog.context().await.unwrap().client.token().await; @@ -1113,6 +1114,7 @@ mod tests { .uri(server.url()) .props(props) .build(), + None, ); let token = catalog.context().await.unwrap().client.token().await; @@ -1136,6 +1138,7 @@ mod tests { .uri(server.url()) .props(props) .build(), + None, ); let token = catalog.context().await.unwrap().client.token().await; @@ -1166,6 +1169,7 @@ mod tests { .uri(server.url()) .props(props) .build(), + None, ); let token = catalog.context().await.unwrap().client.token().await; @@ -1196,6 +1200,7 @@ mod tests { .uri(server.url()) .props(props) .build(), + None, ); let token = catalog.context().await.unwrap().client.token().await; @@ -1226,6 +1231,7 @@ mod tests { .uri(server.url()) .props(props) .build(), + None, ); let token = catalog.context().await.unwrap().client.token().await; @@ -1338,6 +1344,7 @@ mod tests { .uri(server.url()) .props(props) .build(), + None, ); let token = catalog.context().await.unwrap().client.token().await; @@ -1382,7 +1389,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let _namespaces = catalog.list_namespaces(None).await.unwrap(); @@ -1409,7 +1416,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let namespaces = catalog.list_namespaces(None).await.unwrap(); @@ -1457,7 +1464,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let namespaces = catalog.list_namespaces(None).await.unwrap(); @@ -1553,7 +1560,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = 
RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let namespaces = catalog.list_namespaces(None).await.unwrap(); @@ -1603,7 +1610,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let namespaces = catalog .create_namespace( @@ -1643,7 +1650,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let namespaces = catalog .get_namespace(&NamespaceIdent::new("ns1".to_string())) @@ -1673,7 +1680,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); assert!( catalog @@ -1698,7 +1705,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); catalog .drop_namespace(&NamespaceIdent::new("ns1".to_string())) @@ -1735,7 +1742,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let tables = catalog .list_tables(&NamespaceIdent::new("ns1".to_string())) @@ -1800,7 +1807,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let tables = catalog .list_tables(&NamespaceIdent::new("ns1".to_string())) @@ -1928,7 +1935,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let tables = catalog .list_tables(&NamespaceIdent::new("ns1".to_string())) @@ -1969,7 +1976,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); catalog .drop_table(&TableIdent::new( @@ -1995,7 +2002,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); assert!( catalog @@ -2023,7 +2030,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); catalog .rename_table( @@ -2054,7 +2061,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table = catalog .load_table(&TableIdent::new( @@ -2168,7 +2175,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = 
RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table = catalog .load_table(&TableIdent::new( @@ -2201,7 +2208,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table_creation = TableCreation::builder() .name("test1".to_string()) @@ -2347,7 +2354,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table_creation = TableCreation::builder() .name("test1".to_string()) @@ -2413,7 +2420,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table1 = { let file = File::open(format!( @@ -2553,7 +2560,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table1 = { let file = File::open(format!( @@ -2614,7 +2621,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table_ident = TableIdent::new(NamespaceIdent::new("ns1".to_string()), "test1".to_string()); let metadata_location = String::from( @@ -2662,7 +2669,7 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build()); + let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table_ident = TableIdent::new(NamespaceIdent::new("ns1".to_string()), "test1".to_string()); diff --git a/crates/catalog/s3tables/src/catalog.rs b/crates/catalog/s3tables/src/catalog.rs index 2ba80e1fb5..b5205589d1 100644 --- a/crates/catalog/s3tables/src/catalog.rs +++ b/crates/catalog/s3tables/src/catalog.rs @@ -693,7 +693,7 @@ mod tests { props: HashMap::new(), }; - Ok(Some(S3TablesCatalog::new(config).await?)) + Ok(Some(S3TablesCatalog::new(config, None).await?)) } #[tokio::test] diff --git a/crates/examples/src/oss_backend.rs b/crates/examples/src/oss_backend.rs index 8f0b866d23..9835b8dc44 100644 --- a/crates/examples/src/oss_backend.rs +++ b/crates/examples/src/oss_backend.rs @@ -16,8 +16,10 @@ // under the License. use std::collections::HashMap; +use std::sync::Arc; use futures::stream::StreamExt; +use iceberg::io::OpenDalStorageFactory; use iceberg::{Catalog, CatalogBuilder, NamespaceIdent, TableIdent}; use iceberg_catalog_rest::{REST_CATALOG_PROP_URI, RestCatalogBuilder}; @@ -41,8 +43,9 @@ static OSS_ACCESS_KEY_SECRET: &str = "99999999999999999999999999999999"; /// The example also requires valid OSS credentials and endpoint to be configured. #[tokio::main] async fn main() { - // Create the REST iceberg catalog. + // Create the REST iceberg catalog with explicit OSS storage factory. 
let catalog = RestCatalogBuilder::default() + .with_storage_factory(Arc::new(OpenDalStorageFactory::Oss)) .load( "rest", HashMap::from([ diff --git a/crates/integrations/datafusion/src/physical_plan/project.rs b/crates/integrations/datafusion/src/physical_plan/project.rs index 77a02f51e2..f31bb29dd0 100644 --- a/crates/integrations/datafusion/src/physical_plan/project.rs +++ b/crates/integrations/datafusion/src/physical_plan/project.rs @@ -446,7 +446,7 @@ mod tests { let table = iceberg::table::Table::builder() .metadata(table_metadata.metadata) .identifier(TableIdent::from_strs(["test", "table"]).unwrap()) - .file_io(FileIO::from_path("/tmp").unwrap().build().unwrap()) + .file_io(FileIO::new_with_fs()) .metadata_location("/test/metadata.json".to_string()) .build() .unwrap(); @@ -504,7 +504,7 @@ mod tests { let table = iceberg::table::Table::builder() .metadata(table_metadata.metadata) .identifier(TableIdent::from_strs(["test", "table"]).unwrap()) - .file_io(FileIO::from_path("/tmp").unwrap().build().unwrap()) + .file_io(FileIO::new_with_fs()) .metadata_location("/test/metadata.json".to_string()) .build() .unwrap(); @@ -576,7 +576,7 @@ mod tests { let table = iceberg::table::Table::builder() .metadata(table_metadata.metadata) .identifier(TableIdent::from_strs(["test", "table"]).unwrap()) - .file_io(FileIO::from_path("/tmp").unwrap().build().unwrap()) + .file_io(FileIO::new_with_fs()) .metadata_location("/test/metadata.json".to_string()) .build() .unwrap(); diff --git a/crates/integrations/datafusion/src/physical_plan/repartition.rs b/crates/integrations/datafusion/src/physical_plan/repartition.rs index 2d1d7f862c..78063c7309 100644 --- a/crates/integrations/datafusion/src/physical_plan/repartition.rs +++ b/crates/integrations/datafusion/src/physical_plan/repartition.rs @@ -221,7 +221,7 @@ mod tests { Table::builder() .metadata(table_metadata.metadata) .identifier(TableIdent::from_strs(["test", "table"]).unwrap()) - .file_io(FileIO::from_path("/tmp").unwrap().build().unwrap()) + .file_io(FileIO::new_with_fs()) .metadata_location("/test/metadata.json".to_string()) .build() .unwrap() @@ -378,7 +378,7 @@ mod tests { let table = Table::builder() .metadata(table_metadata.metadata) .identifier(TableIdent::from_strs(["test", "bucketed_table"]).unwrap()) - .file_io(FileIO::from_path("/tmp").unwrap().build().unwrap()) + .file_io(FileIO::new_with_fs()) .metadata_location("/test/bucketed_metadata.json".to_string()) .build() .unwrap(); @@ -463,7 +463,7 @@ mod tests { let table = Table::builder() .metadata(table_metadata.metadata) .identifier(TableIdent::from_strs(["test", "partitioned_bucketed_table"]).unwrap()) - .file_io(FileIO::from_path("/tmp").unwrap().build().unwrap()) + .file_io(FileIO::new_with_fs()) .metadata_location("/test/partitioned_bucketed_metadata.json".to_string()) .build() .unwrap(); @@ -547,7 +547,7 @@ mod tests { let table = Table::builder() .metadata(table_metadata.metadata) .identifier(TableIdent::from_strs(["test", "none_table"]).unwrap()) - .file_io(FileIO::from_path("/tmp").unwrap().build().unwrap()) + .file_io(FileIO::new_with_fs()) .metadata_location("/test/none_metadata.json".to_string()) .build() .unwrap(); @@ -619,7 +619,7 @@ mod tests { let table = Table::builder() .metadata(table_metadata.metadata) .identifier(TableIdent::from_strs(["test", "range_only_table"]).unwrap()) - .file_io(FileIO::from_path("/tmp").unwrap().build().unwrap()) + .file_io(FileIO::new_with_fs()) .metadata_location("/test/range_only_metadata.json".to_string()) .build() .unwrap(); @@ -694,7 +694,7 @@ 
mod tests { let table = Table::builder() .metadata(table_metadata.metadata) .identifier(TableIdent::from_strs(["test", "mixed_transforms_table"]).unwrap()) - .file_io(FileIO::from_path("/tmp").unwrap().build().unwrap()) + .file_io(FileIO::new_with_fs()) .metadata_location("/test/mixed_transforms_metadata.json".to_string()) .build() .unwrap(); @@ -777,7 +777,7 @@ mod tests { let table = Table::builder() .metadata(table_metadata.metadata) .identifier(TableIdent::from_strs(["test", "temporal_partition"]).unwrap()) - .file_io(FileIO::from_path("/tmp").unwrap().build().unwrap()) + .file_io(FileIO::new_with_fs()) .metadata_location("/test/temporal_metadata.json".to_string()) .build() .unwrap(); @@ -850,7 +850,7 @@ mod tests { let table = Table::builder() .metadata(table_metadata.metadata) .identifier(TableIdent::from_strs(["test", "identity_partition"]).unwrap()) - .file_io(FileIO::from_path("/tmp").unwrap().build().unwrap()) + .file_io(FileIO::new_with_fs()) .metadata_location("/test/identity_metadata.json".to_string()) .build() .unwrap(); diff --git a/crates/integrations/datafusion/src/table/mod.rs b/crates/integrations/datafusion/src/table/mod.rs index ae87342fa5..75b7988d8d 100644 --- a/crates/integrations/datafusion/src/table/mod.rs +++ b/crates/integrations/datafusion/src/table/mod.rs @@ -372,10 +372,7 @@ mod tests { env!("CARGO_MANIFEST_DIR"), metadata_file_name ); - let file_io = FileIO::from_path(&metadata_file_path) - .unwrap() - .build() - .unwrap(); + let file_io = FileIO::new_with_fs(); let static_identifier = TableIdent::from_strs(["static_ns", "static_table"]).unwrap(); let static_table = StaticTable::from_metadata_file(&metadata_file_path, static_identifier, file_io) diff --git a/crates/integrations/datafusion/src/task_writer.rs b/crates/integrations/datafusion/src/task_writer.rs index 5329f26458..4ac5323bb8 100644 --- a/crates/integrations/datafusion/src/task_writer.rs +++ b/crates/integrations/datafusion/src/task_writer.rs @@ -267,7 +267,7 @@ mod tests { use datafusion::arrow::array::{ArrayRef, Int32Array, RecordBatch, StringArray, StructArray}; use datafusion::arrow::datatypes::{DataType, Field, Schema}; use iceberg::arrow::PROJECTED_PARTITION_VALUE_COLUMN; - use iceberg::io::FileIOBuilder; + use iceberg::io::FileIO; use iceberg::spec::{DataFileFormat, NestedField, PartitionSpec, PrimitiveType, Type}; use iceberg::writer::base_writer::data_file_writer::DataFileWriterBuilder; use iceberg::writer::file_writer::ParquetWriterBuilder; @@ -349,7 +349,7 @@ mod tests { DefaultFileNameGenerator, >, > { - let file_io = FileIOBuilder::new_fs_io().build()?; + let file_io = FileIO::new_with_fs(); let location_gen = DefaultLocationGenerator::with_data_location( temp_dir.path().to_str().unwrap().to_string(), ); From 795940b651497c827b44255e197f1004cfc40e90 Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Thu, 5 Feb 2026 12:42:17 -0800 Subject: [PATCH 08/13] fix fmt --- crates/catalog/rest/src/catalog.rs | 69 ++++++++++++++++++++---------- 1 file changed, 46 insertions(+), 23 deletions(-) diff --git a/crates/catalog/rest/src/catalog.rs b/crates/catalog/rest/src/catalog.rs index 620752df22..f469c7b115 100644 --- a/crates/catalog/rest/src/catalog.rs +++ b/crates/catalog/rest/src/catalog.rs @@ -994,7 +994,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); assert_eq!( catalog @@ -1389,7 +1390,8 @@ mod tests { 
.create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let _namespaces = catalog.list_namespaces(None).await.unwrap(); @@ -1416,7 +1418,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let namespaces = catalog.list_namespaces(None).await.unwrap(); @@ -1464,7 +1467,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let namespaces = catalog.list_namespaces(None).await.unwrap(); @@ -1560,7 +1564,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let namespaces = catalog.list_namespaces(None).await.unwrap(); @@ -1610,7 +1615,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let namespaces = catalog .create_namespace( @@ -1650,7 +1656,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let namespaces = catalog .get_namespace(&NamespaceIdent::new("ns1".to_string())) @@ -1680,7 +1687,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); assert!( catalog @@ -1705,7 +1713,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); catalog .drop_namespace(&NamespaceIdent::new("ns1".to_string())) @@ -1742,7 +1751,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let tables = catalog .list_tables(&NamespaceIdent::new("ns1".to_string())) @@ -1807,7 +1817,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let tables = catalog .list_tables(&NamespaceIdent::new("ns1".to_string())) @@ -1935,7 +1946,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let tables = catalog .list_tables(&NamespaceIdent::new("ns1".to_string())) @@ -1976,7 +1988,8 @@ mod tests { .create_async() .await; - let catalog = 
RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); catalog .drop_table(&TableIdent::new( @@ -2002,7 +2015,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); assert!( catalog @@ -2030,7 +2044,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); catalog .rename_table( @@ -2061,7 +2076,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table = catalog .load_table(&TableIdent::new( @@ -2175,7 +2191,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table = catalog .load_table(&TableIdent::new( @@ -2208,7 +2225,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table_creation = TableCreation::builder() .name("test1".to_string()) @@ -2354,7 +2372,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table_creation = TableCreation::builder() .name("test1".to_string()) @@ -2420,7 +2439,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table1 = { let file = File::open(format!( @@ -2560,7 +2580,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table1 = { let file = File::open(format!( @@ -2621,7 +2642,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table_ident = TableIdent::new(NamespaceIdent::new("ns1".to_string()), "test1".to_string()); let metadata_location = String::from( @@ -2669,7 +2691,8 @@ mod tests { .create_async() .await; - let catalog = RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); + let catalog = + RestCatalog::new(RestCatalogConfig::builder().uri(server.url()).build(), None); let table_ident = TableIdent::new(NamespaceIdent::new("ns1".to_string()), "test1".to_string()); From 632cfbcf960087310ae12614494fdaee834a5e9b Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Thu, 5 Feb 2026 13:36:38 -0800 Subject: [PATCH 09/13] Use opendal storage in python binding for now --- 
.../python/src/datafusion_table_provider.rs | 34 +++++++++++++++---- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/bindings/python/src/datafusion_table_provider.rs b/bindings/python/src/datafusion_table_provider.rs index 8db7223b34..57dbd623b4 100644 --- a/bindings/python/src/datafusion_table_provider.rs +++ b/bindings/python/src/datafusion_table_provider.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use datafusion_ffi::table_provider::FFI_TableProvider; use iceberg::TableIdent; -use iceberg::io::FileIO; +use iceberg::io::{FileIOBuilder, OpenDalStorageFactory, StorageFactory}; use iceberg::table::StaticTable; use iceberg_datafusion::table::IcebergStaticTableProvider; use pyo3::exceptions::PyRuntimeError; @@ -30,6 +30,29 @@ use pyo3::types::PyCapsule; use crate::runtime::runtime; +/// Parse the scheme from a URL and return the appropriate StorageFactory. +fn storage_factory_from_path(path: &str) -> PyResult> { + let scheme = path + .split("://") + .next() + .ok_or_else(|| PyRuntimeError::new_err(format!("Invalid path, missing scheme: {path}")))?; + + let factory: Arc = match scheme { + "file" | "" => Arc::new(OpenDalStorageFactory::Fs), + "s3" | "s3a" => Arc::new(OpenDalStorageFactory::S3 { + customized_credential_load: None, + }), + "memory" => Arc::new(OpenDalStorageFactory::Memory), + _ => { + return Err(PyRuntimeError::new_err(format!( + "Unsupported storage scheme: {scheme}" + ))) + } + }; + + Ok(factory) +} + #[pyclass(name = "IcebergDataFusionTable")] pub struct PyIcebergDataFusionTable { inner: Arc, @@ -49,16 +72,15 @@ impl PyIcebergDataFusionTable { let table_ident = TableIdent::from_strs(identifier) .map_err(|e| PyRuntimeError::new_err(format!("Invalid table identifier: {e}")))?; - let mut builder = FileIO::from_path(&metadata_location) - .map_err(|e| PyRuntimeError::new_err(format!("Failed to init FileIO: {e}")))?; + let factory = storage_factory_from_path(&metadata_location)?; + + let mut builder = FileIOBuilder::new(factory); if let Some(props) = file_io_properties { builder = builder.with_props(props); } - let file_io = builder - .build() - .map_err(|e| PyRuntimeError::new_err(format!("Failed to build FileIO: {e}")))?; + let file_io = builder.build(); let static_table = StaticTable::from_metadata_file(&metadata_location, table_ident, file_io) From 82ecb9b387247667fd23e1db891e2754f34cb106 Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Thu, 5 Feb 2026 13:42:49 -0800 Subject: [PATCH 10/13] fix fmt --- bindings/python/src/datafusion_table_provider.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bindings/python/src/datafusion_table_provider.rs b/bindings/python/src/datafusion_table_provider.rs index 57dbd623b4..821a9ee98b 100644 --- a/bindings/python/src/datafusion_table_provider.rs +++ b/bindings/python/src/datafusion_table_provider.rs @@ -46,7 +46,7 @@ fn storage_factory_from_path(path: &str) -> PyResult> { _ => { return Err(PyRuntimeError::new_err(format!( "Unsupported storage scheme: {scheme}" - ))) + ))); } }; From b271c5825a4b7471e0fe3bc57a3309fd7969681e Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Thu, 5 Feb 2026 14:01:27 -0800 Subject: [PATCH 11/13] fix tests --- .../datafusion/tests/integration_datafusion_test.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crates/integrations/datafusion/tests/integration_datafusion_test.rs b/crates/integrations/datafusion/tests/integration_datafusion_test.rs index 6f8898abb8..b99c0875c8 100644 --- a/crates/integrations/datafusion/tests/integration_datafusion_test.rs +++ 
b/crates/integrations/datafusion/tests/integration_datafusion_test.rs @@ -26,6 +26,7 @@ use datafusion::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; use datafusion::execution::context::SessionContext; use datafusion::parquet::arrow::PARQUET_FIELD_ID_META_KEY; use expect_test::expect; +use iceberg::io::LocalFsStorageFactory; use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; use iceberg::spec::{ NestedField, PrimitiveType, Schema, StructType, Transform, Type, UnboundPartitionSpec, @@ -44,6 +45,7 @@ fn temp_path() -> String { async fn get_iceberg_catalog() -> MemoryCatalog { MemoryCatalogBuilder::default() + .with_storage_factory(Arc::new(LocalFsStorageFactory)) .load( "memory", HashMap::from([(MEMORY_CATALOG_WAREHOUSE.to_string(), temp_path())]), From 1631176b1fb26abf4454523ccdcec147c0c03577 Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Thu, 5 Feb 2026 14:58:42 -0800 Subject: [PATCH 12/13] fixing tests --- crates/iceberg/src/catalog/memory/catalog.rs | 4 ++-- .../datafusion/tests/integration_datafusion_test.rs | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/crates/iceberg/src/catalog/memory/catalog.rs b/crates/iceberg/src/catalog/memory/catalog.rs index e008de8050..60e298bc89 100644 --- a/crates/iceberg/src/catalog/memory/catalog.rs +++ b/crates/iceberg/src/catalog/memory/catalog.rs @@ -25,7 +25,7 @@ use futures::lock::{Mutex, MutexGuard}; use itertools::Itertools; use super::namespace_state::NamespaceState; -use crate::io::{FileIO, FileIOBuilder, MemoryStorageFactory, StorageFactory}; +use crate::io::{FileIO, FileIOBuilder, LocalFsStorageFactory, StorageFactory}; use crate::spec::{TableMetadata, TableMetadataBuilder}; use crate::table::Table; use crate::{ @@ -129,7 +129,7 @@ impl MemoryCatalog { storage_factory: Option>, ) -> Result { // Use provided factory or default to MemoryStorageFactory - let factory = storage_factory.unwrap_or_else(|| Arc::new(MemoryStorageFactory)); + let factory = storage_factory.unwrap_or_else(|| Arc::new(LocalFsStorageFactory)); Ok(Self { root_namespace_state: Mutex::new(NamespaceState::default()), diff --git a/crates/integrations/datafusion/tests/integration_datafusion_test.rs b/crates/integrations/datafusion/tests/integration_datafusion_test.rs index b99c0875c8..6f8898abb8 100644 --- a/crates/integrations/datafusion/tests/integration_datafusion_test.rs +++ b/crates/integrations/datafusion/tests/integration_datafusion_test.rs @@ -26,7 +26,6 @@ use datafusion::arrow::datatypes::{DataType, Field, Schema as ArrowSchema}; use datafusion::execution::context::SessionContext; use datafusion::parquet::arrow::PARQUET_FIELD_ID_META_KEY; use expect_test::expect; -use iceberg::io::LocalFsStorageFactory; use iceberg::memory::{MEMORY_CATALOG_WAREHOUSE, MemoryCatalogBuilder}; use iceberg::spec::{ NestedField, PrimitiveType, Schema, StructType, Transform, Type, UnboundPartitionSpec, @@ -45,7 +44,6 @@ fn temp_path() -> String { async fn get_iceberg_catalog() -> MemoryCatalog { MemoryCatalogBuilder::default() - .with_storage_factory(Arc::new(LocalFsStorageFactory)) .load( "memory", HashMap::from([(MEMORY_CATALOG_WAREHOUSE.to_string(), temp_path())]), From 082d2d210914e488ea797eace3a4339a351590c9 Mon Sep 17 00:00:00 2001 From: Shawn Chang Date: Thu, 5 Feb 2026 15:20:45 -0800 Subject: [PATCH 13/13] fixing tests --- crates/integration_tests/tests/common/mod.rs | 5 +++++ crates/integration_tests/tests/conflict_commit_test.rs | 4 ++++ crates/integration_tests/tests/read_evolved_schema.rs | 6 ++++++ 
crates/integration_tests/tests/read_positional_deletes.rs | 6 ++++++ 4 files changed, 21 insertions(+) diff --git a/crates/integration_tests/tests/common/mod.rs b/crates/integration_tests/tests/common/mod.rs index 00039d8896..17701aebad 100644 --- a/crates/integration_tests/tests/common/mod.rs +++ b/crates/integration_tests/tests/common/mod.rs @@ -16,7 +16,9 @@ // under the License. use std::collections::HashMap; +use std::sync::Arc; +use iceberg::io::OpenDalStorageFactory; use iceberg::spec::{NestedField, PrimitiveType, Schema, Type}; use iceberg::{Catalog, CatalogBuilder, Namespace, NamespaceIdent}; use iceberg_catalog_rest::RestCatalogBuilder; @@ -25,6 +27,9 @@ use iceberg_integration_tests::get_test_fixture; pub async fn random_ns() -> Namespace { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() + .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { + customized_credential_load: None, + })) .load("rest", fixture.catalog_config.clone()) .await .unwrap(); diff --git a/crates/integration_tests/tests/conflict_commit_test.rs b/crates/integration_tests/tests/conflict_commit_test.rs index d171217b04..68b94c3fd8 100644 --- a/crates/integration_tests/tests/conflict_commit_test.rs +++ b/crates/integration_tests/tests/conflict_commit_test.rs @@ -24,6 +24,7 @@ use std::sync::Arc; use arrow_array::{ArrayRef, BooleanArray, Int32Array, RecordBatch, StringArray}; use common::{random_ns, test_schema}; use futures::TryStreamExt; +use iceberg::io::OpenDalStorageFactory; use iceberg::transaction::{ApplyTransactionAction, Transaction}; use iceberg::writer::base_writer::data_file_writer::DataFileWriterBuilder; use iceberg::writer::file_writer::ParquetWriterBuilder; @@ -41,6 +42,9 @@ use parquet::file::properties::WriterProperties; async fn test_append_data_file_conflict() { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() + .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { + customized_credential_load: None, + })) .load("rest", fixture.catalog_config.clone()) .await .unwrap(); diff --git a/crates/integration_tests/tests/read_evolved_schema.rs b/crates/integration_tests/tests/read_evolved_schema.rs index 62ec610a63..31432b1cc9 100644 --- a/crates/integration_tests/tests/read_evolved_schema.rs +++ b/crates/integration_tests/tests/read_evolved_schema.rs @@ -17,9 +17,12 @@ //! Integration tests for rest catalog. +use std::sync::Arc; + use arrow_array::{Decimal128Array, Float64Array, Int64Array, StringArray}; use futures::TryStreamExt; use iceberg::expr::Reference; +use iceberg::io::OpenDalStorageFactory; use iceberg::spec::Datum; use iceberg::{Catalog, CatalogBuilder, TableIdent}; use iceberg_catalog_rest::RestCatalogBuilder; @@ -30,6 +33,9 @@ use ordered_float::OrderedFloat; async fn test_evolved_schema() { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() + .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { + customized_credential_load: None, + })) .load("rest", fixture.catalog_config.clone()) .await .unwrap(); diff --git a/crates/integration_tests/tests/read_positional_deletes.rs b/crates/integration_tests/tests/read_positional_deletes.rs index 96add41299..38344e944a 100644 --- a/crates/integration_tests/tests/read_positional_deletes.rs +++ b/crates/integration_tests/tests/read_positional_deletes.rs @@ -17,7 +17,10 @@ //! Integration tests for rest catalog. 
+use std::sync::Arc; + use futures::TryStreamExt; +use iceberg::io::OpenDalStorageFactory; use iceberg::{Catalog, CatalogBuilder, TableIdent}; use iceberg_catalog_rest::RestCatalogBuilder; use iceberg_integration_tests::get_test_fixture; @@ -26,6 +29,9 @@ use iceberg_integration_tests::get_test_fixture; async fn test_read_table_with_positional_deletes() { let fixture = get_test_fixture(); let rest_catalog = RestCatalogBuilder::default() + .with_storage_factory(Arc::new(OpenDalStorageFactory::S3 { + customized_credential_load: None, + })) .load("rest", fixture.catalog_config.clone()) .await .unwrap();
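
A reviewer-facing sketch of the API surface after this series (not part of any patch in it): the calls below are the ones exercised in the hunks above, while the helper name example_file_io and the endpoint/region values are placeholders for illustration only.

    use std::sync::Arc;

    use iceberg::io::{FileIO, FileIOBuilder, OpenDalStorageFactory, S3_ENDPOINT, S3_REGION};

    fn example_file_io() -> FileIO {
        // Local-filesystem and in-memory FileIO become direct constructors.
        let _fs_io = FileIO::new_with_fs();
        let _mem_io = FileIO::new_with_memory();

        // Object stores are selected with an explicit StorageFactory instead of a
        // scheme string; as in the updated tests, build() is called without unwrap().
        FileIOBuilder::new(Arc::new(OpenDalStorageFactory::S3 {
            customized_credential_load: None,
        }))
        .with_props(vec![
            (S3_ENDPOINT, "http://localhost:9000".to_string()),
            (S3_REGION, "us-east-1".to_string()),
        ])
        .build()
    }

Catalog builders follow the same pattern: the factory is passed up front with .with_storage_factory(Arc::new(...)) before .load(...), as shown in the integration-test hunks of PATCH 13/13.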