diff --git a/dlio_benchmark/data_generator/parquet_generator.py b/dlio_benchmark/data_generator/parquet_generator.py index 88867426..96358475 100755 --- a/dlio_benchmark/data_generator/parquet_generator.py +++ b/dlio_benchmark/data_generator/parquet_generator.py @@ -20,7 +20,7 @@ import pyarrow as pa import pyarrow.parquet as pq -from dlio_benchmark.common.enumerations import Compression +from dlio_benchmark.common.enumerations import Compression, StorageType from dlio_benchmark.data_generator.data_generator import DataGenerator from dlio_benchmark.utils.utility import progress, gen_random_tensor, DLIOMPI import dgen_py as _dgen_py @@ -314,7 +314,10 @@ def generate(self): # When enabled, hand off entirely to s3dlio.generate_and_write_parquet_schema_streaming(). # Row groups are pipelined: generation and multipart upload run # concurrently — no full-file buffer, peak RAM ~2× one row group. - if self.use_s3dlio_gen and self.parquet_columns: + # Restricted to object storage (S3/AISTORE): s3dlio requires an + # s3:// URI and raises RuntimeError for local paths (issue #385). + _s3_storage = (StorageType.S3, StorageType.AISTORE) + if self.use_s3dlio_gen and self.parquet_columns and self._args.storage_type in _s3_storage: import s3dlio as _s3dlio _cols = [(str(c.get('name', 'data')), int(c.get('size', 1))) for c in self.parquet_columns] diff --git a/dlio_benchmark/data_loader/torch_data_loader.py b/dlio_benchmark/data_loader/torch_data_loader.py index 7fe2e254..3ca9ccf2 100644 --- a/dlio_benchmark/data_loader/torch_data_loader.py +++ b/dlio_benchmark/data_loader/torch_data_loader.py @@ -465,16 +465,23 @@ def read(self): StorageType.LOCAL_FS, ) ) + _s3_types = (StorageType.S3, StorageType.AISTORE) + # TorchIterableDatasetSimple uses DataLoader(num_workers>0) which forks + # worker processes via os.fork(). On LOCAL_FS, this fork-after-module-import + # pattern causes a ThreadPoolExecutor deadlock (the executor's background + # thread is not fork-safe). Restrict the iterable path to object storage + # (S3/AISTORE) only where the prefetch benefit is most significant and + # the fork issue does not apply. LOCAL_FS falls through to map-style TorchDataset. use_simple_iterable_dataset = ( self.format_type in _simple_iterable_formats and not use_rg_iterable_dataset + and self._args.storage_type in _s3_types ) # Determine concrete reader class name and access pattern for logging. _opts = getattr(self._args, "storage_options", {}) or {} _lib = _opts.get("storage_library", "none") _st = self._args.storage_type - _s3_types = (StorageType.S3, StorageType.AISTORE) _s3_libs = ("s3dlio", "s3torchconnector", "minio") _nw = self._args.read_threads diff --git a/uv.lock b/uv.lock index 98fc2e45..1f35c215 100644 --- a/uv.lock +++ b/uv.lock @@ -226,7 +226,7 @@ wheels = [ [[package]] name = "dlio-benchmark" -version = "3.0.1" +version = "3.0.2" source = { editable = "." } dependencies = [ { name = "dgen-py", marker = "sys_platform == 'linux'" },