From 4aac7d0bc5b037c5fcb5d06de4e09adf6350075f Mon Sep 17 00:00:00 2001 From: Russ Fellows Date: Sun, 31 May 2026 17:49:14 -0600 Subject: [PATCH 1/2] fix: restrict TorchIterableDatasetSimple to S3/AISTORE; gate s3dlio Parquet gen on storage type Two LOCAL_FS bug fixes: 1. Training hang on LOCAL_FS (#391) TorchIterableDatasetSimple was selected for all NPZ/NPY/JPEG/PNG formats regardless of storage type. With read_threads>1, DataLoader forks worker processes after _local_fs_iterable_mixin.py imports a module-level ThreadPoolExecutor. The executor is not fork-safe; child processes deadlock silently with no error output. Fix: add 'and self._args.storage_type in _s3_types' to the use_simple_iterable_dataset guard. LOCAL_FS falls back to map-style TorchDataset which does not fork. 2. Parquet datagen RuntimeError on LOCAL_FS (#385) ParquetGenerator unconditionally called s3dlio.generate_and_write_parquet_schema_streaming() when parquet.use_s3dlio_gen=true, including for local paths. s3dlio requires an s3:// URI and raises RuntimeError: URI must start with s3://. Fix: add StorageType import and guard the s3dlio path on storage_type in (S3, AISTORE). LOCAL_FS falls through to the PyArrow local write path. Fixes #391 Fixes #385 --- dlio_benchmark/data_generator/parquet_generator.py | 7 +++++-- dlio_benchmark/data_loader/torch_data_loader.py | 7 +++++++ uv.lock | 2 +- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/dlio_benchmark/data_generator/parquet_generator.py b/dlio_benchmark/data_generator/parquet_generator.py index 88867426..96358475 100755 --- a/dlio_benchmark/data_generator/parquet_generator.py +++ b/dlio_benchmark/data_generator/parquet_generator.py @@ -20,7 +20,7 @@ import pyarrow as pa import pyarrow.parquet as pq -from dlio_benchmark.common.enumerations import Compression +from dlio_benchmark.common.enumerations import Compression, StorageType from dlio_benchmark.data_generator.data_generator import DataGenerator from dlio_benchmark.utils.utility import progress, gen_random_tensor, DLIOMPI import dgen_py as _dgen_py @@ -314,7 +314,10 @@ def generate(self): # When enabled, hand off entirely to s3dlio.generate_and_write_parquet_schema_streaming(). # Row groups are pipelined: generation and multipart upload run # concurrently — no full-file buffer, peak RAM ~2× one row group. - if self.use_s3dlio_gen and self.parquet_columns: + # Restricted to object storage (S3/AISTORE): s3dlio requires an + # s3:// URI and raises RuntimeError for local paths (issue #385). + _s3_storage = (StorageType.S3, StorageType.AISTORE) + if self.use_s3dlio_gen and self.parquet_columns and self._args.storage_type in _s3_storage: import s3dlio as _s3dlio _cols = [(str(c.get('name', 'data')), int(c.get('size', 1))) for c in self.parquet_columns] diff --git a/dlio_benchmark/data_loader/torch_data_loader.py b/dlio_benchmark/data_loader/torch_data_loader.py index 7fe2e254..b4187027 100644 --- a/dlio_benchmark/data_loader/torch_data_loader.py +++ b/dlio_benchmark/data_loader/torch_data_loader.py @@ -465,9 +465,16 @@ def read(self): StorageType.LOCAL_FS, ) ) + # TorchIterableDatasetSimple uses DataLoader(num_workers>0) which forks + # worker processes via os.fork(). On LOCAL_FS, this fork-after-module-import + # pattern causes a ThreadPoolExecutor deadlock (the executor's background + # thread is not fork-safe). Restrict the iterable path to object storage + # (S3/AISTORE) only where the prefetch benefit is most significant and + # the fork issue does not apply. LOCAL_FS falls through to map-style TorchDataset. use_simple_iterable_dataset = ( self.format_type in _simple_iterable_formats and not use_rg_iterable_dataset + and self._args.storage_type in _s3_types ) # Determine concrete reader class name and access pattern for logging. diff --git a/uv.lock b/uv.lock index 98fc2e45..1f35c215 100644 --- a/uv.lock +++ b/uv.lock @@ -226,7 +226,7 @@ wheels = [ [[package]] name = "dlio-benchmark" -version = "3.0.1" +version = "3.0.2" source = { editable = "." } dependencies = [ { name = "dgen-py", marker = "sys_platform == 'linux'" }, From 05b7d91d94f55d80d3fccb9c8d135a3fe8502813 Mon Sep 17 00:00:00 2001 From: Devasena Inupakutika Date: Tue, 2 Jun 2026 19:54:17 +0000 Subject: [PATCH 2/2] fix for NameError on _s3_types --- dlio_benchmark/data_loader/torch_data_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlio_benchmark/data_loader/torch_data_loader.py b/dlio_benchmark/data_loader/torch_data_loader.py index b4187027..3ca9ccf2 100644 --- a/dlio_benchmark/data_loader/torch_data_loader.py +++ b/dlio_benchmark/data_loader/torch_data_loader.py @@ -465,6 +465,7 @@ def read(self): StorageType.LOCAL_FS, ) ) + _s3_types = (StorageType.S3, StorageType.AISTORE) # TorchIterableDatasetSimple uses DataLoader(num_workers>0) which forks # worker processes via os.fork(). On LOCAL_FS, this fork-after-module-import # pattern causes a ThreadPoolExecutor deadlock (the executor's background @@ -481,7 +482,6 @@ def read(self): _opts = getattr(self._args, "storage_options", {}) or {} _lib = _opts.get("storage_library", "none") _st = self._args.storage_type - _s3_types = (StorageType.S3, StorageType.AISTORE) _s3_libs = ("s3dlio", "s3torchconnector", "minio") _nw = self._args.read_threads