diff --git a/.gitignore b/.gitignore
index 912d1d44..0aa701d7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -163,4 +163,7 @@ dmypy.json
 #Apple system files
 .DS_Store
 /.idea/
-*venv*
\ No newline at end of file
+*venv*
+# Benchmark simulation output files
+sim_*.tsv
+sim_*.tsv.zst
diff --git a/README.md b/README.md
index e345935a..b810f6d9 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@
 - **Parquet format** (new) — full schema-driven generation and reading. Supports mixed-dtype column schemas (`int32`, `float32`, `float16`, `uint8`, `bool`, etc.) with configurable embedding sizes, LZ4/ZSTD compression, and per-column filtering on reads. Legacy single-column mode is preserved for backward compatibility.
 
 ### Storage Backends
-- **S3 / S3-compatible object storage** (new) — three client libraries supported: [s3dlio](https://github.com/russfellows/s3dlio) (recommended, Rust-backed, multi-endpoint), [s3torchconnector](https://github.com/awslabs/s3-connector-for-pytorch) (PyTorch only), and the [MinIO Python SDK](https://min.io/docs/minio/linux/developers/python/API.html).
+- **S3 / S3-compatible object storage** (new) — three client libraries supported: [s3dlio](https://pypi.org/project/s3dlio/) (recommended, Rust-backed, multi-endpoint), [s3torchconnector](https://github.com/awslabs/s3-connector-for-pytorch) (PyTorch only), and the [MinIO Python SDK](https://min.io/docs/minio/linux/developers/python/API.html).
 - **Multi-endpoint load balancing** — `S3_ENDPOINT_URIS` distributes datagen write load across multiple S3 servers, one endpoint per MPI rank (round-robin). Eliminates single-node bottlenecks for large-scale data generation.
 - **Storage env-var overrides** — `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_ENDPOINT_URL`, `AWS_REGION`, `DLIO_BUCKET`, `DLIO_STORAGE_TYPE`, and `DLIO_STORAGE_LIBRARY` are all read automatically; no YAML changes needed for credential injection.
 - **Post-generation settle guard** — configurable `post_generation_settle_seconds` for eventual-consistency object stores that need time to propagate written objects before training reads begin.
@@ -49,7 +49,7 @@ DLIO is an I/O benchmark for Deep Learning. DLIO is aimed at emulating the I/O b
 
 DLIO supports multiple storage backends out of the box:
 - **Local filesystem** — the default, for NFS, Lustre, GPFS, and local NVMe
-- **AWS S3 / S3-compatible object storage** — via [s3dlio](https://github.com/russfellows/s3dlio), [s3torchconnector](https://github.com/awslabs/s3-connector-for-pytorch), or the [MinIO Python SDK](https://min.io/docs/minio/linux/developers/python/API.html)
+- **AWS S3 / S3-compatible object storage** — via [s3dlio](https://pypi.org/project/s3dlio/), [s3torchconnector](https://github.com/awslabs/s3-connector-for-pytorch), or the [MinIO Python SDK](https://min.io/docs/minio/linux/developers/python/API.html)
 - **AIStore** — via the native AIStore Python SDK
 
 Object storage backends are configured through the `storage:` block in the workload YAML file (see [Object Storage Configuration](#object-storage-configuration) below).
@@ -224,7 +224,7 @@ Object storage is enabled by adding a `storage:` block to the workload YAML.  Th
 
 | `storage_library` | Description | Framework support |
 |---|---|---|
-| `s3dlio` | High-performance Rust-backed client via [s3dlio](https://github.com/russfellows/s3dlio). Parallel GET, range optimization, multi-endpoint load balancing. | PyTorch + TensorFlow |
+| `s3dlio` | High-performance Rust-backed client via [s3dlio](https://pypi.org/project/s3dlio/). Parallel GET, range optimization, multi-endpoint load balancing. | PyTorch + TensorFlow |
 | `s3torchconnector` | AWS S3 Connector for PyTorch — streaming single-file GET. | PyTorch only |
 | `minio` | MinIO Python SDK via `ThreadPoolExecutor`. | PyTorch + TensorFlow |
 
@@ -376,7 +376,7 @@ The YAML file is loaded through hydra (https://hydra.cc/). The default setting a
 
 * File format support: we only support tfrecord, hdf5, npz, csv, jpg, jpeg formats. Other data formats can be extended.
 
-* Storage backend support: we support local filesystem (`local_fs`), AWS S3 and S3-compatible object stores (`s3`), and AIStore (`aistore`). For S3 storage, three client libraries are available: [s3dlio](https://github.com/russfellows/s3dlio) (recommended), [s3torchconnector](https://github.com/awslabs/s3-connector-for-pytorch) (PyTorch only), and the [MinIO SDK](https://min.io/docs/minio/linux/developers/python/API.html). Other storage backends can be extended.
+* Storage backend support: we support local filesystem (`local_fs`), AWS S3 and S3-compatible object stores (`s3`), and AIStore (`aistore`). For S3 storage, three client libraries are available: [s3dlio](https://pypi.org/project/s3dlio/) (recommended), [s3torchconnector](https://github.com/awslabs/s3-connector-for-pytorch) (PyTorch only), and the [MinIO SDK](https://min.io/docs/minio/linux/developers/python/API.html). Other storage backends can be extended.
 
 * Data Loader support: we support reading datasets using TensorFlow tf.data data loader, PyTorch DataLoader, and a set of custom data readers implemented in ./reader. For TensorFlow tf.data data loader, PyTorch DataLoader  
   - We have complete support for tfrecord format in TensorFlow data loader. 
diff --git a/dlio_benchmark/configs/workload/dlrm_s3dlio_file.yaml b/dlio_benchmark/configs/workload/dlrm_s3dlio_file.yaml
new file mode 100644
index 00000000..a5c17d72
--- /dev/null
+++ b/dlio_benchmark/configs/workload/dlrm_s3dlio_file.yaml
@@ -0,0 +1,74 @@
+# dlrm_s3dlio_file.yaml
+#
+# DLRMv2 parquet workload on local/network filesystem.
+#
+# Structurally identical to dlrm_s3dlio_s3.yaml for direct comparison.
+#
+# Two read modes — pass at runtime:
+#
+#   direct (recommended, parity with S3):
+#     Uses s3dlio with direct:// URIs → O_DIRECT reads, bypasses page cache.
+#     Same Rust runtime, same GIL-releasing threads as the S3 path.
+#     ++workload.storage.storage_options.storage_library=direct
+#
+#   posix (buffered fallback):
+#     Uses open()/seek()/read() via Python thread pool.
+#     Goes through OS page cache — useful to see caching effect.
+#     (storage_library unset, or ++workload.storage.storage_options.storage_library=posix)
+#
+# Data layout: /mnt/test/dlrm/train/img_NN_of_64.parquet
+#   64 files, 1M rows each, 123 RGs @ 8192 rows/RG, ~8 MiB/RG compressed
+#
+# Usage:
+#   python -m dlio_benchmark.main workload=dlrm_s3dlio_file \
+#     ++workload.storage.storage_options.storage_library=direct
+#
+# Dry-run:
+#   python -m dlio_benchmark.main workload=dlrm_s3dlio_file \
+#     ++workload.storage.storage_options.storage_library=direct \
+#     ++workload.storage.storage_options.simulate_io=true
+
+model:
+  name: dlrm_s3dlio_file
+
+framework: pytorch
+
+workflow:
+  generate_data: false
+  train: true
+  evaluation: false
+
+storage:
+  storage_type: local_fs
+  storage_root: /mnt/test
+  storage_options:
+    # storage_library: set via ++workload.storage.storage_options.storage_library=<lib>
+    # Supported: direct (O_DIRECT via s3dlio, recommended), posix (buffered fallback)
+    prefetch_workers: 64     # threads per worker process (matches S3 config for parity)
+
+dataset:
+  format: parquet
+  data_folder: /mnt/test/dlrm   # DLIO scans /mnt/test/dlrm/train/*.parquet
+  num_files_train: 64
+  num_samples_per_file: 1000000    # 1M rows, 123 RGs @ 8192 rows/RG
+  record_length_bytes: 1024
+  record_length_bytes_stdev: 0
+
+reader:
+  data_loader: pytorch
+  read_threads: 8              # 8 DataLoader worker processes
+  prefetch_size: 0
+  read_type: on_demand
+  batch_size: 2048
+
+train:
+  computation_time: 0.000770031
+  epochs: 1
+  total_training_steps: -1
+  seed_change_epoch: false
+
+profiling:
+  profiler: none
+
+logging:
+  verbosity: INFO
diff --git a/dlio_benchmark/configs/workload/dlrm_s3dlio_s3.yaml b/dlio_benchmark/configs/workload/dlrm_s3dlio_s3.yaml
new file mode 100644
index 00000000..7cf773e8
--- /dev/null
+++ b/dlio_benchmark/configs/workload/dlrm_s3dlio_s3.yaml
@@ -0,0 +1,69 @@
+# dlrm_s3dlio_s3.yaml
+#
+# DLRMv2 parquet workload on S3 via s3-ultra (port 9200).
+#
+# storage_library is NOT set here — pass at runtime:
+#   ++workload.storage.storage_options.storage_library=s3torchconnector
+#   ++workload.storage.storage_options.storage_library=s3dlio
+#   ++workload.storage.storage_options.storage_library=minio
+#
+# Data layout (64 files, 1M rows each, 123 RGs @ 8192 rows/RG):
+#   s3://mlp-flux/data/dlrm/train/train/img_NN_of_64.parquet
+#
+# Usage:
+#   export AWS_ACCESS_KEY_ID=minioadmin
+#   export AWS_SECRET_ACCESS_KEY=minioadmin
+#   export AWS_ENDPOINT_URL=http://127.0.0.1:9200
+#   python -m dlio_benchmark.main workload=dlrm_s3dlio_s3 \
+#     ++workload.storage.storage_options.storage_library=s3torchconnector
+
+model:
+  name: dlrm_s3dlio_s3
+
+framework: pytorch
+
+workflow:
+  generate_data: false
+  train: true
+
+storage:
+  storage_type: s3
+  storage_root: mlp-flux
+  storage_options:
+    # storage_library: set via ++workload.storage.storage_options.storage_library=<lib>
+    prefetch_workers: 64     # threads per worker process; must match prefetch_window for full pipelining
+    endpoint_url: http://127.0.0.1:9200
+
+dataset:
+  format: parquet
+  data_folder: data/dlrm/train   # DLIO auto-appends /train → data/dlrm/train/train/
+  num_files_train: 64
+  num_samples_per_file: 1000000    # 1M rows, 123 RGs @ 8192 rows/RG
+  record_length_bytes: 1024
+  record_length_bytes_stdev: 0
+
+reader:
+  data_loader: pytorch
+  read_threads: 8              # 8 DataLoader worker processes × 64 prefetch_workers = up to 512 concurrent GETs
+  prefetch_size: 0             # disable DLIO-level prefetch (reader handles its own window)
+  read_type: on_demand
+  batch_size: 2048             # DLRMv2: 2048 real samples per GPU step
+
+train:
+  # DLRMv2 / B200 trace parameters:
+  #   batch_size=2048  → ~31,250 steps/epoch (64M samples / 2048)
+  #   computation_time → 0.000770031 s/batch measured on B200
+  computation_time: 0.000770031
+  epochs: 1
+  total_training_steps: -1
+  seed_change_epoch: false
+
+evaluation:
+  eval_time: 0.0
+  epochs_between_evals: 0
+
+profiling:
+  profiler: none
+
+logging:
+  verbosity: INFO
diff --git a/dlio_benchmark/data_generator/data_generator.py b/dlio_benchmark/data_generator/data_generator.py
index aa3e701a..3ef21272 100644
--- a/dlio_benchmark/data_generator/data_generator.py
+++ b/dlio_benchmark/data_generator/data_generator.py
@@ -131,6 +131,13 @@ def _generate_files(self, write_fn, label: str = "Data") -> None:
 
             storage.put_data(out_path_spec, output.getvalue())
 
+        **Zero-copy fast path**: if ``write_fn`` returns a non-None value it is
+        treated as the ready-to-upload payload and is passed directly to
+        ``put_data``, bypassing the ``BytesIO`` step entirely.  This is used by
+        ``NPZGenerator`` when s3dlio's ``generate_npz_bytes()`` is available:
+        the returned ``BytesView`` supports the buffer protocol so s3dlio can
+        upload it with a single zero-copy pass.
+
         **Parallel semantics** (Issue 10):
 
         Seeds are pre-derived sequentially in the main thread so that
@@ -165,12 +172,14 @@ def _write_one(job):
             progress(i + 1, self.total_files_to_generate, f"Generating {label}")
             output = out_path_spec if is_local else io.BytesIO()
             worker_rng = np.random.default_rng(seed=file_seed)
-            write_fn(i, dim_, dim1, dim2, file_seed, worker_rng,
-                     out_path_spec, is_local, output)
+            payload = write_fn(i, dim_, dim1, dim2, file_seed, worker_rng,
+                               out_path_spec, is_local, output)
             if not is_local:
-                # Pass BytesIO directly so put_data can use getbuffer() (zero-copy
-                # memoryview) instead of getvalue() which makes a full copy.
-                self.storage.put_data(out_path_spec, output)
+                # If write_fn returned a payload (e.g. s3dlio BytesView from
+                # generate_npz_bytes), use it directly — zero-copy, no BytesIO
+                # intermediary.  Otherwise fall back to the BytesIO content.
+                self.storage.put_data(out_path_spec,
+                                      payload if payload is not None else output)
 
         write_threads = getattr(self._args, 'write_threads', 1)
         n_workers = max(1, min(write_threads, len(jobs))) if jobs else 1
@@ -219,15 +228,19 @@ def _upload(path, buf, sem):
                              f"Generating {label}")
                     output = io.BytesIO()
                     worker_rng = np.random.default_rng(seed=file_seed)
-                    # Generate in main thread (fast; Rust dgen or numpy)
-                    write_fn(i, dim_, dim1, dim2, file_seed, worker_rng,
-                             out_path_spec, False, output)
+                    # Generate in main thread (fast; Rust dgen or numpy).
+                    # A non-None return value is the ready-to-upload payload
+                    # (e.g. s3dlio BytesView from generate_npz_bytes) — use it
+                    # directly to skip the BytesIO intermediary copy.
+                    payload = write_fn(i, dim_, dim1, dim2, file_seed, worker_rng,
+                                       out_path_spec, False, output)
+                    upload_data = payload if payload is not None else output
                     # Block if n_workers uploads are already in flight
                     # (back-pressure to bound peak RAM usage).
                     _sem.acquire()
                     # Submit upload immediately; main thread continues generating.
                     _futures.append(
-                        pool.submit(_upload, out_path_spec, output, _sem)
+                        pool.submit(_upload, out_path_spec, upload_data, _sem)
                     )
                 # Wait for all in-flight uploads before leaving the with block.
                 for f in _futures:
diff --git a/dlio_benchmark/data_generator/npz_generator.py b/dlio_benchmark/data_generator/npz_generator.py
index 2203848f..492de94c 100644
--- a/dlio_benchmark/data_generator/npz_generator.py
+++ b/dlio_benchmark/data_generator/npz_generator.py
@@ -44,8 +44,16 @@ def __init__(self):
     def generate(self):
         """
         Generator for creating data in NPZ format of 3d dataset.
-        Uses the base-class template for seeding, BytesIO, and put_data.
-        Bug fix: pass output.getvalue() (bytes) to put_data, not the BytesIO object.
+
+        Fast path (s3dlio available, no ZIP compression, object storage):
+            generate_npz_bytes() produces a BytesView in Rust (hardware CRC32,
+            Rayon fill, no GIL). _write() returns it directly; _generate_files()
+            passes it straight to put_data() → MultipartUploadWriter — zero copies
+            of the payload at any point in the Python layer.
+
+        Slow path (numpy fallback or local FS):
+            np.savez() writes into BytesIO; put_data() reads via getbuffer()
+            (zero-copy memoryview).
         """
         super().generate()
         dtype = self._args.record_element_dtype
@@ -69,9 +77,13 @@ def _write(i, dim_, dim1, dim2, file_seed, rng,
                 if is_local:
                     with open(output, "wb") as f:
                         f.write(npz_view)
+                    return None
                 else:
-                    output.write(npz_view)
-                return
+                    # Return the BytesView directly — zero-copy.
+                    # _generate_files() uses the return value as the upload
+                    # payload, bypassing the BytesIO write entirely.
+                    # No Python-side copy of the 140 MiB buffer occurs.
+                    return npz_view
             # ── Slow path: numpy fallback ─────────────────────────────────
             if isinstance(dim_, list):
                 records = gen_random_tensor(
diff --git a/dlio_benchmark/data_generator/parquet_generator.py b/dlio_benchmark/data_generator/parquet_generator.py
index a63f2fee..88867426 100755
--- a/dlio_benchmark/data_generator/parquet_generator.py
+++ b/dlio_benchmark/data_generator/parquet_generator.py
@@ -104,6 +104,7 @@ def __init__(self):
         self.partition_by = getattr(self._args, 'parquet_partition_by', None)
         batch = getattr(self._args, 'parquet_generation_batch_size', 0)
         self.generation_batch_size = batch if batch > 0 else self.row_group_size
+        self.use_s3dlio_gen = getattr(self._args, 'parquet_use_s3dlio_gen', False)
 
     # ── Schema ───────────────────────────────────────────────────────────────
 
@@ -309,6 +310,20 @@ def generate(self):
 
             out_path_spec = self.storage.get_uri(self._file_list[i])
 
+            # ── s3dlio pure-Rust generation path (streaming) ────────────────
+            # When enabled, hand off entirely to s3dlio.generate_and_write_parquet_schema_streaming().
+            # Row groups are pipelined: generation and multipart upload run
+            # concurrently — no full-file buffer, peak RAM ~2× one row group.
+            if self.use_s3dlio_gen and self.parquet_columns:
+                import s3dlio as _s3dlio
+                _cols = [(str(c.get('name', 'data')), int(c.get('size', 1)))
+                         for c in self.parquet_columns]
+                _num_rg = max(1, self.num_samples // self.row_group_size)
+                _s3dlio.generate_and_write_parquet_schema_streaming(
+                    out_path_spec, _cols, self.row_group_size, _num_rg)
+                continue
+            # ─────────────────────────────────────────────────────────────────
+
             dim_raw = dim[2 * i]
             if isinstance(dim_raw, list):
                 dim1 = int(dim_raw[0])
diff --git a/dlio_benchmark/data_loader/torch_data_loader.py b/dlio_benchmark/data_loader/torch_data_loader.py
index f7023c1e..7fe2e254 100644
--- a/dlio_benchmark/data_loader/torch_data_loader.py
+++ b/dlio_benchmark/data_loader/torch_data_loader.py
@@ -16,8 +16,9 @@
 """
 import math
 import pickle
+import time
 import torch
-from torch.utils.data import Dataset, DataLoader
+from torch.utils.data import Dataset, IterableDataset, DataLoader
 from torch.utils.data.sampler import Sampler
 
 from dlio_benchmark.common.constants import MODULE_DATA_LOADER
@@ -75,6 +76,339 @@ def __getitem__(self, image_idx):
         return self.reader.read_index(image_idx, step)
 
 
+class TorchIterableDataset(IterableDataset):
+    """
+    Row-Group-granular IterableDataset for high-performance parquet S3 I/O.
+
+    The Map-style TorchDataset calls __getitem__ once per *sample* — 64 million
+    times for DLRMv2, burning 212 s in pure Python overhead before any I/O.
+    This class iterates at Row-Group granularity instead:
+
+      Python calls = num_files × rgs_per_file / num_workers   (≈ 984 per worker)
+
+    The generator fetches one RG at a time (one S3 GET — GIL released for the
+    entire network transfer), accumulates samples, and yields one dummy item per
+    complete batch.  Python only resumes at batch boundaries.
+
+    Outer-loop Python cost: 31,250 batch yields × ~3 µs ≈ 0.09 s/epoch.
+    vs Map-style:           64,000,000 getitem calls × ~3 µs ≈ 212 s/epoch.
+
+    Only works with readers that expose open()/get_sample() — specifically
+    ParquetReaderS3Iterable (storage_library: s3torchconnector / minio / s3dlio).
+    """
+
+    @dlp.log_init
+    def __init__(self, format_type, dataset_type, epoch, batch_size, num_workers):
+        """Record dataset parameters; build the reader only for in-process, non-simulate runs."""
+        self.format_type = format_type
+        self.dataset_type = dataset_type
+        self.epoch_number = epoch
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.reader = None
+        args = ConfigArguments.get_instance()
+        opts = getattr(args, "storage_options", {}) or {}
+        self._simulate = str(opts.get("simulate_io", "false")).lower() in ("true", "1", "yes")
+        self.serial_args = pickle.dumps(args)
+        # Only create the reader for in-process (num_workers=0) non-simulate runs.
+        # Simulate mode never touches the reader; workers get it via worker_init.
+        if num_workers == 0 and not self._simulate:
+            self.worker_init(-1)
+
+    @dlp.log
+    def worker_init(self, worker_id):
+        """Per-worker setup: restore the pickled config, then build this worker's reader."""
+        # Restore config before the simulate check so simulate-mode workers see it too (assumed to repopulate the singleton — verify).
+        pickle.loads(self.serial_args)
+        if self._simulate:
+            return  # no reader needed in simulate mode
+        _args = ConfigArguments.get_instance()
+        _args.configure_dlio_logging(is_child=True)
+        self.reader = ReaderFactory.get_reader(
+            type=self.format_type,
+            dataset_type=self.dataset_type,
+            thread_index=worker_id,
+            epoch_number=self.epoch_number,
+        )
+
+    def __iter__(self):
+        """Yield one dummy item per complete batch, fetching data at row-group granularity."""
+        worker_info = torch.utils.data.get_worker_info()
+        args = ConfigArguments.get_instance()
+        dummy = args.resized_image
+        opts = getattr(args, "storage_options", {}) or {}
+        simulate = self._simulate  # decided once in __init__; the same flag worker_init checks
+
+        if simulate:
+            # Dry-run: measure Python loop overhead only — zero I/O, zero reader calls.
+            # Total batches distributed evenly across workers.
+            total_batches = (args.num_files_train * args.num_samples_per_file) // self.batch_size
+            if worker_info is not None:
+                n = worker_info.num_workers
+                my_batches = total_batches // n + (1 if worker_info.id < total_batches % n else 0)
+            else:
+                my_batches = total_batches
+            wid = worker_info.id if worker_info is not None else 0
+            print(f"[ITER] worker={wid} path=SIMULATE total_batches={total_batches} my_batches={my_batches}", flush=True)
+            for _ in range(my_batches):
+                yield dummy
+            return
+
+        # _file_list is set by FormatReader.__init__ from args.file_list_train
+        all_files = list(self.reader._file_list)
+
+        # Shard files across workers: worker k handles files[k::num_workers]
+        if worker_info is not None:
+            all_files = all_files[worker_info.id::worker_info.num_workers]
+
+        wid = worker_info.id if worker_info is not None else 0
+        reader_type = type(self.reader).__name__
+        print(f"[ITER] worker={wid} reader={reader_type} files_this_worker={len(all_files)} total_files={len(list(self.reader._file_list))}", flush=True)
+
+        # ── s3dlio consumer-driven pipeline ─────────────────────────────────
+        # ParquetReaderS3dlio.iter_epoch() installs the worker's file shard,
+        # runs _epoch_init() on only ~N/W files, and drives the bounded
+        # sliding-window pipeline directly — no pyarrow 3-tuple unpacking.
+        if hasattr(self.reader, 'iter_epoch'):
+            print(f"[ITER] worker={wid} path=ITER_EPOCH files={len(all_files)}", flush=True)
+            yield from self.reader.iter_epoch(all_files, self.batch_size)
+            return
+
+        # ── Sliding-window prefetch ─────────────────────────────────────────
+        # Instead of firing all RG GETs for a file at once (burst → idle →
+        # burst), we maintain a constant window of `prefetch_window` in-flight
+        # GETs drawn from a flat queue of (filename, rg_idx) tuples spanning
+        # ALL files.  As each RG is consumed one new slot is filled, keeping
+        # the network continuously saturated.
+        #
+        # Use the reader's sliding-window helpers when available
+        # (ParquetReaderS3Iterable); fall back to the old open()-everything
+        # path for other readers.
+        has_sliding = (
+            hasattr(self.reader, 'open_footer_only')
+            and hasattr(self.reader, 'submit_rg_prefetch')
+            and getattr(self.reader, '_prefetch_executor', None) is not None
+        )
+
+        if not has_sliding:
+            # ── Legacy path (no sliding window) ────────────────────────────
+            sample_buf = 0
+            file_iter = iter(all_files)
+            next_filename = next(file_iter, None)
+            if next_filename is None:
+                return
+            next_data = self.reader.open(next_filename)
+            self.reader.open_file_map[next_filename] = next_data
+            while next_filename is not None:
+                filename = next_filename
+                file_data = next_data
+                next_filename = next(file_iter, None)
+                if next_filename is not None:
+                    next_data = self.reader.open(next_filename)
+                    self.reader.open_file_map[next_filename] = next_data
+                else:
+                    next_data = None
+                pf, rf, offsets = file_data
+                num_rgs = pf.metadata.num_row_groups
+                for rg_idx in range(num_rgs):
+                    self.reader.get_sample(filename, offsets[rg_idx])
+                    rg_rows = offsets[rg_idx + 1] - offsets[rg_idx]
+                    sample_buf += rg_rows
+                    while sample_buf >= self.batch_size:
+                        yield dummy
+                        sample_buf -= self.batch_size
+                if hasattr(self.reader, '_pf_cache'):
+                    self.reader._pf_cache.pop(filename, None)
+                self.reader.open_file_map[filename] = None
+            return
+
+        # ── Sliding-window path ─────────────────────────────────────────────
+        # Step 1: fetch all footers (cheap: one small range GET per file, can
+        #         run serially — footer size is ~50 KB vs ~8 MB per RG).
+        file_meta = {}   # filename -> (pf, rf, offsets)
+        for fn in all_files:
+            fd = self.reader.open_footer_only(fn)
+            self.reader.open_file_map[fn] = fd
+            file_meta[fn] = fd
+
+        # Step 2: build flat RG queue across all files in order.
+        rg_queue = []   # list of (filename, rg_idx, rg_rows)
+        for fn in all_files:
+            pf, rf, offsets = file_meta[fn]
+            num_rgs = pf.metadata.num_row_groups
+            for rg_idx in range(num_rgs):
+                rg_rows = offsets[rg_idx + 1] - offsets[rg_idx]
+                rg_queue.append((fn, rg_idx, rg_rows))
+
+        # Step 3: fill the initial window — submit first `window` GETs.
+        # Window size: default 64, overridable via storage_options.prefetch_window.
+        window = max(0, int(opts.get("prefetch_window", 64)))  # clamp: a negative value would index rg_queue from the end
+
+        # t_io_start: wall-clock time when the very first GET is submitted.
+        # This excludes DLIO startup, footer fetches, and queue construction.
+        t_io_start = time.perf_counter()
+        tail = min(window, len(rg_queue))  # index of next RG to submit into the window
+        for fn, rg_idx, _ in rg_queue[:tail]:
+            self.reader.submit_rg_prefetch(fn, rg_idx)
+
+        # Step 4: consume RGs in order, refilling the window by 1 per RG consumed.
+        sample_buf = 0
+        prev_filename = None
+        total_bytes = 0  # compressed bytes fetched by this worker
+        for consume_idx, (filename, rg_idx, rg_rows) in enumerate(rg_queue):
+            # Refill: submit the RG that is `window` ahead of the one we're consuming.
+            if tail < len(rg_queue):
+                fn_ahead, rg_ahead, _ = rg_queue[tail]
+                self.reader.submit_rg_prefetch(fn_ahead, rg_ahead)
+                tail += 1
+
+            # Wait for this RG (already in-flight; should be near-instant).
+            self.reader.get_sample(filename, file_meta[filename][2][rg_idx])
+            # Accumulate bytes from the rg_cache (populated by get_sample).
+            if hasattr(self.reader, '_rg_cache'):
+                total_bytes += self.reader._rg_cache.get((filename, rg_idx), 0)
+            sample_buf += rg_rows
+
+            # Release the previous file's caches once we move on to a new file.
+            if prev_filename is not None and prev_filename != filename:
+                if hasattr(self.reader, '_pf_cache'):
+                    self.reader._pf_cache.pop(prev_filename, None)
+                if hasattr(self.reader, '_rg_cache'):
+                    stale = [k for k in list(self.reader._rg_cache) if k[0] == prev_filename]
+                    for k in stale:
+                        self.reader._rg_cache.pop(k, None)
+                self.reader.open_file_map[prev_filename] = None
+            prev_filename = filename
+
+            while sample_buf >= self.batch_size:
+                yield dummy
+                sample_buf -= self.batch_size
+
+        # t_io_end: wall-clock time after the last RG has been waited on.
+        t_io_end = time.perf_counter()
+
+        # Clean up last file.
+        if prev_filename is not None:
+            if hasattr(self.reader, '_pf_cache'):
+                self.reader._pf_cache.pop(prev_filename, None)
+            self.reader.open_file_map[prev_filename] = None
+
+        # Report per-worker I/O throughput.  The window excludes startup and footer
+        # fetches, but it does include time this generator sat suspended at a yield.
+        elapsed = t_io_end - t_io_start
+        gib = total_bytes / (1024 ** 3)
+        mib_s = (total_bytes / (1024 ** 2)) / elapsed if elapsed > 0 else 0.0
+        print(
+            f"[io_timing] worker={wid} "
+            f"bytes={total_bytes} ({gib:.3f} GiB) "
+            f"elapsed={elapsed:.3f}s "
+            f"throughput={mib_s:.1f} MiB/s",
+            flush=True,
+        )
+
+
+class TorchIterableDatasetSimple(IterableDataset):
+    """
+    IterableDataset for 1-sample-per-file formats (NPZ / NPY / JPEG / PNG) on
+    any storage backend (S3 object store or local / POSIX filesystem).
+
+    Problem with map-style (TorchDataset) for these formats:
+      Each worker calls __getitem__ → read_index() → on-demand single-object GET.
+      With N workers the server sees at most N simultaneous requests.
+
+    This class instead calls reader.next(), which:
+      S3   — _s3_prefetch_all()      → s3dlio.get_many(64 in-flight per worker)
+      POSIX — _localfs_prefetch_all() → ThreadPoolExecutor(64 workers)
+
+    Effective pipeline depth: 64 × num_workers  (vs 1 × num_workers before).
+
+    File assignment:
+      Worker k handles files[k::num_workers] from the epoch's file list.
+      The list reflects any epoch-level shuffle already applied by reconfigure().
+      No additional shuffle is performed here; for storage I/O benchmarking,
+      file ordering within a worker's shard does not affect measured throughput.
+
+    Drop-last semantics:
+      FormatReader.next() drops the final partial batch (same as map-style
+      drop_last=True). The DataLoader is configured with batch_size=None because
+      the reader assembles batches internally; one 'dummy' item is yielded per
+      complete batch.
+    """
+
+    @dlp.log_init
+    def __init__(self, format_type, dataset_type, epoch, batch_size, num_workers):
+        self.format_type = format_type
+        self.dataset_type = dataset_type
+        self.epoch_number = epoch
+        self.batch_size = batch_size
+        self.num_workers = num_workers
+        self.reader = None
+        args = ConfigArguments.get_instance()
+        self.serial_args = pickle.dumps(args)
+        if num_workers == 0:  # no worker processes: build the reader in this process
+            self.worker_init(-1)
+
+    @dlp.log
+    def worker_init(self, worker_id):
+        pickle.loads(self.serial_args)  # return value unused; presumably restores the ConfigArguments singleton — verify
+        _args = ConfigArguments.get_instance()
+        _args.configure_dlio_logging(is_child=True)
+        # __init__ passes worker_id=-1 when num_workers == 0; map that sentinel to
+        # reader thread 0 (NOTE(review): assumes the reader's file_map has a thread-0 slot — confirm).
+        thread_idx = 0 if worker_id < 0 else worker_id
+        self.reader = ReaderFactory.get_reader(
+            type=self.format_type,
+            dataset_type=self.dataset_type,
+            thread_index=thread_idx,
+            epoch_number=self.epoch_number,
+        )
+
+    def __iter__(self):
+        args = ConfigArguments.get_instance()
+        dummy = args.resized_image
+        worker_info = torch.utils.data.get_worker_info()
+
+        # Shard files: worker k takes all_files[k::num_workers]; with no worker
+        # processes (worker_info is None) this process takes the whole list.
+        all_files = list(self.reader._file_list)
+        if worker_info is not None:
+            my_files = all_files[worker_info.id::worker_info.num_workers]
+        else:
+            my_files = all_files
+
+        wid = worker_info.id if worker_info is not None else 0
+        reader_type = type(self.reader).__name__
+        print(
+            f"[ITER_SIMPLE] worker={wid} reader={reader_type} "
+            f"files_this_worker={len(my_files)} total_files={len(all_files)}",
+            flush=True,
+        )
+
+        if not my_files:  # empty shard (e.g. more workers than files): nothing to yield
+            return
+
+        # Install the file shard into the reader's file_map so reader.next() picks it up.
+        # Entry: (sample_idx, filename, sample_in_file). NOTE(review): sample_idx uses the shard-local file index, not a global one — confirm.
+        num_spf = args.num_samples_per_file
+        entries = [
+            (i * num_spf + s, filename, s)
+            for i, filename in enumerate(my_files)
+            for s in range(num_spf)
+        ]
+        self.reader.file_map[self.reader.thread_index] = entries
+
+        # Drive reader.next() over the installed shard (per the class docstring it
+        # bulk-prefetches and batches internally). The batch content is discarded:
+        # one dummy item is yielded per batch so the DataLoader step count is correct.
+        for _batch in self.reader.next():
+            yield dummy
+
+    @dlp.log
+    def finalize(self):
+        if self.reader is not None:
+            self.reader.finalize()
+
+
 class dlio_sampler(Sampler):
     def __init__(self, rank, size, num_samples, epochs):
         self.size = size
@@ -104,6 +438,143 @@ def __init__(self, format_type, dataset_type, epoch_number):
 
     @dlp.log
     def read(self):
+        from dlio_benchmark.common.enumerations import FormatType, StorageType
+        from dlio_benchmark.reader.reader_factory import ReaderFactory
+
+        # ── Dataset routing ──────────────────────────────────────────────────
+        # Three paths, in priority order:
+        #
+        # 1. PARQUET on S3 / AISTORE / LOCAL_FS → TorchIterableDataset (RG-granular)
+        #    Reduces Python __getitem__ calls from O(samples) to O(row_groups).
+        #
+        # 2. NPZ / NPY / JPEG / PNG → TorchIterableDatasetSimple (bulk-prefetch)
+        #    reader.next() streams each worker's shard in bounded prefetch windows
+        #    (up to 64 reads in flight per worker), giving 64 × num_workers I/O
+        #    requests instead of 1 × num_workers with map-style on-demand fetching.
+        #    Works for BOTH object storage and POSIX/local filesystem.
+        #
+        # 3. Everything else → TorchDataset (map-style, on-demand per sample)
+
+        _simple_iterable_formats = (
+            FormatType.NPZ, FormatType.NPY, FormatType.JPEG, FormatType.PNG
+        )
+        use_rg_iterable_dataset = (
+            self.format_type is FormatType.PARQUET
+            and self._args.storage_type in (
+                StorageType.S3, StorageType.AISTORE,
+                StorageType.LOCAL_FS,
+            )
+        )
+        use_simple_iterable_dataset = (
+            self.format_type in _simple_iterable_formats
+            and not use_rg_iterable_dataset
+        )
+
+        # Determine concrete reader class name and access pattern for logging.
+        _opts = getattr(self._args, "storage_options", {}) or {}
+        _lib  = _opts.get("storage_library", "none")
+        _st   = self._args.storage_type
+        _s3_types = (StorageType.S3, StorageType.AISTORE)
+        _s3_libs  = ("s3dlio", "s3torchconnector", "minio")
+        _nw = self._args.read_threads
+
+        if use_rg_iterable_dataset:
+            _reader_cls    = "ParquetReaderS3dlio"
+            _torch_ds      = "TorchIterableDataset(rg-granular)"
+            _sample_access = "iterator(row-group chunks)"
+        elif use_simple_iterable_dataset:
+            _fmt = str(self.format_type).split(".")[-1].lower()
+            if _st in _s3_types and _lib in _s3_libs:
+                _reader_cls = (
+                    "NPZReaderS3Iterable"   if self.format_type is FormatType.NPZ  else
+                    "NPYReaderS3Iterable"   if self.format_type is FormatType.NPY  else
+                    "ImageReaderS3Iterable"
+                )
+                _sample_access = "next()→_s3_prefetch_all(64 in-flight) then iterate"
+            else:
+                _reader_cls = (
+                    "NPZReaderIterable"    if self.format_type is FormatType.NPZ  else
+                    "NPYReaderIterable"    if self.format_type is FormatType.NPY  else
+                    "ImageReaderIterable"
+                )
+                _sample_access = "next()→_localfs_prefetch_all(ThreadPool-64) then iterate"
+            _torch_ds = f"TorchIterableDatasetSimple(bulk-prefetch, {_nw} workers)"
+        else:
+            _reader_cls    = "unknown"
+            _torch_ds      = f"TorchDataset(map-style, {_nw} workers)"
+            _sample_access = "read_index (on-demand)"
+
+        print(
+            f"[DATALOADER] format={self.format_type} storage={_st} library={_lib}\n"
+            f"[DATALOADER]   torch_dataset={_torch_ds}\n"
+            f"[DATALOADER]   reader={_reader_cls}\n"
+            f"[DATALOADER]   sample_access={_sample_access}",
+            flush=True,
+        )
+
+        if use_simple_iterable_dataset:
+            num_workers = self._args.read_threads
+            dataset = TorchIterableDatasetSimple(
+                self.format_type, self.dataset_type, self.epoch_number,
+                self.batch_size, num_workers,
+            )
+            if self._args.my_rank == 0:
+                self.logger.debug(
+                    f"{utcnow()} Using TorchIterableDatasetSimple: "
+                    f"{num_workers} workers, batch_size={self.batch_size}, "
+                    f"reader={_reader_cls}"
+                )
+            if num_workers == 0:
+                kwargs = {}
+            else:
+                kwargs = {'multiprocessing_context': self._args.multiprocessing_context}
+                if torch.__version__ != '1.3.1':
+                    kwargs['persistent_workers'] = True
+            pin_memory = self._args.pin_memory and torch.cuda.is_available()
+            self._dataset = DataLoader(
+                dataset,
+                batch_size=None,          # reader assembles batches; one dummy per batch
+                num_workers=num_workers,
+                pin_memory=pin_memory,
+                worker_init_fn=dataset.worker_init,
+                **kwargs,
+            )
+            return
+
+        if use_rg_iterable_dataset:
+            opts = getattr(self._args, "storage_options", {}) or {}
+            simulate = str(opts.get("simulate_io", "false")).lower() in ("true", "1", "yes")
+            # For simulate: run single-process — no multiprocessing IPC overhead.
+            # s3torchconnector/CRT releases the GIL during network I/O, so a
+            # thread-pool in the reader provides concurrency without IPC cost.
+            num_workers = 0 if simulate else self._args.read_threads
+            dataset = TorchIterableDataset(
+                self.format_type, self.dataset_type, self.epoch_number,
+                self.batch_size, num_workers,
+            )
+            if self._args.my_rank == 0:
+                mode = "simulate/single-process" if simulate else f"{num_workers} workers"
+                self.logger.debug(
+                    f"{utcnow()} Using TorchIterableDataset (RG-granular): "
+                    f"{mode}, batch_size={self.batch_size}"
+                )
+            if num_workers == 0:
+                kwargs = {}
+            else:
+                kwargs = {'multiprocessing_context': self._args.multiprocessing_context}
+                if torch.__version__ != '1.3.1':
+                    kwargs['persistent_workers'] = True
+            pin_memory = self._args.pin_memory and torch.cuda.is_available()
+            self._dataset = DataLoader(
+                dataset,
+                batch_size=None,          # __iter__ yields pre-formed batch items
+                num_workers=num_workers,
+                pin_memory=pin_memory,
+                worker_init_fn=dataset.worker_init,
+                **kwargs,
+            )
+            return
+
         dataset = TorchDataset(self.format_type, self.dataset_type, self.epoch_number, self.num_samples,
                                self._args.read_threads, self.batch_size)
         sampler = dlio_sampler(self._args.my_rank, self._args.comm_size, self.num_samples, self._args.epochs)
@@ -170,4 +641,20 @@ def next(self):
 
     @dlp.log
     def finalize(self):
-        pass
+        # Best-effort cleanup. With read_threads=0 the reader lives in-process on the
+        # dataset object; finalize() flushes its per-epoch state (byte counters, etc.)
+        # back to ConfigArguments before statscounter computes the I/O metric.
+        # When read_threads>0 each worker is a separate process; their readers are
+        # not accessible here.  The workers call finalize_s3_bytes() internally
+        # (logging actual bytes), but cannot update the main-process args.record_length;
+        # the record_length_bytes value from the YAML is relied upon in that case.
+        try:
+            dataset = self._dataset.dataset
+            # TorchIterableDatasetSimple exposes finalize() directly.
+            if isinstance(dataset, TorchIterableDatasetSimple):
+                dataset.finalize()
+            elif dataset.reader is not None:
+                dataset.reader.finalize()
+        except Exception as exc:
+            # Never abort the run from cleanup, but leave a trace for debugging.
+            self.logger.debug(f"{utcnow()} finalize: reader cleanup skipped: {exc!r}")
diff --git a/dlio_benchmark/reader/__init__.py b/dlio_benchmark/reader/__init__.py
index e69de29b..63a86494 100644
--- a/dlio_benchmark/reader/__init__.py
+++ b/dlio_benchmark/reader/__init__.py
@@ -0,0 +1,87 @@
+"""
+Per-format reader factories.
+
+Each function selects the S3-iterable or POSIX-iterable implementation based on
+the active storage type and library, then constructs and returns the reader.
+
+Special cases (DALI, O_DIRECT) are NOT handled here — they stay in
+reader_factory.py because they require entirely different reader classes.
+
+Usage in reader_factory.py after checking DALI / odirect:
+    from dlio_benchmark.reader import create_npz_reader
+    return create_npz_reader(dataset_type, thread_index, epoch_number)
+
+storage_library routing for local filesystem (StorageType.LOCAL_FS):
+  "direct"  → iterable reader with _LocalFSIterableMixin in O_DIRECT mode
+               (s3dlio.get_many() + direct:// URIs, bypasses page cache).
+               Use this for NVMe benchmarks.
+  <unset>   → iterable reader with _LocalFSIterableMixin in buffered mode
+               (ThreadPoolExecutor + Python open(), uses page cache).
+"""
+from dlio_benchmark.common.enumerations import StorageType
+from dlio_benchmark.utils.config import ConfigArguments
+
+_S3_TYPES = (StorageType.S3, StorageType.AISTORE)
+_S3_LIBS  = ("s3dlio", "s3torchconnector", "minio")
+
+
+def _storage_info():
+    """Look up the active config's storage type and ``storage_library`` (None if unset)."""
+    cfg = ConfigArguments.get_instance()
+    options = getattr(cfg, "storage_options", {}) or {}
+    return cfg.storage_type, options.get("storage_library")
+
+
+# ---------------------------------------------------------------------------
+# NPZ
+# ---------------------------------------------------------------------------
+
+def create_npz_reader(dataset_type, thread_index, epoch):
+    """Build the NPZ reader matching the configured storage type and library."""
+    backend, lib = _storage_info()
+    if backend not in _S3_TYPES:
+        # Non-object storage (e.g. LOCAL_FS): "direct" and buffered share one reader;
+        # _LocalFSIterableMixin picks O_DIRECT vs buffered from storage_library.
+        from dlio_benchmark.reader.npz_reader_iterable import NPZReaderIterable as reader_cls
+    elif lib in _S3_LIBS:
+        from dlio_benchmark.reader.npz_reader_s3_iterable import NPZReaderS3Iterable as reader_cls
+    else:
+        # Object storage with any other library: non-iterable S3 reader.
+        from dlio_benchmark.reader.npz_reader_s3 import NPZReaderS3 as reader_cls
+    return reader_cls(dataset_type, thread_index, epoch)
+
+
+# ---------------------------------------------------------------------------
+# NPY
+# ---------------------------------------------------------------------------
+
+def create_npy_reader(dataset_type, thread_index, epoch):
+    """Build the NPY reader matching the configured storage type and library."""
+    backend, lib = _storage_info()
+    if backend not in _S3_TYPES:
+        # Non-object storage (e.g. LOCAL_FS): "direct" and buffered share one reader.
+        from dlio_benchmark.reader.npy_reader_iterable import NPYReaderIterable as reader_cls
+    elif lib in _S3_LIBS:
+        from dlio_benchmark.reader.npy_reader_s3_iterable import NPYReaderS3Iterable as reader_cls
+    else:
+        from dlio_benchmark.reader.npy_reader_s3 import NPYReaderS3 as reader_cls
+    return reader_cls(dataset_type, thread_index, epoch)
+
+
+# ---------------------------------------------------------------------------
+# JPEG / PNG
+# ---------------------------------------------------------------------------
+
+def create_image_reader(dataset_type, thread_index, epoch):
+    """Build the JPEG/PNG reader matching the configured storage type and library."""
+    backend, lib = _storage_info()
+    if backend in _S3_TYPES and lib in _S3_LIBS:
+        from dlio_benchmark.reader.image_reader_s3_iterable import ImageReaderS3Iterable
+        return ImageReaderS3Iterable(dataset_type, thread_index, epoch)
+    # Everything else shares ImageReaderIterable:
+    #   - LOCAL_FS, for both "direct" and buffered storage_library settings;
+    #   - object storage with an unrecognised library, which has no dedicated
+    #     image reader here and is left to fail when the reader opens the
+    #     s3:// URI.
+    from dlio_benchmark.reader.image_reader_iterable import ImageReaderIterable
+    return ImageReaderIterable(dataset_type, thread_index, epoch)
diff --git a/dlio_benchmark/reader/_local_fs_iterable_mixin.py b/dlio_benchmark/reader/_local_fs_iterable_mixin.py
index 2394d518..20da8dbb 100644
--- a/dlio_benchmark/reader/_local_fs_iterable_mixin.py
+++ b/dlio_benchmark/reader/_local_fs_iterable_mixin.py
@@ -16,7 +16,8 @@
 
 This mixin gives local-FS readers the same pre-fetch pattern as _S3IterableMixin:
 
-1. Before next(): parallel-read all assigned files via ThreadPoolExecutor
+1. Before next(): parallel-read all assigned files via ThreadPoolExecutor (buffered)
+   OR via s3dlio.get_many() with direct:// URIs (O_DIRECT, page-cache bypass).
 2. Store only the raw byte count per file (never decode numpy/PIL/h5py)
 3. During next() / get_sample(): dict lookup → telemetry → return resized_image
 
@@ -26,6 +27,21 @@
 Only the decode step (np.load, PIL.open, h5py.File) is skipped — that decode
 is pure CPU overhead that has nothing to do with storage bandwidth.
 
+TWO PREFETCH MODES
+==================
+storage_library: <unset or "posix">
+    ThreadPoolExecutor(64) + Python open() + buffered read.
+    Simple, portable, uses OS page cache.
+
+storage_library: "direct"
+    s3dlio.get_many() with direct:// URIs.
+    Uses O_DIRECT (Linux) — bypasses page cache entirely, 4 KiB-aligned I/O
+    via Tokio async tasks in the s3dlio Rust runtime. GIL is released for the
+    full duration of all reads.
+    **Required for accurate NVMe benchmarking** — repeated buffered reads hit
+    the page cache rather than the device, understating storage latency and
+    saturating DRAM bandwidth instead of device bandwidth.
+
 USAGE PATTERN
 =============
 Subclass from BOTH the format-specific parent AND this mixin::
@@ -49,10 +65,14 @@ def next(self):
             for batch in super().next():
                 yield batch
 """
+import os
+import time
 from concurrent.futures import ThreadPoolExecutor
 
 from dlio_benchmark.utils.utility import utcnow
 
+_PREFETCH_POOL = ThreadPoolExecutor(max_workers=1, thread_name_prefix="localfs_prefetch")
+
 
 class _LocalFSIterableMixin:
     """
@@ -61,29 +81,223 @@ class _LocalFSIterableMixin:
     Do NOT instantiate directly. Mix in alongside a FormatReader subclass;
     call ``_localfs_init()`` from the subclass ``__init__`` after
     ``super().__init__()``.
+
+    Set ``storage_library: direct`` in storage_options to use s3dlio's O_DIRECT
+    path (bypasses page cache — essential for accurate NVMe benchmarking).
+    Default (no storage_library, or ``posix``) uses buffered Python open().
     """
 
     def _localfs_init(self) -> None:
         """
         Initialise mixin state.
 
+        Reads ``storage_options.storage_library`` from ConfigArguments:
+          - ``"direct"`` → s3dlio O_DIRECT path (``direct://`` URIs, Tokio, GIL-free)
+          - anything else → buffered Python ThreadPoolExecutor path
+
         Sets:
-          - ``self._local_cache`` (dict: filename → int byte count)
+          - ``self._local_cache``      (dict: filename → int byte count)
+          - ``self._use_direct``       (bool)
+          - ``self._storage_root``     (str absolute path, for direct:// URI construction)
+          - ``self._total_bytes_read`` (int, epoch accumulator)
+          - ``self._total_objects_read`` (int, epoch accumulator)
         """
-        self._local_cache: dict = {}   # filename → int (raw byte count only)
+        self._local_cache: dict = {}
+        self._total_bytes_read: int = 0
+        self._total_objects_read: int = 0
+
+        opts = getattr(self._args, "storage_options", {}) or {}
+        lib = opts.get("storage_library", "")
+        self._use_direct: bool = (lib == "direct")
+
+        if self._use_direct:
+            try:
+                import s3dlio as _s3dlio  # noqa: F401
+            except ImportError as exc:
+                raise ImportError(
+                    f"{self.__class__.__name__}: storage_library='direct' requires "
+                    "the s3dlio package. Install with: pip install s3dlio"
+                ) from exc
+
+    # ── URI helpers ───────────────────────────────────────────────────────────
+
+    def _direct_uri_for_path(self, path: str) -> str:
+        """Make ``path`` absolute and prefix it with the ``direct://`` scheme."""
+        return "direct://" + os.path.abspath(path)
+
+    # ── Buffered path (default) ───────────────────────────────────────────────
 
     def _read_local_bytes(self, path: str) -> int:
-        """Read a local file and return its byte count. No decode."""
+        """Read a local file using buffered I/O and return its byte count. No decode."""
         with open(path, 'rb') as fh:
             return len(fh.read())
 
+    def _prefetch_buffered(self, paths: list) -> dict:
+        """
+        Parallel buffered reads via ThreadPoolExecutor (at most 64 threads).
+
+        Returns ``{path: byte_count}``; an empty ``paths`` yields ``{}``.
+        Reads go through the OS page cache, so warm-cache runs do not
+        measure cold-device bandwidth.
+        """
+        # max(1, ...): ThreadPoolExecutor rejects max_workers=0 (empty input).
+        n_workers = max(1, min(64, len(paths)))
+        with ThreadPoolExecutor(max_workers=n_workers) as pool:
+            byte_counts = pool.map(self._read_local_bytes, paths)
+            return dict(zip(paths, byte_counts))
+
+    # ── O_DIRECT path (storage_library: direct) ───────────────────────────────
+
+    def _prefetch_direct(self, paths: list) -> dict:
+        """
+        Parallel reads via ``s3dlio.get_many()`` with ``direct://`` URIs.
+
+        Returns ``{path: byte_count}``; an empty ``paths`` yields ``{}`` without
+        calling s3dlio. Up to 64 reads are requested in flight, matching the
+        thread cap of ``_prefetch_buffered``.
+
+        This is the mode intended for NVMe benchmarks. NOTE(review): the
+        O_DIRECT / page-cache bypass and GIL release are s3dlio behaviour and
+        cannot be confirmed from this file — verify against the s3dlio docs.
+        """
+        if not paths:
+            return {}
+
+        import s3dlio
+
+        uris = [self._direct_uri_for_path(p) for p in paths]
+        uri_to_path = dict(zip(uris, paths))
+        results = s3dlio.get_many(uris, max_in_flight=min(64, len(uris)))
+
+        # Key by the caller's original path (falling back to the returned URI
+        # when it was not one we requested). Only the size is kept; the
+        # payload itself is never decoded.
+        return {uri_to_path.get(uri, uri): len(data) for uri, data in results}
+
+    # ── Dispatcher ────────────────────────────────────────────────────────────
+
+    def _localfs_stream_next(self):
+        """
+        Bounded streaming replacement for the ``_localfs_prefetch_all() + super().next()``
+        bulk-prefetch pattern.
+
+        PROBLEM WITH BULK-PREFETCH (identical to _S3IterableMixin._s3_prefetch_all)
+        ===========================================================================
+        ``_localfs_prefetch_all()`` submitted ALL files for this worker to either
+        ThreadPoolExecutor (buffered) or s3dlio.get_many() (O_DIRECT) before the
+        training loop started.  With many files this causes:
+
+          Buffered   — OS schedules hundreds of concurrent read() syscalls at once;
+                       page-cache lock contention on Linux degrades throughput.
+          O_DIRECT   — s3dlio Tokio runtime receives all io_uring sqe submissions
+                       simultaneously; the kernel io_uring queue depth (typically
+                       32–4096) becomes the bottleneck rather than device bandwidth.
+
+        SOLUTION: CHUNKED STREAMING WINDOW
+        ===================================
+        Files are processed in chunks of ``prefetch_window`` (default 256).
+        Each chunk is read — at most 64 concurrent in-flight reads — then iterated
+        for telemetry and batch assembly, then freed before the next chunk.
+
+        This keeps the kernel I/O queue at a steady, bounded depth throughout the
+        epoch rather than spiking at the start.  The behaviour mirrors
+        ``_S3IterableMixin._s3_stream_next()`` exactly; workload comparisons
+        between S3 and local storage remain apples-to-apples.
+
+        Configure window size via ``storage_options.prefetch_window`` (default 256;
+        values below 1 are clamped to 1).  64 matches max_in_flight exactly.
+        """
+        args = self._args
+        batch_size = args.batch_size
+        num_spf = args.num_samples_per_file
+        opts = getattr(args, "storage_options", {}) or {}
+        window = max(1, int(opts.get("prefetch_window", 256)))  # <1 would break chunking
+        dummy = args.resized_image
+        mode = "s3dlio-direct://" if self._use_direct else "buffered"
+
+        thread_entries = self.file_map.get(self.thread_index, [])
+        seen = set()
+        paths = []
+        for _, filename, _ in thread_entries:
+            if filename not in seen:
+                seen.add(filename)
+                paths.append(filename)
+
+        if not paths:
+            return
+
+        # ── Worker stagger ────────────────────────────────────────────────────
+        # Same rationale as _S3IterableMixin._s3_stream_next(): without a
+        # startup delay all workers submit their first chunk simultaneously.
+        # For buffered reads this spikes page-cache lock contention; for
+        # O_DIRECT it floods the io_uring submission queue all at once.
+        # Disable by setting storage_options.stagger_workers: false.
+        if str(opts.get("stagger_workers", "true")).lower() not in ("false", "0", "no"):
+            ct_raw = getattr(args, "computation_time", 0.0)
+            ct = ct_raw.get("mean", 0.0) if isinstance(ct_raw, dict) else float(ct_raw or 0.0)
+            delay = self.thread_index * ct
+            if delay > 0:
+                self.logger.debug(
+                    f"{utcnow()} {self.__class__.__name__} thread={self.thread_index} "
+                    f"stagger delay={delay:.4f}s ({self.thread_index} × {ct:.4f}s)"
+                )
+                time.sleep(delay)
+
+        total = len(paths)
+        n_chunks = (total + window - 1) // window
+        self.logger.info(
+            f"{utcnow()} {self.__class__.__name__} thread={self.thread_index} "
+            f"streaming {total} files in {n_chunks} chunks of {window} [{mode}]"
+        )
+
+        sample_buf = 0
+
+        # ── Pipelined chunk loop ─────────────────────────────────────────────
+        # Same rationale as _S3IterableMixin._s3_stream_next().
+        # The background thread fetches chunk N+1 while the main thread yields
+        # batches from chunk N.  For buffered reads the background thread holds
+        # the GIL only for the open()+read() syscall setup; the kernel I/O
+        # itself is blocking but runs concurrently with Python's yield loop.
+        # For O_DIRECT, s3dlio releases the GIL entirely during Rust io_uring.
+        def _fetch(chunk):
+            if self._use_direct:
+                c = self._prefetch_direct(chunk)
+            else:
+                c = self._prefetch_buffered(chunk)
+            self._total_bytes_read += sum(c.values())
+            self._total_objects_read += len(c)
+            return c
+
+        chunks = [paths[i : i + window] for i in range(0, total, window)]
+
+        future = _PREFETCH_POOL.submit(_fetch, chunks[0])
+
+        for idx, chunk in enumerate(chunks):
+            cache = future.result()
+
+            if idx + 1 < len(chunks):
+                future = _PREFETCH_POOL.submit(_fetch, chunks[idx + 1])
+            else:
+                future = None
+
+            self._local_cache = cache
+            for path in chunk:
+                for s in range(num_spf):
+                    self.get_sample(path, s)   # dlp + dft_ai image_size update
+                    sample_buf += 1
+                    if sample_buf >= batch_size:
+                        yield dummy
+                        sample_buf -= batch_size
+            self._local_cache = {}
+        # Drop-last: remaining sample_buf < batch_size is silently discarded.
+
     def _localfs_prefetch_all(self) -> None:
         """
-        Collect all files assigned to this thread and prefetch them in parallel.
+        Prefetch ALL files for this thread in one shot.
 
-        Call at the top of ``next()`` before the iteration loop. Deduplicates
-        filenames while preserving order (a multi-sample file may appear many
-        times in the thread's file_map entries).
+        Retained for ``read_index()`` (on-demand map-style access) to warm the
+        cache before random access begins.  The main streaming path uses
+        ``_localfs_stream_next()`` instead to avoid thundering-herd I/O.
         """
         thread_entries = self.file_map.get(self.thread_index, [])
         seen = set()
@@ -96,19 +310,44 @@ def _localfs_prefetch_all(self) -> None:
         if not paths:
             return
 
+        mode = "s3dlio-direct://" if self._use_direct else "buffered"
         self.logger.info(
             f"{utcnow()} {self.__class__.__name__} thread={self.thread_index} "
-            f"prefetching {len(paths)} local files (parallel)"
+            f"prefetching {len(paths)} local files [{mode}]"
         )
 
-        n_workers = min(64, len(paths))
-        cache = {}
-        with ThreadPoolExecutor(max_workers=n_workers) as pool:
-            for path, byte_count in zip(paths, pool.map(self._read_local_bytes, paths)):
-                cache[path] = byte_count
+        if self._use_direct:
+            cache = self._prefetch_direct(paths)
+        else:
+            cache = self._prefetch_buffered(paths)
+
+        self._total_bytes_read += sum(cache.values())
+        self._total_objects_read += len(cache)
         self._local_cache = cache
 
     def _localfs_ensure_cached(self, filename: str) -> None:
         """Fetch a single file on demand if not already in the cache."""
         if filename not in self._local_cache:
-            self._local_cache[filename] = self._read_local_bytes(filename)
+            if self._use_direct:
+                self._local_cache.update(self._prefetch_direct([filename]))
+            else:
+                self._local_cache[filename] = self._read_local_bytes(filename)
+
+    def finalize_local_bytes(self) -> None:
+        """
+        Set ``args.record_length`` to the mean bytes per file read this epoch
+        (skipped when nothing was read), then zero the epoch counters.
+        """
+        n_bytes, n_files = self._total_bytes_read, self._total_objects_read
+        if min(n_bytes, n_files) > 0:
+            per_file = n_bytes // n_files
+            self._args.record_length = per_file
+            self.logger.debug(
+                f"{utcnow()} {self.__class__.__name__} epoch done: "
+                f"actual {n_bytes / 1024**3:.3f} GiB read, "
+                f"{n_files} files, "
+                f"{per_file:,} bytes/file"
+            )
+        # Counters are per-epoch accumulators; clear them unconditionally.
+        self._total_bytes_read = 0
+        self._total_objects_read = 0
diff --git a/dlio_benchmark/reader/_s3_iterable_mixin.py b/dlio_benchmark/reader/_s3_iterable_mixin.py
index 2fede5a6..79f2b81d 100644
--- a/dlio_benchmark/reader/_s3_iterable_mixin.py
+++ b/dlio_benchmark/reader/_s3_iterable_mixin.py
@@ -51,9 +51,7 @@ def get_sample(self, filename, sample_index):
             dlp.update(image_size=self._object_cache.get(filename, 0))
 
         def next(self):
-            self._s3_prefetch_all()
-            for batch in super().next():
-                yield batch
+            yield from self._s3_stream_next()
 
         @dlp.log
         def read_index(self, image_idx, step):
@@ -73,11 +71,14 @@ def read_index(self, image_idx, step):
 or missing libraries raise ImportError immediately, not later during I/O.
 """
 import os
+import time
 from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import urlparse
 
 from dlio_benchmark.utils.utility import utcnow
 
+_PREFETCH_POOL = ThreadPoolExecutor(max_workers=1, thread_name_prefix="s3_prefetch")
+
 
 class _S3IterableMixin:
     """
@@ -114,6 +115,10 @@ def _s3_init(self, opts: dict) -> None:
         self._opts: dict = opts
         self._object_cache: dict = {}   # obj_key → int (raw byte count only)
         self._minio_client = None       # cached across epochs for TCP keep-alive
+        # Actual bytes received from storage this epoch (reset in finalize_s3_bytes).
+        # Incremented in _prefetch() from real len(data) — not a configured estimate.
+        self._total_bytes_read: int = 0
+        self._total_objects_read: int = 0
 
         if self._storage_library == "s3dlio":
             # s3dlio reads AWS_ENDPOINT_URL_S3 at import time; set early.
@@ -299,29 +304,247 @@ def _fetch_one(obj_key):
         return cache
 
     def _prefetch(self, obj_keys: list) -> dict:
-        """Dispatch to the configured library's prefetch method."""
+        """Dispatch to the configured library's prefetch method and accumulate byte counts."""
         lib = self._storage_library
         if lib == "s3dlio":
-            return self._prefetch_s3dlio(obj_keys)
+            cache = self._prefetch_s3dlio(obj_keys)
         elif lib == "s3torchconnector":
-            return self._prefetch_s3torchconnector(obj_keys)
+            cache = self._prefetch_s3torchconnector(obj_keys)
         elif lib == "minio":
-            return self._prefetch_minio(obj_keys)
+            cache = self._prefetch_minio(obj_keys)
         else:
             raise ValueError(
                 f"{self.__class__.__name__}: unknown storage_library {lib!r}; "
                 "supported: s3dlio, s3torchconnector, minio"
             )
+        # Accumulate ACTUAL bytes transferred (real sizes, not configured estimates).
+        self._total_bytes_read += sum(cache.values())
+        self._total_objects_read += len(cache)
+        return cache
 
     # ── FormatReader lifecycle helpers ────────────────────────────────────────
 
+    def _s3_stream_s3dlio(self, obj_keys: list):
+        """
+        True sliding-window streaming via ``s3dlio.PyBytesAsyncDataLoader``.
+
+        HOW IT WORKS
+        ============
+        Rather than Python managing chunk boundaries, the Rust/Tokio runtime owns
+        the concurrency window entirely:
+
+        1.  ``PyDataset.from_uris(uris)`` — builds a map-style dataset from the
+            pre-computed URI list.  No network calls yet; no Python listing.
+        2.  ``PyBytesAsyncDataLoader(ds, {"prefetch": N})`` — spawns a single
+            Tokio producer task that keeps *exactly* N GETs in flight at all times
+            via ``buffer_unordered(N)``.
+        3.  ``item_iter.collect_batch(n)`` — drains up to ``n`` completed items
+            from the Rust channel with **one** GIL crossing per batch rather than
+            one crossing per item.  Inside the single ``py.detach()`` block, Rust
+            calls ``blocking_recv()`` up to ``n`` times; on return the GIL is
+            reacquired once to wrap the results into a Python list of
+            ``PyObjectItem``.  Python then iterates a plain Python list —
+            cheapest possible iteration, no ``__next__`` dispatch overhead.
+
+        WHY THIS IS BETTER THAN THE CHUNKED PATH
+        =========================================
+        The chunked path (``_s3_stream_next``) fetches N files, then yields N
+        items, then fetches the next N.  When I/O is fast (low latency or high
+        bandwidth) the fetch completes in milliseconds but the yield loop takes
+        hundreds of milliseconds, leaving the storage backend idle between chunks.
+
+        This path has NO chunk boundary: Tokio always has ``prefetch`` GETs in
+        flight.  The storage server sees a flat, constant request rate regardless
+        of how fast or slow the compute side is.
+
+        RUST TELLS PYTHON WHAT ARRIVED
+        ================================
+        ``PyObjectItem.uri`` carries the full URI of the completed GET.  Python
+        never needs to track indices or maintain a parallel key list — it just
+        calls ``get_sample(obj_key, s)`` for whatever arrived.  Byte count is
+        available as ``len(item)`` for telemetry without a Python ``bytes`` copy.
+
+        COLLECT BATCH SIZE
+        ==================
+        ``collect_n = max(1, batch_size // num_samples_per_file)`` — one training
+        batch worth of files per Rust drain.  For RetinaNet (num_spf=1) this is
+        exactly ``batch_size`` files.  For NPZ (num_spf=4) it is
+        ``batch_size // 4`` files, still aligning each drain with one training
+        batch.  Tail-latency impact of waiting for the slowest of ``collect_n``
+        concurrent GETs is negligible at loopback latencies and acceptable at
+        typical S3 latencies.
+
+        Configure in-flight depth via ``storage_options.prefetch_window`` (default 64, minimum 1).
+        """
+        import s3dlio
+
+        args = self._args
+        batch_size = args.batch_size
+        num_spf = args.num_samples_per_file
+        opts = getattr(args, "storage_options", {}) or {}
+        dummy = args.resized_image
+        # prefetch_window controls Tokio in-flight GETs per worker (clamped to >= 1).
+        # Default 64 — matches s3dlio's max_in_flight, so no backlog builds up.
+        prefetch = max(1, int(opts.get("prefetch_window", 64)))
+        # collect_n: files to drain per collect_batch() call (one GIL crossing).
+        # Aligns to one training batch worth of files so the drain is a natural
+        # unit. Falls back to 1 when batch_size < num_spf (rare).
+        collect_n = max(1, batch_size // num_spf)
+
+        uris = [self._uri_for_obj_key(k) for k in obj_keys]
+        uri_to_key = {u: k for k, u in zip(obj_keys, uris)}
+
+        total = len(uris)
+        self.logger.info(
+            f"{utcnow()} {self.__class__.__name__} thread={self.thread_index} "
+            f"s3dlio sliding-window: {total} objects, {prefetch} in-flight, "
+            f"collect_batch={collect_n}"
+        )
+
+        ds = s3dlio.PyDataset.from_uris(uris)
+
+        # skip_head controls whether s3dlio issues a HEAD before each GET.
+        # Default (True): skip HEAD, do plain GET, cache the size — epoch 2+
+        # will range-split correctly for large objects automatically.
+        # False: issue HEAD first so range splitting can fire from epoch 1.
+        # We override to False only when we know objects are >= 16 MiB, since
+        # those benefit most from parallel range GETs on the very first epoch.
+        record_bytes = getattr(args, "record_length", 0) or 0
+        skip_head = not (record_bytes >= 16 * 1024 * 1024)
+        loader_opts = {"prefetch": prefetch, "skip_head": skip_head}
+        if not skip_head:
+            self.logger.info(
+                f"{utcnow()} {self.__class__.__name__} "
+                f"skip_head=False (record_length={record_bytes} bytes >= 16 MiB, "
+                f"range-split active from epoch 1)"
+            )
+
+        loader = s3dlio.PyBytesAsyncDataLoader(ds, loader_opts)
+        item_iter = loader.items()
+
+        sample_buf = 0
+        # collect_batch() releases the GIL once, drains collect_n items from the
+        # Rust channel (blocking_recv × collect_n without touching the GIL), then
+        # reacquires the GIL once to return a Python list.  Python iterates the
+        # list — zero __next__() overhead.  Empty list signals end of stream.
+        while batch := item_iter.collect_batch(collect_n):
+            for item in batch:
+                # item.uri tells us which object arrived (completion order).
+                # len(item) is byte_count — O(1), no Python bytes copy needed.
+                obj_key = uri_to_key.get(item.uri, item.uri)
+
+                # Store byte count in object cache so get_sample() / telemetry
+                # can read it.
+                self._object_cache[obj_key] = len(item)
+                self._total_bytes_read += len(item)
+                self._total_objects_read += 1
+
+                for s in range(num_spf):
+                    self.get_sample(obj_key, s)   # dlp + dft_ai image_size telemetry
+                    sample_buf += 1
+                    if sample_buf >= batch_size:
+                        yield dummy
+                        sample_buf -= batch_size
+
+                # Release the byte-count entry; not needed across items.
+                self._object_cache.pop(obj_key, None)
+        # Drop-last: remaining sample_buf < batch_size is silently discarded.
+
+    def _s3_stream_next(self):
+        """
+        Bounded streaming entry point for all S3 iterable readers (generator).
+
+        Yields ``args.resized_image`` per full batch; dispatches on storage library:
+
+        s3dlio
+            Uses ``_s3_stream_s3dlio()``: the Rust/Tokio runtime keeps a sliding
+            window of ``prefetch_window`` (default 64) GETs permanently in flight.
+            No chunk boundaries, no Python-side threading, no idle gaps.
+
+        minio / s3torchconnector
+            Uses the chunked pipelined path: files are fetched in chunks of
+            ``prefetch_window`` (default 256, clamped to >= 1); the next chunk is
+            fetched on the shared single-worker ``_PREFETCH_POOL`` while the current
+            chunk is being yielded to the training loop.
+        """
+        args = self._args
+        batch_size = args.batch_size
+        num_spf = args.num_samples_per_file
+        opts = getattr(args, "storage_options", {}) or {}
+        dummy = args.resized_image
+
+        thread_entries = self.file_map.get(self.thread_index, [])
+        # Build deduplicated key list preserving epoch-shuffle order.
+        seen = set()
+        obj_keys = []
+        for _, obj_key, _ in thread_entries:
+            if obj_key not in seen:
+                seen.add(obj_key)
+                obj_keys.append(obj_key)
+
+        if not obj_keys:
+            return
+
+        # s3dlio: hand control to Rust — no stagger, no chunks needed.
+        if self._storage_library == "s3dlio":
+            yield from self._s3_stream_s3dlio(obj_keys)
+            return
+
+        # ── Other libraries: chunked pipelined path ───────────────────────────
+        # Worker stagger: worker k sleeps k × computation_time before its first
+        # fetch so workers don't all submit their first chunk simultaneously.
+        # Disable with storage_options.stagger_workers: false.
+        if str(opts.get("stagger_workers", "true")).lower() not in ("false", "0", "no"):
+            ct_raw = getattr(args, "computation_time", 0.0)
+            ct = ct_raw.get("mean", 0.0) if isinstance(ct_raw, dict) else float(ct_raw or 0.0)
+            delay = self.thread_index * ct
+            if delay > 0:
+                self.logger.debug(
+                    f"{utcnow()} {self.__class__.__name__} thread={self.thread_index} "
+                    f"stagger delay={delay:.4f}s ({self.thread_index} × {ct:.4f}s)"
+                )
+                time.sleep(delay)
+
+        window = max(1, int(opts.get("prefetch_window", 256)))  # 0/negative would break chunking
+        total = len(obj_keys)
+        n_chunks = (total + window - 1) // window
+        self.logger.info(
+            f"{utcnow()} {self.__class__.__name__} thread={self.thread_index} "
+            f"streaming {total} objects in {n_chunks} chunks of {window} "
+            f"via [{self._storage_library}]"
+        )
+
+        sample_buf = 0
+        chunks = [obj_keys[i : i + window] for i in range(0, total, window)]
+
+        # Prime the pipeline: start fetching chunk 0 in background.
+        future = _PREFETCH_POOL.submit(self._prefetch, chunks[0])
+
+        for idx, chunk in enumerate(chunks):
+            cache = future.result()
+            if idx + 1 < len(chunks):
+                future = _PREFETCH_POOL.submit(self._prefetch, chunks[idx + 1])
+            else:
+                future = None
+
+            self._object_cache = cache
+            for obj_key in chunk:
+                for s in range(num_spf):
+                    self.get_sample(obj_key, s)
+                    sample_buf += 1
+                    if sample_buf >= batch_size:
+                        yield dummy
+                        sample_buf -= batch_size
+            self._object_cache = {}
+        # Drop-last: remaining sample_buf < batch_size is silently discarded.
+
     def _s3_prefetch_all(self) -> None:
         """
-        Collect all object keys assigned to this thread and prefetch them.
+        Prefetch ALL object keys for this thread in one shot.
 
-        Call at the top of ``next()`` to bulk-load all objects before the
-        training iteration starts. Deduplicates object keys while preserving order
-        (an NPZ/NPY file may contain many samples mapped to the same key).
+        Used only by ``read_index()`` (on-demand map-style access) to warm the
+        cache before random access begins.  The main streaming path now uses
+        ``_s3_stream_next()`` instead to avoid thundering-herd GET spikes.
         """
         thread_entries = self.file_map.get(self.thread_index, [])
         seen = set()
@@ -342,3 +565,29 @@ def _s3_ensure_cached(self, filename: str) -> None:
         """Fetch a single object on demand if it is not already in the cache."""
         if filename not in self._object_cache:
             self._object_cache.update(self._prefetch([filename]))
+
+    def finalize_s3_bytes(self) -> None:
+        """
+        Update ``args.record_length`` from the actual bytes transferred this epoch.
+
+        Must be called from each reader's ``finalize()`` BEFORE resetting epoch
+        state.  Mirrors the same pattern used by ``ParquetReaderS3dlio.finalize()``.
+
+        Uses measured bytes per object (average across all objects fetched this
+        epoch) rather than the configured ``record_length_bytes`` estimate.  For
+        workloads with high file-size variance (e.g. UNet3D stdev ≈ 65 MiB),
+        this gives a more accurate per-epoch I/O throughput figure.
+
+        After updating, resets the epoch counters so the next epoch starts clean.
+        """
+        if self._total_objects_read > 0 and self._total_bytes_read > 0:
+            measured = self._total_bytes_read // self._total_objects_read
+            self._args.record_length = measured
+            self.logger.debug(
+                f"{utcnow()} {self.__class__.__name__} epoch done: "
+                f"actual {self._total_bytes_read / 1024**3:.3f} GiB read, "
+                f"{self._total_objects_read} objects, "
+                f"{measured:,} bytes/object"
+            )
+        self._total_bytes_read = 0
+        self._total_objects_read = 0
diff --git a/dlio_benchmark/reader/image_reader_iterable.py b/dlio_benchmark/reader/image_reader_iterable.py
new file mode 100644
index 00000000..fc4f56e3
--- /dev/null
+++ b/dlio_benchmark/reader/image_reader_iterable.py
@@ -0,0 +1,67 @@
+"""
+JPEG/PNG image reader for local filesystem using parallel prefetch.
+
+Mirrors the structure of image_reader_s3_iterable.py but for local files.
+Inherits FormatReader + _LocalFSIterableMixin only — no S3 code paths.
+
+Only the raw byte count is stored per file — no PIL or numpy decode is performed.
+"""
+# Copyright (c) 2025, UChicago Argonne, LLC. Apache 2.0 License.
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+from dlio_benchmark.reader.reader_handler import FormatReader
+from dlio_benchmark.reader._local_fs_iterable_mixin import _LocalFSIterableMixin
+from dlio_benchmark.utils.utility import utcnow, Profile, dft_ai
+
+dlp = Profile(MODULE_DATA_READER)
+
+
+class ImageReaderIterable(FormatReader, _LocalFSIterableMixin):
+    """
+    Parallel-prefetch JPEG/PNG reader for local filesystem.
+
+    All prefetch logic is in _LocalFSIterableMixin.
+    PIL/numpy decode is skipped — only raw byte count is kept for telemetry.
+    Both dlp and dft_ai are updated (matching ImageReader behaviour).
+    """
+
+    @dlp.log_init
+    def __init__(self, dataset_type, thread_index, epoch):
+        super().__init__(dataset_type, thread_index)
+        self._localfs_init()
+
+    @dlp.log
+    def open(self, filename):
+        super().open(filename)
+        return self._local_cache.get(filename, 0)
+
+    @dlp.log
+    def close(self, filename):
+        super().close(filename)
+
+    @dlp.log
+    def get_sample(self, filename, sample_index):
+        self.logger.debug(f"{utcnow()} sample_index {sample_index}, {self.image_idx}")
+        super().get_sample(filename, sample_index)
+        byte_count = self._local_cache.get(filename, 0)
+        dlp.update(image_size=byte_count)
+        dft_ai.update(image_size=byte_count)
+
+    def next(self):
+        yield from self._localfs_stream_next()
+
+    @dlp.log
+    def read_index(self, image_idx, step):
+        filename, _ = self.global_index_map[image_idx]
+        self._localfs_ensure_cached(filename)
+        dlp.update(step=step)
+        return super().read_index(image_idx, step)
+
+    @dlp.log
+    def finalize(self):
+        return super().finalize()
+
+    def is_index_based(self):
+        return True
+
+    def is_iterator_based(self):
+        return True
diff --git a/dlio_benchmark/reader/image_reader_s3_iterable.py b/dlio_benchmark/reader/image_reader_s3_iterable.py
index ebcb71ad..99633b4a 100644
--- a/dlio_benchmark/reader/image_reader_s3_iterable.py
+++ b/dlio_benchmark/reader/image_reader_s3_iterable.py
@@ -27,34 +27,37 @@
   s3dlio           — s3dlio.get_many(), up to 64 parallel requests, O(1) len(BytesView)
   s3torchconnector — S3IterableDataset.from_objects() + sequential reader
   minio            — ThreadPoolExecutor + Minio SDK, pooled TCP connections
+
+NOTE ON INHERITANCE
+-------------------
+This class inherits FormatReader + _S3IterableMixin ONLY.  It deliberately does
+NOT inherit ImageReader (the local-filesystem reader), which carries
+_LocalFSIterableMixin.  Keep these two hierarchies strictly separate:
+
+  Object storage:  ImageReaderS3Iterable(FormatReader, _S3IterableMixin)
+  Local filesystem: ImageReaderIterable(FormatReader, _LocalFSIterableMixin)
 """
 
 from dlio_benchmark.common.constants import MODULE_DATA_READER
-from dlio_benchmark.reader.image_reader import ImageReader
+from dlio_benchmark.reader.reader_handler import FormatReader
 from dlio_benchmark.reader._s3_iterable_mixin import _S3IterableMixin
 from dlio_benchmark.utils.utility import Profile, dft_ai, utcnow
 
 dlp = Profile(MODULE_DATA_READER)
 
 
-class ImageReaderS3Iterable(ImageReader, _S3IterableMixin):
+class ImageReaderS3Iterable(FormatReader, _S3IterableMixin):
     """
     Parallel-prefetch JPEG/PNG reader for S3-compatible object stores.
 
     All prefetch, library routing, and byte-counting logic is in _S3IterableMixin.
     This class is a thin adapter connecting the mixin to DLIO's FormatReader chain.
-
-    Images are 1 sample per file. open_file_map[filename] holds the raw byte count
-    (int) used only for telemetry. No PIL or numpy decode is performed.
-
-    ImageReader.get_sample() updates both dlp and dft_ai with image_size —
-    we replicate both calls here since we cannot call super().get_sample() (it
-    would try to call .nbytes on the cached int).
+    No local filesystem code is reachable from this class.
     """
 
     @dlp.log_init
     def __init__(self, dataset_type, thread_index, epoch):
-        super().__init__(dataset_type, thread_index, epoch)
+        super().__init__(dataset_type, thread_index)
         opts = getattr(self._args, "storage_options", {}) or {}
         self._s3_init(opts)
         self.logger.info(
@@ -72,17 +75,12 @@ def close(self, filename):
 
     @dlp.log
     def get_sample(self, filename, sample_index):
-        # Report byte count for both telemetry systems. Do NOT call super() —
-        # ImageReader.get_sample() calls open_file_map[filename].nbytes which would
-        # fail because open_file_map[filename] is now an int (byte count), not an array.
         byte_count = self._object_cache.get(filename, 0)
         dlp.update(image_size=byte_count)
         dft_ai.update(image_size=byte_count)
 
     def next(self):
-        self._s3_prefetch_all()
-        for batch in super().next():
-            yield batch
+        yield from self._s3_stream_next()
 
     # Override the local-FS hooks inherited (via ImageReader) from
     # _LocalFsIterableMixin so they are no-ops for the S3 reader.
@@ -103,6 +101,7 @@ def read_index(self, image_idx, step):
 
     @dlp.log
     def finalize(self):
+        self.finalize_s3_bytes()  # report actual bytes → args.record_length
         return super().finalize()
 
     def is_index_based(self):
diff --git a/dlio_benchmark/reader/npy_reader_iterable.py b/dlio_benchmark/reader/npy_reader_iterable.py
new file mode 100644
index 00000000..d298c96b
--- /dev/null
+++ b/dlio_benchmark/reader/npy_reader_iterable.py
@@ -0,0 +1,64 @@
+"""
+NPY reader for local filesystem using parallel prefetch.
+
+Mirrors the structure of npy_reader_s3_iterable.py but for local files.
+Inherits FormatReader + _LocalFSIterableMixin only — no S3 code paths.
+
+NPY files contain a single array per file (no named key). Only the raw byte
+count is stored per file — no numpy decode is performed.
+"""
+# Copyright (c) 2025, UChicago Argonne, LLC. Apache 2.0 License.
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+from dlio_benchmark.reader.reader_handler import FormatReader
+from dlio_benchmark.reader._local_fs_iterable_mixin import _LocalFSIterableMixin
+from dlio_benchmark.utils.utility import Profile
+
+dlp = Profile(MODULE_DATA_READER)
+
+
+class NPYReaderIterable(FormatReader, _LocalFSIterableMixin):
+    """
+    Parallel-prefetch NPY reader for local filesystem.
+
+    All prefetch logic is in _LocalFSIterableMixin.
+    np.load decode is skipped — only raw byte count is kept for telemetry.
+    """
+
+    @dlp.log_init
+    def __init__(self, dataset_type, thread_index, epoch):
+        super().__init__(dataset_type, thread_index)
+        self._localfs_init()
+
+    @dlp.log
+    def open(self, filename):
+        super().open(filename)
+        return self._local_cache.get(filename, 0)
+
+    @dlp.log
+    def close(self, filename):
+        super().close(filename)
+
+    @dlp.log
+    def get_sample(self, filename, sample_index):
+        super().get_sample(filename, sample_index)
+        dlp.update(image_size=self._local_cache.get(filename, 0))
+
+    def next(self):
+        yield from self._localfs_stream_next()
+
+    @dlp.log
+    def read_index(self, image_idx, step):
+        filename, _ = self.global_index_map[image_idx]
+        self._localfs_ensure_cached(filename)
+        dlp.update(step=step)
+        return super().read_index(image_idx, step)
+
+    @dlp.log
+    def finalize(self):
+        return super().finalize()
+
+    def is_index_based(self):
+        return True
+
+    def is_iterator_based(self):
+        return True
diff --git a/dlio_benchmark/reader/npy_reader_s3_iterable.py b/dlio_benchmark/reader/npy_reader_s3_iterable.py
index 507c0796..6c1be19b 100644
--- a/dlio_benchmark/reader/npy_reader_s3_iterable.py
+++ b/dlio_benchmark/reader/npy_reader_s3_iterable.py
@@ -35,30 +35,38 @@
 Each library is STRICTLY isolated — there is NO silent fallback to another
 library. Configuring a library that is not installed raises ImportError immediately
 at construction time, not later during I/O.
+
+NOTE ON INHERITANCE
+-------------------
+This class inherits FormatReader + _S3IterableMixin ONLY.  It deliberately does
+NOT inherit NPYReader (the local-filesystem reader), which carries
+_LocalFSIterableMixin.  Mixing that in would cause read_index to call
+_localfs_ensure_cached() — which calls open('s3://...') treating an object URI
+as a local path.  Keep these two hierarchies strictly separate:
+
+  Object storage:  NPYReaderS3Iterable(FormatReader, _S3IterableMixin)
+  Local filesystem: NPYReaderIterable(FormatReader, _LocalFSIterableMixin)
 """
 from dlio_benchmark.common.constants import MODULE_DATA_READER
-from dlio_benchmark.reader.npy_reader import NPYReader
+from dlio_benchmark.reader.reader_handler import FormatReader
 from dlio_benchmark.reader._s3_iterable_mixin import _S3IterableMixin
 from dlio_benchmark.utils.utility import Profile, utcnow
 
 dlp = Profile(MODULE_DATA_READER)
 
 
-class NPYReaderS3Iterable(NPYReader, _S3IterableMixin):
+class NPYReaderS3Iterable(FormatReader, _S3IterableMixin):
     """
     Parallel-prefetch NPY reader for S3-compatible object stores.
 
     All prefetch, library routing, and byte-counting logic is in _S3IterableMixin.
     This class is a thin adapter connecting the mixin to DLIO's FormatReader chain.
-
-    NPY files contain one array per file (no named key). Each object maps to
-    exactly one sample; open_file_map[filename] holds the byte count (int) used
-    only for the image_size telemetry metric — no numpy decode is performed.
+    No local filesystem code is reachable from this class.
     """
 
     @dlp.log_init
     def __init__(self, dataset_type, thread_index, epoch):
-        super().__init__(dataset_type, thread_index, epoch)
+        super().__init__(dataset_type, thread_index)
         opts = getattr(self._args, "storage_options", {}) or {}
         self._s3_init(opts)
         self.logger.info(
@@ -76,15 +84,10 @@ def close(self, filename):
 
     @dlp.log
     def get_sample(self, filename, sample_index):
-        # Report byte count for telemetry. Do NOT call super() — NPYReader.get_sample()
-        # does open_file_map[filename][..., sample_index].nbytes which would fail
-        # because open_file_map[filename] is now an int (byte count), not an array.
         dlp.update(image_size=self._object_cache.get(filename, 0))
 
     def next(self):
-        self._s3_prefetch_all()
-        for batch in super().next():
-            yield batch
+        yield from self._s3_stream_next()
 
     @dlp.log
     def read_index(self, image_idx, step):
@@ -95,6 +98,7 @@ def read_index(self, image_idx, step):
 
     @dlp.log
     def finalize(self):
+        self.finalize_s3_bytes()  # report actual bytes → args.record_length
         return super().finalize()
 
     def is_index_based(self):
diff --git a/dlio_benchmark/reader/npz_reader_iterable.py b/dlio_benchmark/reader/npz_reader_iterable.py
new file mode 100644
index 00000000..fe7027b4
--- /dev/null
+++ b/dlio_benchmark/reader/npz_reader_iterable.py
@@ -0,0 +1,64 @@
+"""
+NPZ reader for local filesystem using parallel prefetch.
+
+Mirrors the structure of npz_reader_s3_iterable.py but for local files.
+Inherits FormatReader + _LocalFSIterableMixin only — no S3 code paths.
+
+Design: see _local_fs_iterable_mixin.py for full rationale.
+Only the raw byte count is stored per file — no numpy decode.
+"""
+# Copyright (c) 2025, UChicago Argonne, LLC. Apache 2.0 License.
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+from dlio_benchmark.reader.reader_handler import FormatReader
+from dlio_benchmark.reader._local_fs_iterable_mixin import _LocalFSIterableMixin
+from dlio_benchmark.utils.utility import Profile
+
+dlp = Profile(MODULE_DATA_READER)
+
+
+class NPZReaderIterable(FormatReader, _LocalFSIterableMixin):
+    """
+    Parallel-prefetch NPZ reader for local filesystem.
+
+    All prefetch logic is in _LocalFSIterableMixin.
+    np.load decode is skipped — only raw byte count is kept for telemetry.
+    """
+
+    @dlp.log_init
+    def __init__(self, dataset_type, thread_index, epoch):
+        super().__init__(dataset_type, thread_index)
+        self._localfs_init()
+
+    @dlp.log
+    def open(self, filename):
+        super().open(filename)
+        return self._local_cache.get(filename, 0)
+
+    @dlp.log
+    def close(self, filename):
+        super().close(filename)
+
+    @dlp.log
+    def get_sample(self, filename, sample_index):
+        super().get_sample(filename, sample_index)
+        dlp.update(image_size=self._local_cache.get(filename, 0))
+
+    def next(self):
+        yield from self._localfs_stream_next()
+
+    @dlp.log
+    def read_index(self, image_idx, step):
+        filename, _ = self.global_index_map[image_idx]
+        self._localfs_ensure_cached(filename)
+        dlp.update(step=step)
+        return super().read_index(image_idx, step)
+
+    @dlp.log
+    def finalize(self):
+        return super().finalize()
+
+    def is_index_based(self):
+        return True
+
+    def is_iterator_based(self):
+        return True
diff --git a/dlio_benchmark/reader/npz_reader_s3_iterable.py b/dlio_benchmark/reader/npz_reader_s3_iterable.py
index 442e16e4..1c320598 100644
--- a/dlio_benchmark/reader/npz_reader_s3_iterable.py
+++ b/dlio_benchmark/reader/npz_reader_s3_iterable.py
@@ -8,27 +8,39 @@
   minio            — ThreadPoolExecutor + Minio SDK, pooled TCP connections
 
 Only the raw byte count is stored per object — no numpy decode.
+
+NOTE ON INHERITANCE
+-------------------
+This class inherits FormatReader + _S3IterableMixin ONLY.  It deliberately does
+NOT inherit NPZReader (the local-filesystem reader), which carries
+_LocalFSIterableMixin.  Mixing that in would cause read_index to call
+_localfs_ensure_cached() — which calls open('s3://...') treating an object URI
+as a local path.  Keep these two hierarchies strictly separate:
+
+  Object storage:  NPZReaderS3Iterable(FormatReader, _S3IterableMixin)
+  Local filesystem: NPZReaderIterable(FormatReader, _LocalFSIterableMixin)
 """
 # Copyright (c) 2025, UChicago Argonne, LLC. Apache 2.0 License.
 from dlio_benchmark.common.constants import MODULE_DATA_READER
-from dlio_benchmark.reader.npz_reader import NPZReader
+from dlio_benchmark.reader.reader_handler import FormatReader
 from dlio_benchmark.reader._s3_iterable_mixin import _S3IterableMixin
 from dlio_benchmark.utils.utility import Profile, utcnow
 
 dlp = Profile(MODULE_DATA_READER)
 
 
-class NPZReaderS3Iterable(NPZReader, _S3IterableMixin):
+class NPZReaderS3Iterable(FormatReader, _S3IterableMixin):
     """
     Parallel-prefetch NPZ reader for S3-compatible object stores.
 
     All prefetch, library routing, and byte-counting logic is in _S3IterableMixin.
     This class is a thin adapter connecting the mixin to DLIO's FormatReader chain.
+    No local filesystem code is reachable from this class.
     """
 
     @dlp.log_init
     def __init__(self, dataset_type, thread_index, epoch):
-        super().__init__(dataset_type, thread_index, epoch)
+        super().__init__(dataset_type, thread_index)
         opts = getattr(self._args, "storage_options", {}) or {}
         self._s3_init(opts)
         self.logger.info(
@@ -46,15 +58,10 @@ def close(self, filename):
 
     @dlp.log
     def get_sample(self, filename, sample_index):
-        # Report byte count for telemetry. Do NOT call super() — NPZReader.get_sample()
-        # does open_file_map[filename][..., sample_index].nbytes which would fail
-        # because open_file_map[filename] is now an int (byte count), not an array.
         dlp.update(image_size=self._object_cache.get(filename, 0))
 
     def next(self):
-        self._s3_prefetch_all()
-        for batch in super().next():
-            yield batch
+        yield from self._s3_stream_next()
 
     @dlp.log
     def read_index(self, image_idx, step):
@@ -65,6 +72,7 @@ def read_index(self, image_idx, step):
 
     @dlp.log
     def finalize(self):
+        self.finalize_s3_bytes()  # report actual bytes → args.record_length
         return super().finalize()
 
     def is_index_based(self):
diff --git a/dlio_benchmark/reader/parquet_reader_file_iterable.py b/dlio_benchmark/reader/parquet_reader_file_iterable.py
new file mode 100644
index 00000000..75ae7826
--- /dev/null
+++ b/dlio_benchmark/reader/parquet_reader_file_iterable.py
@@ -0,0 +1,312 @@
+"""
+   Copyright (c) 2025, UChicago Argonne, LLC
+   All Rights Reserved
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+"""
+"""
+Parquet reader for local and network filesystems using raw byte-range reads.
+
+Designed to be structurally identical to ParquetReaderS3Iterable so that
+file-storage and object-storage benchmarks are directly comparable:
+
+  S3 path   : HTTP byte-range GET  →  compressed RG bytes arrive over network
+  File path : open()/seek()/read() →  compressed RG bytes arrive from storage
+
+Both paths:
+  • Read ONLY the compressed row-group bytes (no pyarrow decode/decompress).
+  • Discard the data immediately — this is a storage benchmark.
+  • Report the same compressed-byte count to dlp telemetry.
+  • Use a ThreadPoolExecutor with `prefetch_workers` threads per worker process.
+  • Expose open_footer_only() / submit_rg_prefetch() for the sliding-window
+    iterator in TorchIterableDataset — the identical API means the same
+    __iter__ code path drives both backends.
+
+The key parity decision: we do NOT call pf.read_row_group() (which decompresses
+the data in Python) because the S3 path never decompresses. We compute the exact
+byte offset and length of each row group's column chunks from the parquet footer
+metadata, then issue a raw open/seek/read — matching the S3 range GET semantics
+byte-for-byte.
+
+Configuration (under storage_options in the DLIO YAML):
+  prefetch_workers: 64   # threads per worker process (default 64)
+  columns:          null # list of column names (null = all)
+  prefetch_window:  64   # sliding-window depth (controlled by TorchIterableDataset)
+
+Example YAML snippet:
+  storage:
+    storage_type: local_fs
+    storage_root: /path/to/parquet/root
+  dataset:
+    format: parquet
+    num_samples_per_file: 1000000
+    storage_options:
+      prefetch_workers: 64
+"""
+import bisect
+import os
+from concurrent.futures import ThreadPoolExecutor
+
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+from dlio_benchmark.common.enumerations import ReadType as _ReadType
+from dlio_benchmark.reader.reader_handler import FormatReader
+from dlio_benchmark.utils.utility import Profile, utcnow
+
+dlp = Profile(MODULE_DATA_READER)
+
+
+class ParquetReaderFileIterable(FormatReader):
+    """
+    Row-group-granular Parquet reader for local/network filesystems.
+
+    Reads compressed row-group bytes via open()/seek()/read() in a thread pool,
+    matching the throughput measurement semantics of ParquetReaderS3Iterable.
+
+    Exposes the same sliding-window API (open_footer_only / submit_rg_prefetch)
+    so that TorchIterableDataset drives both S3 and file backends identically.
+
+    DLIO's FormatReader protocol:
+      open(filename)            → returns (ParquetFile, None, cumulative_offsets)
+      get_sample(filename, idx) → bisect-locates RG, waits on prefetch future,
+                                  records compressed bytes, updates dlp
+      close(filename)           → delegates to FormatReader.close(); caches are kept
+      next() / read_index()     → next() delegates to the base class; read_index() is local
+
+    Cache format: _pf_cache[filename] = (pf, None, offsets)
+      The second element is None (no file-handle adapter needed; raw reads
+      open their own fd per call for thread safety).
+    """
+
+    @dlp.log_init
+    def __init__(self, dataset_type, thread_index, epoch):
+        """Set up per-reader caches and the raw-read thread pool.
+
+        epoch is only used in the startup log line; the base class receives
+        dataset_type and thread_index.
+        """
+        super().__init__(dataset_type, thread_index)
+        storage_opts = getattr(self._args, "storage_options", {}) or {}
+
+        # Column filter: a list of names, or None (also for an empty list) = all.
+        self._columns = storage_opts.get("columns") or None
+
+        # filename → (ParquetFile, None, cumulative_offsets); cleared in finalize().
+        self._pf_cache: dict = {}
+        # (filename, rg_idx) → compressed byte count (int); cleared in finalize().
+        self._rg_cache: dict = {}
+        # (filename, rg_idx) → Future resolving to that byte count.
+        self._prefetch_futures: dict = {}
+
+        # Pool that runs the raw seek/read calls; sized by prefetch_workers.
+        n_workers = int(storage_opts.get("prefetch_workers", 64))
+        self._prefetch_executor = ThreadPoolExecutor(
+            thread_name_prefix="rg-file-prefetch", max_workers=n_workers
+        )
+
+        startup_msg = (
+            f"{utcnow()} ParquetReaderFileIterable "
+            f"thread={thread_index} epoch={epoch} "
+            f"prefetch_workers={n_workers} columns={self._columns}"
+        )
+        self.logger.info(startup_msg)
+
+    # ── I/O helpers ──────────────────────────────────────────────────────────
+
+    @staticmethod
+    def _fetch_rg_file(filename: str, rg_start: int, rg_len: int, compressed_bytes: int) -> int:
+        """Read and discard one row group's raw byte extent (runs on a pool thread).
+
+        A fresh handle is opened for every call, so concurrent reads never share
+        a file position.  Nothing is opened when rg_len is zero or negative.
+        The bytes are dropped as soon as read() returns; nothing is decoded.
+
+        Returns compressed_bytes unchanged, so the Future wrapping this call
+        carries the count that get_sample() records.
+        """
+        if rg_len <= 0:
+            return compressed_bytes
+        with open(filename, "rb") as raw:
+            raw.seek(rg_start)
+            raw.read(rg_len)
+        return compressed_bytes
+
+    def _rg_byte_range(self, pf, rg_idx):
+        """Compute (rg_start_bytes, rg_len_bytes, all_compressed_bytes) for rg_idx.
+
+        The extent runs from the lowest chunk start to the highest chunk end of
+        the selected columns; all_compressed_bytes sums every column in the RG.
+        """
+        meta = pf.metadata
+        schema = pf.schema_arrow
+        n_cols = meta.row_group(0).num_columns if meta.num_row_groups > 0 else 0
+
+        if self._columns is not None:
+            col_names = set(self._columns)
+            col_indices = [i for i in range(n_cols) if schema.field(i).name in col_names]
+        else:
+            col_indices = list(range(n_cols))
+
+        rg_meta = meta.row_group(rg_idx)
+        all_comp = sum(
+            rg_meta.column(c).total_compressed_size for c in range(rg_meta.num_columns)
+        )
+        rg_start_b = rg_end_b = None
+        for ci in col_indices:
+            cm = rg_meta.column(ci)
+            # pyarrow yields None (not 0) when a chunk has no dictionary page, and
+            # `None > 0` raises TypeError, so guard before comparing.
+            dict_off = cm.dictionary_page_offset
+            s = dict_off if dict_off is not None and dict_off > 0 else cm.data_page_offset
+            e = s + cm.total_compressed_size
+            rg_start_b = s if rg_start_b is None else min(rg_start_b, s)
+            rg_end_b = e if rg_end_b is None else max(rg_end_b, e)
+        rg_len = (rg_end_b - rg_start_b) if rg_start_b is not None else 0
+        return rg_start_b or 0, rg_len, all_comp
+
+    # ── Sliding-window helpers (identical API to ParquetReaderS3Iterable) ────
+
+    def open_footer_only(self, filename):
+        """Parse the footer of filename and cache (pf, None, offsets); no reads scheduled.
+
+        offsets[i] is the first row index of row group i and offsets[-1] the
+        total row count.  The middle slot is None because raw reads open their
+        own file handle per call.
+        """
+        cached = self._pf_cache.get(filename)
+        if cached is not None:
+            return cached
+        import pyarrow.parquet as pq
+
+        pf = pq.ParquetFile(filename)
+        row_total, offsets = 0, [0]
+        for rg in range(pf.metadata.num_row_groups):
+            row_total += pf.metadata.row_group(rg).num_rows
+            offsets.append(row_total)
+        entry = self._pf_cache[filename] = (pf, None, offsets)
+        return entry
+
+    def submit_rg_prefetch(self, filename, rg_idx):
+        """Queue the raw read for one row group and return its Future.
+
+        Idempotent: a second call for the same (filename, rg_idx) returns the
+        Future already queued.  The footer must be cached first (KeyError if not).
+        """
+        slot = (filename, rg_idx)
+        pending = self._prefetch_futures.get(slot)
+        if pending is not None:
+            return pending
+
+        parquet_file = self._pf_cache[filename][0]
+        extent = self._rg_byte_range(parquet_file, rg_idx)
+        # extent is (start, length, compressed_bytes), passed on positionally.
+        pending = self._prefetch_executor.submit(self._fetch_rg_file, filename, *extent)
+        self._prefetch_futures[slot] = pending
+        return pending
+
+    # ── FormatReader interface ────────────────────────────────────────────────
+
+    @dlp.log
+    def open(self, filename):
+        """Return (pf, None, offsets) for filename, reading the footer at most
+        once per epoch.  Eagerly submits all RG prefetches (legacy open() path).
+
+        Footer parsing and prefetch submission are delegated to
+        open_footer_only() and submit_rg_prefetch() so this eager path and the
+        sliding-window path share one implementation.  A filename already in
+        the footer cache is returned as-is and schedules nothing new.
+        The returned tuple is the same object that is stored in _pf_cache.
+        """
+        if filename in self._pf_cache:
+            return self._pf_cache[filename]
+
+        # Cache miss: parse the footer once.  open_footer_only() stores the
+        # (pf, None, offsets) entry in _pf_cache before returning it.
+        entry = self.open_footer_only(filename)
+        pf, _, offsets = entry
+        n_row_groups = pf.metadata.num_row_groups
+
+        self.logger.debug(
+            f"{utcnow()} ParquetReaderFileIterable.open {filename} "
+            f"row_groups={n_row_groups} total_rows={offsets[-1]}"
+        )
+
+        # Eagerly queue every RG read (legacy path).  submit_rg_prefetch() is
+        # idempotent, so a row group that is already queued is not read twice;
+        # get_sample() later pops each Future and waits on it.
+        for rg_i in range(n_row_groups):
+            self.submit_rg_prefetch(filename, rg_i)
+
+        return entry
+
+    @dlp.log
+    def close(self, filename):
+        """Delegate to FormatReader.close(); this method itself leaves the caches alone.
+        _pf_cache / _rg_cache are only flushed at the epoch boundary in finalize().
+        """
+        super().close(filename)
+
+    @dlp.log
+    def get_sample(self, filename, sample_index):
+        """Account for the row group that holds sample_index.
+
+        The first touch of a row group waits on its prefetch Future (or reads
+        synchronously if none was queued); later touches reuse the cached count.
+        """
+        pf, _, offsets = self.open_file_map[filename]
+
+        # Last RG whose first-row offset is <= sample_index, clamped to valid range.
+        last_rg = pf.metadata.num_row_groups - 1
+        rg_idx = min(max(0, bisect.bisect_right(offsets, sample_index) - 1), last_rg)
+        slot = (filename, rg_idx)
+        if slot not in self._rg_cache:
+            pending = self._prefetch_futures.pop(slot, None)
+            if pending is None:
+                start_b, length_b, nbytes = self._rg_byte_range(pf, rg_idx)
+                self._fetch_rg_file(filename, start_b, length_b, nbytes)
+            else:
+                nbytes = pending.result()
+            self._rg_cache[slot] = nbytes
+        dlp.update(image_size=self._rg_cache[slot])
+
+    def next(self):
+        # Batches come straight from FormatReader.next(); nothing is added here.
+        yield from super().next()
+
+    @dlp.log
+    def read_index(self, image_idx, step):
+        """Resolve image_idx to (file, sample), account for it, return resized_image."""
+        filename, sample_index = self.global_index_map[image_idx]
+        handle_missing = filename not in self.open_file_map
+        if handle_missing or self.open_file_map[filename] is None:
+            self.open_file_map[filename] = self.open(filename)
+        self.get_sample(filename, sample_index)
+        # ON_DEMAND drops the handle after every sample; open() re-serves it from cache.
+        if self._args.read_type is _ReadType.ON_DEMAND:
+            self.open_file_map[filename] = None
+        return self._args.resized_image
+
+    @dlp.log
+    def finalize(self):
+        """Epoch-boundary cleanup: cancel pending prefetches, then drop every cache."""
+        # cancel() only stops reads still queued; one already running finishes.
+        for fut in self._prefetch_futures.values():
+            fut.cancel()
+        for cache in (self._prefetch_futures, self._pf_cache, self._rg_cache):
+            cache.clear()
+        return super().finalize()
+
+    def is_index_based(self):
+        return True  # read_index() is implemented by this class
+
+    def is_iterator_based(self):
+        return True  # next() is implemented by this class
diff --git a/dlio_benchmark/reader/parquet_reader_s3_iterable.py b/dlio_benchmark/reader/parquet_reader_s3_iterable.py
index 5a80f82a..dcab0f89 100644
--- a/dlio_benchmark/reader/parquet_reader_s3_iterable.py
+++ b/dlio_benchmark/reader/parquet_reader_s3_iterable.py
@@ -56,9 +56,11 @@
 """
 import bisect
 import os
+from concurrent.futures import ThreadPoolExecutor
 from urllib.parse import urlparse
 
 from dlio_benchmark.common.constants import MODULE_DATA_READER
+from dlio_benchmark.common.enumerations import ReadType as _ReadType
 from dlio_benchmark.reader.reader_handler import FormatReader
 from dlio_benchmark.utils.utility import Profile, utcnow
 
@@ -248,7 +250,7 @@ def __init__(self, dataset_type, thread_index, epoch):
             raise ValueError(
                 "storage_options['storage_library'] is required for S3 readers. "
                 "Add 'storage_library: <value>' under the 'storage:' section of "
-                "your workload YAML.  Supported values: minio, s3dlio, s3torchconnector."
+                "your workload YAML.  Supported values: minio, s3dlio, s3torchconnector, direct."
             )
         self._opts = opts
         self._epoch = epoch
@@ -271,8 +273,20 @@ def __init__(self, dataset_type, thread_index, epoch):
         # Flushed at finalize() (epoch boundary).
         self._rg_cache: dict = {}
 
+        # Prefetch thread pool (s3dlio, s3torchconnector, direct): when a file is
+        # opened via open(), all of its row-group extents are submitted as background
+        # tasks immediately.  By the time get_sample() requests a given row group the
+        # data has usually been fetched; otherwise get_sample() waits on its future.
+        self._prefetch_futures: dict = {}  # (filename, rg_idx) -> Future[int]
+        self._prefetch_executor: ThreadPoolExecutor | None = None
+        if self._storage_library in ("s3dlio", "s3torchconnector", "direct"):
+            max_w = int(opts.get("prefetch_workers", 64))
+            self._prefetch_executor = ThreadPoolExecutor(
+                max_workers=max_w, thread_name_prefix="rg-prefetch"
+            )
+
         # s3dlio reads AWS_ENDPOINT_URL_S3 at runtime; set it early if needed.
-        if self._storage_library == "s3dlio":
+        if self._storage_library in ("s3dlio", "direct"):
             ep = opts.get("endpoint_url")
             if ep and not os.environ.get("AWS_ENDPOINT_URL_S3"):
                 os.environ["AWS_ENDPOINT_URL_S3"] = ep
@@ -325,12 +339,27 @@ def __init__(self, dataset_type, thread_index, epoch):
     # ── Helpers ──────────────────────────────────────────────────────────────
 
     def _uri_for_obj_key(self, obj_key: str) -> str:
-        """Return a full URI for a DLIO object key, using the configured uri_scheme."""
+        """Return a full URI for a DLIO object key.
+
+        For direct:// (O_DIRECT local files) the URI is direct:///absolute/path.
+        The filename DLIO passes is an absolute path, so we just swap the scheme.
+        For S3/object storage we compose scheme://bucket/key as before.
+        """
         if "://" in obj_key:
+            # Already a full URI — honour it but convert s3:// → direct:// if needed.
+            if self._storage_library == "direct" and obj_key.startswith("s3://"):
+                # DLIO may have built an s3:// URI from storage_root; keep only the
+                # URI path (any netloc/bucket part is dropped).  urlparse: module import.
+                parsed = urlparse(obj_key)
+                return f"direct://{parsed.path}"
             return obj_key
+        if self._storage_library == "direct":
+            # obj_key is an absolute path like /mnt/test/dlrm/train/img_00_of_64.parquet
+            # direct:// URIs have empty host + absolute path → triple slash.
+            return f"direct://{obj_key if obj_key.startswith('/') else '/' + obj_key}"
         scheme = self._opts.get("uri_scheme", "s3")
         root = self._args.storage_root.rstrip("/")
         return f"{scheme}://{root}/{obj_key.lstrip('/')}"
 
     def _uri_for_filename(self, filename: str) -> str:
         """Alias for _uri_for_obj_key for backward compatibility."""
@@ -361,7 +390,7 @@ def _make_range_file(self, filename: str):
         """Create a seekable file-like I/O adapter for the given object key."""
         uri = self._uri_for_obj_key(filename)
         lib = self._storage_library
-        if lib == "s3dlio":
+        if lib in ("s3dlio", "direct"):
             return _S3RangeFile(uri)
         elif lib == "s3torchconnector":
             # Use s3torchconnector's native range-based reader directly.
@@ -387,6 +416,108 @@ def _make_range_file(self, filename: str):
                 "supported: s3dlio, s3torchconnector, minio"
             )
 
+    def _fetch_rg_s3dlio(self, uri: str, rg_start: int, rg_length: int, compressed_bytes: int) -> int:
+        """Issue one ranged read through s3dlio and drop the payload (pool thread).
+
+        An extent of zero or fewer bytes is skipped without importing s3dlio.
+        s3dlio.get_range() is described by the author as releasing the GIL for I/O.
+        Returns compressed_bytes unchanged so the Future carries the count that
+        get_sample() stores in _rg_cache.
+        """
+        if rg_length <= 0:
+            return compressed_bytes
+        import s3dlio
+        s3dlio.get_range(uri, rg_start, rg_length)
+        return compressed_bytes
+
+    def _fetch_rg_s3torch(self, uri: str, rg_start: int, rg_length: int, compressed_bytes: int) -> int:
+        """Issue one ranged read through s3torchconnector and drop the payload.
+
+        Runs on a pool thread.  A new range-based reader is obtained from
+        self._s3torch_client on every call, positioned at rg_start, and asked
+        for rg_length bytes; an extent of zero or fewer bytes is skipped.
+        Returns compressed_bytes unchanged for _rg_cache.
+        """
+        if rg_length <= 0:
+            return compressed_bytes
+        from s3torchconnector import S3ReaderConstructor
+        from urllib.parse import urlparse
+        # uri is scheme://bucket/key: netloc is the bucket, the path is the key.
+        location = urlparse(uri)
+        object_reader = self._s3torch_client.get_object(
+            bucket=location.netloc,
+            key=location.path.lstrip("/"),
+            reader_constructor=S3ReaderConstructor.range_based(),
+        )
+        object_reader.seek(rg_start)
+        object_reader.read(rg_length)
+        return compressed_bytes
+
+    # ── Sliding-window prefetch helpers ─────────────────────────────────────
+
+    def open_footer_only(self, filename):
+        """Fetch and parse the footer of filename, caching (pf, rf, offsets).
+
+        No prefetch Futures are submitted; scheduling is left to the caller.
+        offsets[i] is the first row index of row group i; offsets[-1] the total.
+        """
+        cached = self._pf_cache.get(filename)
+        if cached is not None:
+            return cached
+        import pyarrow.parquet as pq
+        rf = self._make_range_file(filename)
+        pf = pq.ParquetFile(rf)
+        row_total, offsets = 0, [0]
+        for rg in range(pf.metadata.num_row_groups):
+            row_total += pf.metadata.row_group(rg).num_rows
+            offsets.append(row_total)
+        entry = self._pf_cache[filename] = (pf, rf, offsets)
+        return entry
+
+    def submit_rg_prefetch(self, filename, rg_idx):
+        """Submit a single RG byte-range GET to the background executor.
+        Returns the Future (or None if executor not available).
+        Call this from the sliding-window iterator to fill exactly one
+        slot when a previous slot is consumed.  Idempotent per (filename, rg_idx).
+        """
+        if self._prefetch_executor is None:
+            return None
+        key = (filename, rg_idx)
+        if key in self._prefetch_futures:
+            return self._prefetch_futures[key]  # already submitted
+
+        pf, rf, offsets = self._pf_cache[filename]
+        meta = pf.metadata
+        schema = pf.schema_arrow
+        n_cols = meta.row_group(0).num_columns if meta.num_row_groups > 0 else 0
+        if self._columns is not None:
+            col_names = set(self._columns)
+            col_indices = [i for i in range(n_cols) if schema.field(i).name in col_names]
+        else:
+            col_indices = list(range(n_cols))
+
+        rg_meta = meta.row_group(rg_idx)
+        all_comp = sum(rg_meta.column(c).total_compressed_size for c in range(rg_meta.num_columns))
+        rg_start_b = rg_end_b = None
+        for ci in col_indices:
+            cm = rg_meta.column(ci)
+            # pyarrow gives None when there is no dictionary page; `None > 0` raises.
+            dict_off = cm.dictionary_page_offset
+            s = dict_off if dict_off is not None and dict_off > 0 else cm.data_page_offset
+            e = s + cm.total_compressed_size
+            rg_start_b = s if rg_start_b is None else min(rg_start_b, s)
+            rg_end_b = e if rg_end_b is None else max(rg_end_b, e)
+        rg_len = (rg_end_b - rg_start_b) if rg_start_b is not None else 0
+
+        if self._storage_library == "s3torchconnector":
+            fetch_fn = self._fetch_rg_s3torch
+        else:
+            fetch_fn = self._fetch_rg_s3dlio  # handles both s3dlio and direct://
+        uri = self._uri_for_obj_key(filename)
+        fut = self._prefetch_executor.submit(fetch_fn, uri, rg_start_b or 0, rg_len, all_comp)
+        self._prefetch_futures[key] = fut
+        return fut
+
     # ── FormatReader interface ────────────────────────────────────────────────
 
     @dlp.log
@@ -417,7 +548,55 @@ def open(self, filename):
             f"{utcnow()} ParquetReaderS3Iterable.open {filename} "
             f"row_groups={meta.num_row_groups} total_rows={offsets[-1]}"
         )
-        self._pf_cache[filename] = (pf, offsets)
+        # Eagerly prefetch all row-group extents in background threads (s3dlio,
+        # s3torchconnector, and direct).  Up to prefetch_workers (default 64)
+        # range reads are in flight at once.  By the time get_sample() is called
+        # for a given (filename, rg_idx) the data has usually been fetched
+        # already; otherwise get_sample() blocks on the future until it completes.
+        if self._prefetch_executor is not None:
+            uri = self._uri_for_obj_key(filename)
+            schema = pf.schema_arrow
+            n_cols = meta.row_group(0).num_columns if meta.num_row_groups > 0 else 0
+            if self._columns is not None:
+                col_names = set(self._columns)
+                col_indices = [
+                    i for i in range(n_cols)
+                    if schema.field(i).name in col_names
+                ]
+            else:
+                col_indices = list(range(n_cols))
+            # Select the appropriate fetch function for this storage library.
+            if self._storage_library == "s3torchconnector":
+                fetch_fn = self._fetch_rg_s3torch
+            else:
+                fetch_fn = self._fetch_rg_s3dlio  # handles both s3dlio and direct://
+            for rg_i in range(meta.num_row_groups):
+                rg_meta = meta.row_group(rg_i)
+                all_comp = sum(
+                    rg_meta.column(c).total_compressed_size
+                    for c in range(rg_meta.num_columns)
+                )
+                rg_start_b = rg_end_b = None
+                for ci in col_indices:
+                    cm = rg_meta.column(ci)
+                    s = (
+                        cm.dictionary_page_offset
+                        if cm.dictionary_page_offset > 0
+                        else cm.data_page_offset
+                    )
+                    e = s + cm.total_compressed_size
+                    if rg_start_b is None or s < rg_start_b:
+                        rg_start_b = s
+                    if rg_end_b is None or e > rg_end_b:
+                        rg_end_b = e
+                rg_len = (rg_end_b - rg_start_b) if rg_start_b is not None else 0
+                self._prefetch_futures[(filename, rg_i)] = self._prefetch_executor.submit(
+                    fetch_fn, uri, rg_start_b or 0, rg_len, all_comp
+                )
+        # Store rf alongside pf so get_sample can do raw byte reads without
+        # going through pq.ParquetFile.read_row_group() (which would attempt
+        # Thrift page-header deserialization on potentially synthetic bytes).
+        self._pf_cache[filename] = (pf, rf, offsets)
         return self._pf_cache[filename]
 
     @dlp.log
@@ -442,7 +621,7 @@ def get_sample(self, filename, sample_index):
         is extracted. No row data is held between calls. The DLIO pipeline consumes
         self._args.resized_image (a pre-allocated random tensor) not actual file data.
         """
-        pf, offsets = self.open_file_map[filename]
+        pf, rf, offsets = self.open_file_map[filename]
 
         # Binary search: find rg_idx such that offsets[rg_idx] <= sample_index
         # < offsets[rg_idx + 1].  bisect_right on offsets gives insertion point
@@ -452,14 +631,44 @@ def get_sample(self, filename, sample_index):
 
         cache_key = (filename, rg_idx)
         if cache_key not in self._rg_cache:
-            # Fetch this row group — triggers the range GETs we are benchmarking.
-            table = pf.read_row_group(rg_idx, columns=self._columns)
-            rg_meta = pf.metadata.row_group(rg_idx)
-            compressed_bytes = sum(
-                rg_meta.column(c).total_compressed_size
-                for c in range(rg_meta.num_columns)
-            )
-            del table  # discard immediately — we are NOT a training framework
+            if cache_key in self._prefetch_futures:
+                # Background prefetch is in flight or already done — just wait.
+                # For the first access the data is usually already fetched;
+                # for the rare case it isn't, we block until it completes.
+                compressed_bytes = self._prefetch_futures.pop(cache_key).result()
+            else:
+                # Synchronous fallback: non-s3dlio libraries, or the (unlikely)
+                # case where prefetch was not submitted for this row group.
+                rg_meta = pf.metadata.row_group(rg_idx)
+                compressed_bytes = sum(
+                    rg_meta.column(c).total_compressed_size
+                    for c in range(rg_meta.num_columns)
+                )
+                schema = pf.schema_arrow
+                if self._columns is not None:
+                    col_names = set(self._columns)
+                    col_indices = [
+                        i for i in range(rg_meta.num_columns)
+                        if schema.field(i).name in col_names
+                    ]
+                else:
+                    col_indices = range(rg_meta.num_columns)
+                rg_start = rg_end = None
+                for ci in col_indices:
+                    cm = rg_meta.column(ci)
+                    chunk_start = (
+                        cm.dictionary_page_offset
+                        if cm.dictionary_page_offset > 0
+                        else cm.data_page_offset
+                    )
+                    chunk_end = chunk_start + cm.total_compressed_size
+                    if rg_start is None or chunk_start < rg_start:
+                        rg_start = chunk_start
+                    if rg_end is None or chunk_end > rg_end:
+                        rg_end = chunk_end
+                if rg_start is not None and rg_end > rg_start:
+                    rf.seek(rg_start)
+                    rf.read(rg_end - rg_start)
             self._rg_cache[cache_key] = compressed_bytes  # int only; negligible RAM
 
         dlp.update(image_size=self._rg_cache[cache_key])
@@ -468,14 +677,33 @@ def next(self):
         for batch in super().next():
             yield batch
 
-    @dlp.log
     def read_index(self, image_idx, step):
-        dlp.update(step=step)
-        return super().read_index(image_idx, step)
+        """Fast read_index that skips the base-class per-sample utcnow() overhead.
+
+        FormatReader.read_index() calls datetime.now().strftime() twice per
+        sample for debug logging.  With 16 million samples per worker that adds
+        ~48 seconds of pure Python overhead unrelated to I/O.  We replicate
+        the essential logic — open (cached footer) → get_sample → on-demand
+        close — without the logging cost.  `step` is accepted but not used.
+        """
+        filename, sample_index = self.global_index_map[image_idx]
+        if (
+            filename not in self.open_file_map
+            or self.open_file_map[filename] is None
+        ):
+            self.open_file_map[filename] = self.open(filename)
+        self.get_sample(filename, sample_index)
+        if self._args.read_type is _ReadType.ON_DEMAND:
+            self.open_file_map[filename] = None
+        return self._args.resized_image
 
     @dlp.log
     def finalize(self):
-        """Flush both caches at epoch boundary."""
+        """Flush all caches and cancel outstanding prefetch futures at epoch boundary."""
+        # Cancel futures not yet consumed; cancel() cannot stop a read already running.
+        for fut in self._prefetch_futures.values():
+            fut.cancel()
+        self._prefetch_futures.clear()
         self._pf_cache.clear()
         self._rg_cache.clear()
         return super().finalize()
diff --git a/dlio_benchmark/reader/parquet_reader_s3dlio.py b/dlio_benchmark/reader/parquet_reader_s3dlio.py
new file mode 100644
index 00000000..2bee41e0
--- /dev/null
+++ b/dlio_benchmark/reader/parquet_reader_s3dlio.py
@@ -0,0 +1,618 @@
+"""Parquet reader backed by s3dlio's PyParquetIndex.
+
+Concurrency model — consumer-driven bounded pipeline
+----------------------------------------------------
+At epoch_init we build an ordered access plan (all RGs in consumption order)
+and seed with exactly `window_size` in-flight GETs. read_index drives the
+pipeline: it POPS the future (releasing the byte payload immediately) and
+then submits the next item from the plan. Memory is strictly bounded.
+
+  Memory guarantee:
+    _active_futures holds at most window_size entries at any moment.
+    Each entry holds one coalesced group (coalesce_rgs row groups, ~8 MB each) until popped.
+    Peak RAM = window_size × coalesce_rgs × rg_size (e.g. 32 × 8 × 8 MB = 2 GiB).
+
+  NO done_callback chaining: callbacks ran ahead of the consumer and
+  accumulated ALL completed futures in memory (61 GB for 64-file DLRM).
+  Consumer-driven is the only safe design.
+
+  read_index(image_idx):
+      bisect lookup  ->  (uri, rg_idx)          # pure Python, ~200 ns
+      if rg_key in _rg_done: return             # fast path, ~50 ns
+      fut = _active_futures.pop(rg_key)         # removes entry → frees RAM
+      fut.result()                              # usually already done
+      _pipeline_submit_next()                   # refill one slot
+
+Window size:
+  window_size = min(prefetch_workers, max_outstanding)
+  max_outstanding = min(user_cap=1024, max(4, free_ram * 0.5 / (rg_size × coalesce_rgs)))
+
+Bisect table reuse:
+  PyParquetIndex (Rust DashMap) retains footer metadata across epochs.
+  Python bisect tables (_rg_sample_boundaries, _rg_ranges_cache) are also
+  retained — the sample→RG mapping is immutable (files never change).
+  Only _active_futures, _rg_done, and the plan iterator are reset per epoch.
+"""
+import bisect
+import os
+import threading
+import time
+from concurrent.futures import ThreadPoolExecutor, Future
+
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+from dlio_benchmark.common.enumerations import StorageType
+from dlio_benchmark.reader.reader_handler import FormatReader
+from dlio_benchmark.utils.utility import Profile, utcnow
+
+dlp = Profile(MODULE_DATA_READER)
+
+
+def _free_ram_bytes() -> int:
+    """Return available physical RAM in bytes, or 8 GiB when the platform cannot report it."""
+    try:
+        pages = os.sysconf("SC_AVAIL_PHYS_PAGES")
+        page_sz = os.sysconf("SC_PAGE_SIZE")
+        if pages > 0 and page_sz > 0:
+            return pages * page_sz
+    except (AttributeError, ValueError, OSError):  # AttributeError: os.sysconf is absent on non-Unix platforms
+        pass  # ValueError/OSError: name unknown or unsupported on this host — use the fallback below
+    return 8 * 1024 ** 3  # conservative fallback: 8 GiB
+
+
+def _compute_max_outstanding(rg_size: int, user_cap: int = 1024) -> int:
+    """
+    Memory-bounded window size.
+    - Small objects (< 1 MiB): return user_cap as-is (default 1024); free RAM is not consulted.
+    - Large objects: 50% of free RAM / rg_size, floored at 4, then capped at user_cap.
+    - The result is below 4 only when user_cap itself is below 4.
+    """
+    if rg_size < 1024 * 1024:
+        return user_cap
+    free = _free_ram_bytes()
+    budget = max(4, int(0.5 * free / rg_size))
+    return min(user_cap, budget)
+
+
+class ParquetReaderS3dlio(FormatReader):
+    """
+    Parquet reader using s3dlio.PyParquetIndex + bounded sliding-window prefetch.
+
+    At most min(prefetch_workers, max_outstanding) coalesced GETs are in-flight at any time.
+    Memory target: max_outstanding * coalesced_size <= 50% free RAM (subject to a floor of 4 slots).
+    """
+
+    @dlp.log_init
+    def __init__(self, dataset_type, thread_index, epoch_number):
+        super().__init__(dataset_type, thread_index)
+
+        import s3dlio
+        # Partition Tokio worker threads across MPI ranks so NP processes don't
+        # all claim every core.  Auto-detects OMPI_COMM_WORLD_SIZE / PMI_SIZE /
+        # WORLD_SIZE; must be called before the first parquet_get_rg / get_range.
+        s3dlio.configure_tokio_threads()
+
+        opts = getattr(self._args, "storage_options", {}) or {}
+        self._opts = opts
+        self._epoch = epoch_number
+
+        col_opt = opts.get("columns")
+        self._columns = list(col_opt) if col_opt is not None else None
+
+        self._num_workers = int(opts.get("prefetch_workers", opts.get("prefetch", 32)))  # falls back to "prefetch", then 32
+        self._footer_cap = int(opts.get("footer_cap", 4 * 1024 * 1024))
+        self._footer_batch = int(opts.get("footer_batch_size", 16))
+        # Number of consecutive row groups to coalesce into one byte-range GET.
+        # E.g. coalesce_rgs=8 turns 8 × 8 MB RGs into one 64 MB request.
+        # Fewer, larger requests = less HTTP overhead = higher throughput.
+        self._coalesce_rgs: int = max(1, int(opts.get("coalesce_rgs", 8)))
+
+        ep = opts.get("endpoint_url")
+        if ep and not os.environ.get("AWS_ENDPOINT_URL_S3"):  # an already-set AWS_ENDPOINT_URL_S3 is never overridden
+            os.environ["AWS_ENDPOINT_URL_S3"] = ep
+
+        # Persistent Rust index — retained across epochs
+        self._index = s3dlio.PyParquetIndex(
+            col_indices=self._columns,
+            footer_cap=self._footer_cap,
+        )
+
+        # URI cache: DLIO filename -> full s3:// or file:// URI
+        self._uri_cache: dict[str, str] = {}
+
+        # Per-epoch state (reset by _epoch_reset)
+        self._executor: ThreadPoolExecutor | None = None
+        self._active_futures: dict[tuple, Future] = {}  # (uri, group_start_rg) → Future; simulate mode stores (uri, offset, length) tuples
+        self._rg_done: set = set()                       # (uri, group_start_rg) consumed this epoch
+        self._max_outstanding: int = self._num_workers   # refined after footer fetch
+        self._rg_size_hint: int = 8 * 1024 * 1024       # updated at first epoch_init
+        self._total_bytes_read: int = 0                  # MEASURED: actual bytes from S3 this epoch
+        # Plan iterator and its lock (the lock serialises plan advancement in _pipeline_submit_next)
+        self._plan_iter = iter([])
+        self._plan_list: list = []          # full plan kept for iter_epoch
+        self._plan_lock = threading.Lock()
+        self._epoch_inited = False
+
+        # Pure-Python sample→RG lookup tables built once at epoch_init.
+        # _rg_sample_boundaries[uri] = sorted list of first sample index for each RG
+        #   e.g. [0, 8130, 16260, ...] for 123 RGs
+        # _rg_ranges_cache[uri] = list of (offset, length) parallel to _rg_sample_boundaries
+        # Lookup: bisect_right(boundaries, sample_index) - 1  →  rg_idx  (pure Python, ~200 ns)
+        self._rg_sample_boundaries: dict[str, list] = {}
+        self._rg_ranges_cache: dict[str, list] = {}
+
+        # ── Simulate-IO mode ────────────────────────────────────────────────────
+        # Set storage_options.simulate_io: true to skip all network I/O and
+        # instead log every read_index decision to a TSV file.  Runs in
+        # seconds; lets you inspect plan order, hit/fallback rates, and exact
+        # access sequence without waiting for real data transfers.
+        self._simulate: bool = str(opts.get("simulate_io", "false")).lower() in ("true", "1", "yes")
+        self._sim_log_secs: float = float(opts.get("sim_log_secs", 60))
+        self._sim_log_fh = None   # opened at _epoch_init, closed at _epoch_reset
+        self._sim_log_cctx = None # zstandard compressor context (or None)
+        self._sim_log_deadline: float = 0.0  # monotonic time after which we stop writing
+        self._sim_plan_list: list = []  # full plan kept for simulator cursor
+        self._sim_plan_pos: int = 0     # next plan slot to seed into _active_futures
+
+        self.logger.info(
+            f"{utcnow()} ParquetReaderS3dlio init thread={thread_index} "
+            f"epoch={epoch_number} workers={self._num_workers} "
+            f"footer_cap={self._footer_cap}"
+        )
+
+    # ── URI helpers ──────────────────────────────────────────────────────────
+
+    def _uri_for_filename(self, filename: str) -> str:
+        cached = self._uri_cache.get(filename)
+        if cached is not None:
+            return cached
+        if "://" in filename:
+            uri = filename  # already carries a scheme — used verbatim
+        else:
+            storage_type = getattr(self._args, "storage_type", StorageType.LOCAL_FS)
+            if storage_type in (StorageType.S3, StorageType.AISTORE):
+                bucket = self._args.storage_root.rstrip("/")  # storage_root is used as the bucket part of the URI
+                uri = f"s3://{bucket}/{filename.lstrip('/')}"
+            else:
+                uri = f"file://{os.path.abspath(filename)}"  # relative paths resolve against the current working dir
+        self._uri_cache[filename] = uri  # memoise so each distinct filename is translated once
+        return uri
+
+    def _all_uris(self) -> list:
+        return [self._uri_for_filename(f) for f in (self._file_list or [])]  # [] when _file_list is None or empty
+
+    # ── Pipeline ───────────────────────────────────────────────────────
+
+    def _pipeline_submit_next(self) -> None:
+        """
+        Pull the next (key, uri, offset, length) from the plan and submit it.
+
+        Called from _epoch_init (W times to seed) and from read_index / iter_epoch
+        each time one is consumed. This consumer-driven approach guarantees that
+        _active_futures never holds more than window_size entries simultaneously
+        — so memory is bounded at window_size × coalesced-group size regardless
+        of how many total RGs the plan contains.
+
+        No done_callback: completed futures sit in _active_futures only until
+        the consumer pops them. Memory is freed immediately after .result().
+        """
+        if self._simulate:
+            # No executor in simulate mode — just advance the plan cursor and
+            # store a (uri, offset, length) sentinel so read_index can log what
+            # WOULD have been fetched.
+            with self._plan_lock:
+                if self._sim_plan_pos < len(self._sim_plan_list):
+                    key, uri, offset, length = self._sim_plan_list[self._sim_plan_pos]
+                    self._active_futures[key] = (uri, offset, length)
+                    self._sim_plan_pos += 1
+            return
+
+        import s3dlio
+        with self._plan_lock:
+            item = next(self._plan_iter, None)
+            if item is None:
+                return  # plan exhausted — nothing left to submit
+            key, uri, offset, length = item
+            fut = self._executor.submit(s3dlio.get_range, uri, offset, length)  # submitted while still holding _plan_lock
+            self._active_futures[key] = fut
+
+    # ── Epoch lifecycle ──────────────────────────────────────────────────────
+
+    def _epoch_init(self) -> None:
+        """
+        Lazy init on first read_index of each epoch.
+
+        1. Fetch all footers (no-op after epoch 1 — DashMap cache hit).
+        2. Build bisect tables once (sample→RG mapping is immutable).
+        3. Build ordered access plan: all RGs in the order DLIO will consume them.
+        4. Create ThreadPoolExecutor and seed pipeline with window_size initial GETs.
+
+        After this, read_index is just: bisect + set check + pop + fut.result().
+        read_index drives the pipeline: each consumed RG submits the next one.
+        Memory stays bounded at window_size × coalesced-group size.
+        """
+        import s3dlio  # NOTE(review): the name is not referenced again in this method
+
+        all_uris = self._all_uris()
+        if not all_uris:
+            self.logger.warning(f"{utcnow()} ParquetReaderS3dlio: file_list is empty")
+            self._epoch_inited = True
+            return
+
+        # Footer indexing (no network I/O on epoch 2+)
+        self._index.ensure_indexed(
+            all_uris, epoch=self._epoch, batch_size=self._footer_batch
+        )
+
+        # Build bisect tables on first epoch only (mapping is immutable).
+        if not self._rg_sample_boundaries:
+            samples_per_file = getattr(self._args, "num_samples_per_file", 1_000_000)
+            rg_size_hint = 8 * 1024 * 1024
+            total_rgs = 0
+
+            for uri in all_uris:
+                num_rgs = self._index.file_rg_count(uri) or 0
+                if num_rgs == 0:
+                    self._rg_sample_boundaries[uri] = [0]
+                    self._rg_ranges_cache[uri] = [(0, 0)]  # placeholder: one zero-length range
+                    continue
+
+                ranges = [self._index.rg_range(uri, rg_i) for rg_i in range(num_rgs)]
+                if rg_size_hint == 8 * 1024 * 1024 and ranges[0][1] > 0:  # still the default → adopt the first RG length seen
+                    rg_size_hint = ranges[0][1]
+
+                rows_per_rg_est = samples_per_file // num_rgs
+                probe_rg, _, _ = self._index.rg_lookup(uri, rows_per_rg_est)
+                if probe_rg == 1:  # probe landed in RG 1 → RGs are treated as uniformly sized
+                    boundaries = [rg_i * rows_per_rg_est for rg_i in range(num_rgs)]
+                else:
+                    boundaries = [0] * num_rgs  # otherwise binary-search each boundary via rg_lookup
+                    for rg_i in range(1, num_rgs):
+                        lo = (rg_i - 1) * rows_per_rg_est
+                        hi = min(samples_per_file - 1, (rg_i + 1) * rows_per_rg_est)
+                        while lo < hi:
+                            mid = (lo + hi) // 2
+                            got, _, _ = self._index.rg_lookup(uri, mid)
+                            if got < rg_i:
+                                lo = mid + 1
+                            else:
+                                hi = mid
+                        boundaries[rg_i] = lo
+
+                self._rg_sample_boundaries[uri] = boundaries
+                self._rg_ranges_cache[uri] = ranges
+                total_rgs += num_rgs
+
+            self._rg_size_hint = rg_size_hint
+            user_cap = int(self._opts.get("max_outstanding", 1024))
+            self._max_outstanding = _compute_max_outstanding(rg_size_hint, user_cap=user_cap)  # overwritten below using the coalesced size
+            self.logger.info(
+                f"{utcnow()} ParquetReaderS3dlio: built bisect tables: "
+                f"{len(all_uris)} files, {total_rgs} RGs, "
+                f"rg_size~{rg_size_hint / 1024**2:.1f} MiB"
+            )
+
+        # Build access plan: coalesced groups in FILE-MAJOR order.
+        #
+        # DLIO's global_index_map is file-sequential:
+        #   samples 0..999999 → file0, 1000000..1999999 → file1, etc.
+        # So the access order is: all groups of file0, then all of file1, etc.
+        # The pipeline must prefetch file0/group0..15 before moving to file1.
+        #
+        # FILE-MAJOR order: for each file, all coalesced groups 0..max_groups-1.
+        # With window=32 and 16 groups/file, 2 files are always fully prefetched.
+        #
+        # Each entry covers `coalesce_rgs` consecutive RGs as ONE byte-range
+        # GET (offset of first RG .. end of last RG in group). Fewer, larger
+        # requests saturate bandwidth more efficiently than many small ones.
+        coalesce = self._coalesce_rgs
+        coalesced_size = self._rg_size_hint * coalesce
+        user_cap = int(self._opts.get("max_outstanding", 1024))
+        self._max_outstanding = _compute_max_outstanding(coalesced_size, user_cap=user_cap)
+
+        plan = []
+        for uri in all_uris:
+            ranges = self._rg_ranges_cache[uri]
+            num_rgs = len(ranges)
+            rg_start = 0
+            while rg_start < num_rgs:
+                rg_end = min(rg_start + coalesce, num_rgs)  # exclusive; the last group of a file may be short
+                offset = ranges[rg_start][0]
+                end_offset = ranges[rg_end - 1][0] + ranges[rg_end - 1][1]
+                group_key = (uri, rg_start)
+                plan.append((group_key, uri, offset, end_offset - offset))
+                rg_start += coalesce
+
+        self._plan_list = plan
+        self._plan_iter = iter(plan)
+
+        window_size = min(self._num_workers, self._max_outstanding)
+
+        # ── Simulate-IO mode ────────────────────────────────────────────────
+        if self._simulate:
+            # Write into the Hydra run's output dir (HydraConfig runtime.output_dir)
+            # when Hydra is initialised; otherwise fall back to the current
+            # working directory.  Any failure to obtain the Hydra config
+            # (import error, config not set, ...) takes the fallback path.
+            try:
+                from hydra.core.hydra_config import HydraConfig
+                _sim_dir = HydraConfig.get().runtime.output_dir
+            except Exception:
+                _sim_dir = os.getcwd()
+            # Write plan to TSV so we can inspect prefetch order.
+            plan_path = os.path.join(_sim_dir, f"sim_plan_epoch{self._epoch}.tsv")
+            with open(plan_path, "w") as pf:
+                pf.write("plan_idx\turi_base\tgroup_start\toffset\tlength\n")
+                for i, (key, pu, poff, plen) in enumerate(plan):
+                    pf.write(f"{i}\t{os.path.basename(pu)}\t{key[1]}\t{poff}\t{plen}\n")
+
+            # Open per-epoch access log — compressed with zstd.
+            log_path = os.path.join(_sim_dir, f"sim_io_epoch{self._epoch}.tsv.zst")
+            import zstandard as _zstd
+            self._sim_log_cctx = _zstd.ZstdCompressor(level=1)  # level=1: fast
+            _raw_fh = open(log_path, "wb")
+            self._sim_log_fh = self._sim_log_cctx.stream_writer(_raw_fh, closefd=True)
+            header = (
+                "ts_ns\tepoch\tstep\timage_idx\tfile\tsample_idx"
+                "\trg_idx\tgroup_start\tdecision\toffset\tlength\n"
+            )
+            self._sim_log_fh.write(header.encode())
+
+            # Set deadline: stop writing after sim_log_secs seconds.
+            self._sim_log_deadline = time.monotonic() + self._sim_log_secs
+
+            # Seed pipeline: store sentinels (no real executor).
+            self._sim_plan_list = plan
+            self._sim_plan_pos = 0
+            for _ in range(min(window_size, len(plan))):
+                self._pipeline_submit_next()
+
+            print(
+                f"[simulate_io] epoch={self._epoch} files={len(all_uris)} "
+                f"plan={len(plan)} window={window_size} "
+                f"coalesce={coalesce} coalesced={coalesced_size/1024**2:.1f}MiB "
+                f"log_secs={self._sim_log_secs:.0f}\n"
+                f"  plan  → {plan_path}\n"
+                f"  log   → {log_path}",
+                flush=True,
+            )
+            self._epoch_inited = True
+            return
+        # ── End simulate-IO mode ─────────────────────────────────────────────
+
+        self._executor = ThreadPoolExecutor(max_workers=window_size)
+
+        # Seed the pipeline: submit exactly window_size initial GETs.
+        # read_index will submit the next item each time it pops one.
+        for _ in range(min(window_size, len(plan))):
+            self._pipeline_submit_next()
+
+        self.logger.info(
+            f"{utcnow()} ParquetReaderS3dlio: pipeline seeded — "
+            f"{len(plan)} coalesced GETs ({coalesce} RGs each, "
+            f"~{coalesced_size / 1024**2:.0f} MiB/GET), "
+            f"{window_size} in-flight window, "
+            f"~{window_size * coalesced_size / 1024**2:.0f} MiB peak in-flight"
+        )
+        self._epoch_inited = True
+
+    def _epoch_reset(self) -> None:
+        """Tear down per-epoch state; called from finalize() and iter_epoch()."""
+        if self._sim_log_fh is not None:
+            try:
+                self._sim_log_fh.flush()
+                self._sim_log_fh.close()  # also flushes/closes the underlying file
+            except Exception:
+                pass  # best-effort close: errors while closing the log are ignored
+            self._sim_log_fh = None
+            self._sim_log_cctx = None
+        self._sim_log_deadline = 0.0
+        self._sim_plan_list = []
+        self._sim_plan_pos = 0
+        self._plan_list = []
+        if self._executor is not None:
+            self._executor.shutdown(wait=False, cancel_futures=True)  # do not block on running GETs; queued ones are cancelled
+            self._executor = None
+        self._active_futures.clear()
+        self._rg_done.clear()
+        self._plan_iter = iter([])  # empty plan: nothing further can be submitted
+        self._total_bytes_read = 0
+        # Bisect tables are file-static — keep across epochs
+        self._epoch_inited = False
+
+    def iter_epoch(self, file_list, batch_size):
+        """
+        Iterable-dataset path: called once per worker per epoch with the worker's
+        FILE SHARD (already split by TorchIterableDataset.__iter__).
+
+        Installing the shard before _epoch_init() means each worker fetches
+        footers and seeds the pipeline only for ~N/W files instead of all N,
+        eliminating the W× burst that caused OOM in the map-style path.
+
+        Drives the consumer-driven sliding-window pipeline directly from the
+        plan list, yielding one dummy sample per complete batch.
+        """
+        # Install shard — _epoch_init() calls _all_uris() which reads _file_list
+        self._file_list = list(file_list)
+        if self._epoch_inited:
+            self._epoch_reset()  # discard state left over from a previous, unfinished epoch
+        self._epoch_init()
+
+        sample_buf = 0
+        for key, uri, offset, length in self._plan_list:
+            group_start = key[1]
+            coalesce = self._coalesce_rgs
+            ranges = self._rg_ranges_cache[uri]
+            num_rgs_in_file = len(ranges)
+            rg_end = min(group_start + coalesce, num_rgs_in_file)
+
+            # Count samples covered by this coalesced group
+            boundaries = self._rg_sample_boundaries[uri]
+            if rg_end < num_rgs_in_file:
+                group_samples = boundaries[rg_end] - boundaries[group_start]
+            else:
+                samples_per_file = getattr(self._args, 'num_samples_per_file', 0)
+                group_samples = (samples_per_file - boundaries[group_start]) if samples_per_file else 1  # last group: rest of the file, or 1 if the count is unknown
+
+            # Consume pipeline slot: pop future, wait, submit next
+            rg_key = key
+            if self._simulate:
+                self._active_futures.pop(rg_key, None)  # sentinel tuple only — nothing to wait for
+            else:
+                fut = self._active_futures.pop(rg_key, None)
+                if fut is not None:
+                    fut.result()
+                else:
+                    # Safety fallback: pipeline missed — fetch directly
+                    import s3dlio
+                    s3dlio.get_range(uri, offset, length)  # returned payload is discarded
+                self._total_bytes_read += length
+            self._rg_done.add(rg_key)
+            self._pipeline_submit_next()
+
+            sample_buf += group_samples
+            while sample_buf >= batch_size:
+                yield self._args.resized_image
+                sample_buf -= batch_size
+
+        self._epoch_reset()  # any remainder in sample_buf (< batch_size) is dropped, not yielded
+        self._epoch += 1
+
+    # ── FormatReader interface ────────────────────────────────────────────────
+
+    @dlp.log
+    def open(self, filename):
+        return filename  # no handle is opened; the filename is returned unchanged
+
+    @dlp.log
+    def close(self, filename):
+        pass  # nothing to release: open() only returns the filename
+
+    @dlp.log
+    def get_sample(self, filename, sample_index):  # wait for the coalesced group that contains sample_index
+        if not self._epoch_inited:
+            self._epoch_init()
+        uri = self._uri_for_filename(filename)
+        rg_idx = bisect.bisect_right(self._rg_sample_boundaries[uri], sample_index) - 1
+        rg_key = (uri, (rg_idx // self._coalesce_rgs) * self._coalesce_rgs)  # pipeline is keyed by the group's FIRST RG
+        if rg_key not in self._rg_done:
+            self._rg_done.add(rg_key)
+            fut = self._active_futures.pop(rg_key, None)  # pop (not index) so the payload is released
+            if isinstance(fut, Future):  # simulate mode stores plain tuples; a pipeline miss yields None
+                fut.result()
+            self._pipeline_submit_next()  # one slot consumed → submit the next planned GET
+        dlp.update(image_size=self._rg_ranges_cache[uri][rg_idx][1])  # length of the single RG holding the sample
+
+    def next(self):
+        yield from super().next()  # plain delegation to the base-class generator
+
+    @dlp.log
+    def read_index(self, image_idx, step):
+        if not self._epoch_inited:
+            self._epoch_init()
+
+        dlp.update(step=step)
+
+        filename, sample_index = self.global_index_map[image_idx]
+        uri = self._uri_for_filename(filename)
+
+        boundaries = self._rg_sample_boundaries[uri]
+        rg_idx = bisect.bisect_right(boundaries, sample_index) - 1
+        _, length = self._rg_ranges_cache[uri][rg_idx]  # length of the single RG; reported below as image_size
+
+        # Map rg_idx → coalesced group key: (uri, first_rg_in_group)
+        coalesce = self._coalesce_rgs
+        group_start = (rg_idx // coalesce) * coalesce
+        rg_key = (uri, group_start)
+
+        # ── Simulate-IO mode ────────────────────────────────────────────────
+        if self._simulate:
+            if rg_key in self._rg_done:
+                decision = "DONE"
+                log_offset, log_length = 0, 0
+            elif rg_key in self._active_futures:
+                sentinel = self._active_futures.pop(rg_key)  # (uri, offset, length) tuple stored by _pipeline_submit_next
+                self._rg_done.add(rg_key)
+                self._pipeline_submit_next()
+                decision = "HIT"
+                log_offset, log_length = sentinel[1], sentinel[2]
+            else:
+                # FALLBACK — pipeline miss.
+                ranges = self._rg_ranges_cache[uri]
+                rg_end = min(group_start + coalesce, len(ranges))
+                log_offset = ranges[group_start][0]
+                log_length = ranges[rg_end - 1][0] + ranges[rg_end - 1][1] - log_offset
+                self._rg_done.add(rg_key)
+                self._pipeline_submit_next()
+                decision = "FALLBACK"
+            if self._sim_log_fh is not None:
+                if time.monotonic() > self._sim_log_deadline:
+                    try:
+                        self._sim_log_fh.flush()
+                        self._sim_log_fh.close()
+                    except Exception:
+                        pass  # best-effort close: errors are ignored
+                    self._sim_log_fh = None
+                    print(f"[simulate_io] log closed after {self._sim_log_secs:.0f}s", flush=True)
+                else:
+                    line = (
+                        f"{time.monotonic_ns()}\t{self._epoch}\t{step}\t{image_idx}"
+                        f"\t{os.path.basename(filename)}\t{sample_index}"
+                        f"\t{rg_idx}\t{group_start}\t{decision}\t{log_offset}\t{log_length}\n"
+                    )
+                    self._sim_log_fh.write(line.encode())
+            dlp.update(image_size=length)
+            return self._args.resized_image
+        # ── End simulate-IO mode ─────────────────────────────────────────────
+
+        # Fast path: group already consumed this epoch — just return.
+        if rg_key in self._rg_done:
+            dlp.update(image_size=length)
+            return self._args.resized_image
+
+        self._rg_done.add(rg_key)
+        # Measure actual bytes in this coalesced GET (always, regardless of pipeline hit/miss).
+        ranges = self._rg_ranges_cache[uri]
+        rg_end = min(group_start + coalesce, len(ranges))
+        coalesced_bytes = ranges[rg_end - 1][0] + ranges[rg_end - 1][1] - ranges[group_start][0]
+        self._total_bytes_read += coalesced_bytes
+        # Pop (not get): releases the coalesced payload bytes immediately.
+        # Memory stays bounded at window_size × coalesced_size.
+        fut = self._active_futures.pop(rg_key, None)
+        if fut is not None:
+            fut.result()
+        else:
+            # Safety fallback: shouldn't happen — pipeline is always ahead.
+            import s3dlio
+            offset = ranges[group_start][0]
+            end_offset = ranges[rg_end - 1][0] + ranges[rg_end - 1][1]
+            s3dlio.get_range(uri, offset, end_offset - offset)  # synchronous fetch; the payload is discarded
+        # Advance the pipeline: consumed one slot, submit the next.
+        self._pipeline_submit_next()
+
+        dlp.update(image_size=length)
+        return self._args.resized_image
+
+    @dlp.log
+    def finalize(self):
+        # Report measured bytes/sample back to DLIO so statscounter computes
+        # real I/O MiB/s instead of the synthetic record_length estimate.
+        # Must happen before _epoch_reset() clears _total_bytes_read.
+        total_samples = self._args.num_samples_per_file * self._args.num_files_train  # NOTE(review): dataset-wide count vs. this reader's own byte tally — confirm
+        if total_samples > 0 and self._total_bytes_read > 0:
+            self._args.record_length = self._total_bytes_read // total_samples  # integer bytes/sample, written back to the shared args
+            self.logger.debug(
+                f"{utcnow()} ParquetReaderS3dlio epoch {self._epoch}: "
+                f"measured {self._total_bytes_read / 1024**3:.3f} GiB read, "
+                f"{self._args.record_length} bytes/sample"
+            )
+        self._epoch_reset()
+        self._epoch += 1
+        # Retain _index and _uri_cache across epochs — footers never change
+        self.open_file_map.clear()
+        return super().finalize()
+
+    def is_index_based(self):
+        return True  # advertises index-based access (read_index)
+
+    def is_iterator_based(self):
+        return True  # advertises iterator-based access (next / iter_epoch)
diff --git a/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py b/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py
new file mode 100644
index 00000000..38df7d95
--- /dev/null
+++ b/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py
@@ -0,0 +1,389 @@
+"""Parquet reader using s3dlio.parquet_get_rg for Rust Arrow IPC decode.
+
+Access pattern
+--------------
+Identical to ParquetReaderS3dlio (index-based, sliding-window prefetch) except
+that each row-group fetch calls s3dlio.parquet_get_rg(uri, rg_idx, decode="arrow")
+instead of s3dlio.get_range(uri, offset, length).
+
+The fetch + Parquet→Arrow decode happen inside the Rust async task on the Tokio
+thread pool — completely off the Python GIL.  Python receives Arrow IPC bytes
+(``BytesView``) that can be handed to PyArrow without a further copy (``_decode_ipc`` currently copies via ``bytes()``).
+
+Concurrency model — same as ParquetReaderS3dlio
+------------------------------------------------
+  _active_futures holds at most window_size entries (per-RG, NOT coalesced).
+  Each entry holds one decoded Arrow IPC buffer until popped.
+  Peak RAM = window_size × decoded_rg_size (decoded > raw; typically 1.5-2×).
+
+  Consumer-driven pipeline: read_index pops a future (frees the buffer) and
+  submits the next one.  Memory is strictly bounded.
+
+decode_output config key
+------------------------
+Controls what Python does with the decoded IPC bytes:
+
+  "none" (default for benchmarking) — discard bytes, record byte_count only.
+  "pyarrow"                         — return pa.RecordBatch via pa.ipc.open_stream.
+  "torch"                           — return dict of torch tensors (requires torch).
+
+  For pure storage benchmarking use "none".
+  For real training use "pyarrow" or "torch".
+
+Selection
+---------
+CLI flags (passed to mlpstorage training run --params):
+
+    storage.storage_options.storage_library=s3dlio
+    storage.storage_options.decode=arrow
+
+Both flags are required. Omitting ``decode=arrow`` routes to
+ParquetReaderS3dlio (raw bytes).
+"""
+import bisect
+import os
+import threading
+from concurrent.futures import ThreadPoolExecutor, Future
+
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+from dlio_benchmark.common.enumerations import StorageType
+from dlio_benchmark.reader.reader_handler import FormatReader
+from dlio_benchmark.utils.utility import Profile, utcnow
+
+dlp = Profile(MODULE_DATA_READER)
+
+
+def _free_ram_bytes() -> int:  # available physical RAM in bytes; 8 GiB when it cannot be queried
+    try:
+        pages = os.sysconf("SC_AVAIL_PHYS_PAGES")
+        page_sz = os.sysconf("SC_PAGE_SIZE")
+        if pages > 0 and page_sz > 0:  # sysconf may report -1 for an undefined limit
+            return pages * page_sz
+    except (AttributeError, ValueError, OSError):  # AttributeError: os.sysconf is Unix-only
+        pass
+    return 8 * 1024 ** 3
+
+
+def _compute_max_outstanding(rg_size: int, user_cap: int = 512) -> int:
+    if rg_size < 1024 * 1024:
+        return user_cap
+    free = _free_ram_bytes()
+    # Arrow IPC decoded buffers are ~1.5-2× raw size; be more conservative.
+    budget = max(4, int(0.35 * free / rg_size))
+    return min(user_cap, budget)
+
+
+class ParquetReaderS3dlioArrow(FormatReader):
+    """
+    Parquet reader using s3dlio.parquet_get_rg with Rust Arrow IPC decode.
+
+    Uses the same PyParquetIndex + sliding-window prefetch architecture as
+    ParquetReaderS3dlio, but each row-group GET is decoded to Arrow IPC format
+    inside the Rust async task on the Tokio thread pool — completely off the
+    Python GIL.
+
+    configure decode_output to control what Python does with the result:
+      "none"    (default) — discard, record byte_count only (pure benchmark)
+      "pyarrow"           — return pa.RecordBatch (real training)
+      "torch"             — return dict of torch tensors (real training)
+    """
+
+    @dlp.log_init
+    def __init__(self, dataset_type, thread_index, epoch_number):
+        super().__init__(dataset_type, thread_index)
+
+        import s3dlio
+        # Partition Tokio worker threads across MPI ranks so NP processes don't
+        # all claim every core.  Auto-detects OMPI_COMM_WORLD_SIZE / PMI_SIZE /
+        # WORLD_SIZE; must be called before the first parquet_get_rg call.
+        s3dlio.configure_tokio_threads()
+
+        opts = getattr(self._args, "storage_options", {}) or {}
+        self._opts = opts
+        self._epoch = epoch_number
+
+        col_opt = opts.get("columns")
+        self._columns = list(col_opt) if col_opt is not None else None
+        self._footer_cap = int(opts.get("footer_cap", 4 * 1024 * 1024))
+        self._footer_batch = int(opts.get("footer_batch_size", 16))
+        self._num_workers = int(opts.get("prefetch_workers", opts.get("prefetch", 32)))
+
+        # "none"    → discard decoded bytes (pure I/O+decode benchmark)
+        # "pyarrow" → return pa.RecordBatch
+        # "torch"   → return dict of torch tensors
+        self._decode_output: str = opts.get("decode_output", "none").lower()
+
+        ep = opts.get("endpoint_url")
+        if ep and not os.environ.get("AWS_ENDPOINT_URL_S3"):
+            os.environ["AWS_ENDPOINT_URL_S3"] = ep
+
+        # Persistent Rust index — retained across epochs
+        self._index = s3dlio.PyParquetIndex(
+            col_indices=self._columns,
+            footer_cap=self._footer_cap,
+        )
+
+        # URI cache: DLIO filename -> full URI
+        self._uri_cache: dict[str, str] = {}
+
+        # Per-epoch state (reset by _epoch_reset)
+        self._executor: ThreadPoolExecutor | None = None
+        self._active_futures: dict[tuple, Future] = {}   # (uri, rg_idx) → Future
+        self._rg_done: set = set()
+        self._max_outstanding: int = self._num_workers
+        self._rg_size_hint: int = 8 * 1024 * 1024
+        self._total_bytes_read: int = 0
+        self._plan_iter = iter([])
+        self._plan_lock = threading.Lock()
+        self._epoch_inited = False
+
+        # Bisect tables — built once per lifetime (immutable after epoch 1)
+        self._rg_sample_boundaries: dict[str, list] = {}
+        self._rg_counts: dict[str, int] = {}
+
+        self.logger.info(
+            f"{utcnow()} ParquetReaderS3dlioArrow init thread={thread_index} "
+            f"epoch={epoch_number} workers={self._num_workers} "
+            f"decode_output={self._decode_output}"
+        )
+
+    # ── URI helpers ───────────────────────────────────────────────────────────
+
+    def _uri_for_filename(self, filename: str) -> str:
+        cached = self._uri_cache.get(filename)
+        if cached is not None:
+            return cached
+        if "://" in filename:
+            uri = filename
+        else:
+            storage_type = getattr(self._args, "storage_type", StorageType.LOCAL_FS)
+            if storage_type in (StorageType.S3, StorageType.AISTORE):
+                bucket = self._args.storage_root.rstrip("/")
+                uri = f"s3://{bucket}/{filename.lstrip('/')}"
+            else:
+                uri = f"file://{os.path.abspath(filename)}"
+        self._uri_cache[filename] = uri
+        return uri
+
+    def _all_uris(self) -> list:
+        return [self._uri_for_filename(f) for f in (self._file_list or [])]
+
+    # ── Pipeline ──────────────────────────────────────────────────────────────
+
+    def _pipeline_submit_next(self) -> None:
+        import s3dlio
+        with self._plan_lock:
+            item = next(self._plan_iter, None)
+            if item is None:
+                return
+            uri, rg_idx = item
+            col = self._columns
+            footer_cap = self._footer_cap
+            fut = self._executor.submit(
+                s3dlio.parquet_get_rg, uri, rg_idx, col, footer_cap, "arrow"
+            )
+            self._active_futures[(uri, rg_idx)] = fut
+
+    # ── Epoch lifecycle ───────────────────────────────────────────────────────
+
+    def _epoch_init(self) -> None:
+        """Index footers, build per-file RG boundary tables (first call only), and seed the prefetch pipeline."""
+
+        all_uris = self._all_uris()
+        if not all_uris:
+            self.logger.warning(f"{utcnow()} ParquetReaderS3dlioArrow: file_list is empty")
+            self._epoch_inited = True
+            return
+
+        # Footer indexing (no network I/O on epoch 2+)
+        self._index.ensure_indexed(
+            all_uris, epoch=self._epoch, batch_size=self._footer_batch
+        )
+
+        # Build bisect tables once (immutable across epochs)
+        if not self._rg_sample_boundaries:
+            samples_per_file = getattr(self._args, "num_samples_per_file", 1_000_000)
+            total_rgs = 0
+
+            for uri in all_uris:
+                num_rgs = self._index.file_rg_count(uri) or 0
+                self._rg_counts[uri] = num_rgs
+                if num_rgs == 0:
+                    self._rg_sample_boundaries[uri] = [0]
+                    continue
+
+                rows_per_rg_est = samples_per_file // num_rgs
+                probe_rg, _, _ = self._index.rg_lookup(uri, rows_per_rg_est)
+                if probe_rg == 1:
+                    boundaries = [rg_i * rows_per_rg_est for rg_i in range(num_rgs)]
+                else:
+                    boundaries = [0] * num_rgs
+                    for rg_i in range(1, num_rgs):
+                        lo = (rg_i - 1) * rows_per_rg_est
+                        hi = min(samples_per_file - 1, (rg_i + 1) * rows_per_rg_est)
+                        while lo < hi:
+                            mid = (lo + hi) // 2
+                            got, _, _ = self._index.rg_lookup(uri, mid)
+                            if got < rg_i:
+                                lo = mid + 1
+                            else:
+                                hi = mid
+                        boundaries[rg_i] = lo
+                self._rg_sample_boundaries[uri] = boundaries
+                total_rgs += num_rgs
+
+            # Estimate decoded RG size (~1.6× raw) for memory bound calculation
+            if all_uris and self._rg_counts.get(all_uris[0], 0) > 0:
+                _, raw_len = self._index.rg_range(all_uris[0], 0)
+                self._rg_size_hint = int(raw_len * 1.6)
+
+            user_cap = int(self._opts.get("max_outstanding", 512))
+            self._max_outstanding = _compute_max_outstanding(
+                self._rg_size_hint, user_cap=user_cap
+            )
+            self.logger.info(
+                f"{utcnow()} ParquetReaderS3dlioArrow: built bisect tables: "
+                f"{len(all_uris)} files, {total_rgs} RGs, "
+                f"decoded_rg_size~{self._rg_size_hint / 1024**2:.1f} MiB, "
+                f"window={max(1, min(self._num_workers, self._max_outstanding))}"
+            )
+
+        # Access plan: per-RG, file-major order (no coalescing — each RG decoded
+        # independently in Rust, so coalescing does not help here)
+        plan = []
+        for uri in all_uris:
+            num_rgs = self._rg_counts.get(uri, 0)
+            for rg_idx in range(num_rgs):
+                plan.append((uri, rg_idx))
+
+        self._plan_iter = iter(plan)
+        window_size = max(1, min(self._num_workers, self._max_outstanding))  # ThreadPoolExecutor rejects max_workers <= 0
+
+        self._executor = ThreadPoolExecutor(max_workers=window_size)
+        for _ in range(min(window_size, len(plan))):
+            self._pipeline_submit_next()
+
+        self.logger.info(
+            f"{utcnow()} ParquetReaderS3dlioArrow: pipeline seeded — "
+            f"{len(plan)} RG fetches, {window_size} in-flight"
+        )
+        self._epoch_inited = True
+
+    def _epoch_reset(self) -> None:
+        if self._executor is not None:
+            self._executor.shutdown(wait=False, cancel_futures=True)
+            self._executor = None
+        self._active_futures.clear()
+        self._rg_done.clear()
+        self._plan_iter = iter([])
+        self._total_bytes_read = 0
+        self._epoch_inited = False
+
+    # ── Decode helper ──────────────────────────────────────────────────────────
+
+    def _decode_ipc(self, bv) -> object:
+        """Convert Arrow IPC BytesView to the configured output format ("none", "pyarrow" or "torch")."""
+        if self._decode_output == "none":
+            return self._args.resized_image
+        raw = bytes(bv)  # copy the IPC payload into a bytes object for pa.py_buffer
+        if self._decode_output == "pyarrow":
+            import pyarrow as pa
+            return pa.ipc.open_stream(pa.py_buffer(raw)).read_next_batch()  # first batch of the stream only
+        if self._decode_output == "torch":
+            import pyarrow as pa
+            import torch
+            batch = pa.ipc.open_stream(pa.py_buffer(raw)).read_next_batch()
+            return {  # pa.Array has no to_pydict(); torch.from_numpy needs an ndarray
+                col: torch.from_numpy(batch.column(col).to_numpy(zero_copy_only=False))
+                for col in batch.schema.names
+            }
+        return self._args.resized_image  # unrecognised decode_output behaves like "none"
+
+    # ── FormatReader interface ────────────────────────────────────────────────
+
+    @dlp.log
+    def open(self, filename):
+        return filename
+
+    @dlp.log
+    def close(self, filename):
+        pass
+
+    @dlp.log
+    def get_sample(self, filename, sample_index):
+        if not self._epoch_inited:
+            self._epoch_init()
+        uri = self._uri_for_filename(filename)
+        boundaries = self._rg_sample_boundaries.get(uri, [0])
+        rg_idx = bisect.bisect_right(boundaries, sample_index) - 1
+        rg_key = (uri, rg_idx)
+
+        if rg_key not in self._rg_done:
+            self._rg_done.add(rg_key)
+            fut = self._active_futures.pop(rg_key, None)
+            if fut is not None:
+                bv = fut.result()
+            else:
+                import s3dlio
+                bv = s3dlio.parquet_get_rg(uri, rg_idx, self._columns, self._footer_cap, "arrow")
+            self._total_bytes_read += len(bv)
+            self._pipeline_submit_next()
+            return self._decode_ipc(bv)
+        return self._args.resized_image
+
+    def next(self):
+        for batch in super().next():
+            yield batch
+
+    @dlp.log
+    def read_index(self, image_idx, step):
+        if not self._epoch_inited:
+            self._epoch_init()
+
+        dlp.update(step=step)
+        filename, sample_index = self.global_index_map[image_idx]
+        uri = self._uri_for_filename(filename)
+
+        boundaries = self._rg_sample_boundaries.get(uri, [0])
+        rg_idx = bisect.bisect_right(boundaries, sample_index) - 1
+        rg_key = (uri, rg_idx)
+
+        if rg_key in self._rg_done:
+            dlp.update(image_size=self._rg_size_hint)
+            return self._args.resized_image
+
+        self._rg_done.add(rg_key)
+        fut = self._active_futures.pop(rg_key, None)
+        if fut is not None:
+            bv = fut.result()
+        else:
+            # Safety fallback: pipeline miss (should not happen in sequential access)
+            import s3dlio
+            bv = s3dlio.parquet_get_rg(uri, rg_idx, self._columns, self._footer_cap, "arrow")
+
+        self._total_bytes_read += len(bv)
+        dlp.update(image_size=len(bv))
+        self._pipeline_submit_next()
+        return self._decode_ipc(bv)
+
+    @dlp.log
+    def finalize(self):
+        total_samples = self._args.num_samples_per_file * self._args.num_files_train
+        if total_samples > 0 and self._total_bytes_read > 0:
+            self._args.record_length = self._total_bytes_read // total_samples
+            self.logger.debug(
+                f"{utcnow()} ParquetReaderS3dlioArrow epoch {self._epoch}: "
+                f"measured {self._total_bytes_read / 1024**3:.3f} GiB read+decoded, "
+                f"{self._args.record_length} bytes/sample"
+            )
+        self._epoch_reset()
+        self._epoch += 1
+        self.open_file_map.clear()
+        return super().finalize()
+
+    def is_index_based(self):
+        return True
+
+    def is_iterator_based(self):
+        return True
+
diff --git a/dlio_benchmark/reader/reader_factory.py b/dlio_benchmark/reader/reader_factory.py
index 536bbe09..2484ff8d 100644
--- a/dlio_benchmark/reader/reader_factory.py
+++ b/dlio_benchmark/reader/reader_factory.py
@@ -70,55 +70,28 @@ def get_reader(type, dataset_type, thread_index, epoch_number):
             elif _args.data_loader == DataLoaderType.NATIVE_DALI:
                 from dlio_benchmark.reader.dali_image_reader import DaliImageReader
                 return DaliImageReader(dataset_type, thread_index, epoch_number)
-            # Use S3 readers for both S3 and AIStore
-            elif _args.storage_type in (StorageType.S3, StorageType.AISTORE):
-                storage_library = (getattr(_args, "storage_options", {}) or {}).get("storage_library")
-                if storage_library in ("s3dlio", "s3torchconnector", "minio"):
-                    from dlio_benchmark.reader.image_reader_s3_iterable import ImageReaderS3Iterable
-                    return ImageReaderS3Iterable(dataset_type, thread_index, epoch_number)
-                # Fallthrough: unrecognized library — let ImageReader try (will fail with a clear PIL error)
-                from dlio_benchmark.reader.image_reader import ImageReader
-                return ImageReader(dataset_type, thread_index, epoch_number)
             else:
-                from dlio_benchmark.reader.image_reader import ImageReader
-                return ImageReader(dataset_type, thread_index, epoch_number)   
+                from dlio_benchmark.reader import create_image_reader
+                return create_image_reader(dataset_type, thread_index, epoch_number)
         elif type == FormatType.NPY:
             if _args.data_loader == DataLoaderType.NATIVE_DALI:
                 from dlio_benchmark.reader.dali_npy_reader import DaliNPYReader
                 return DaliNPYReader(dataset_type, thread_index, epoch_number)
+            elif _args.odirect == True:
+                from dlio_benchmark.reader.npy_reader_odirect import NPYReaderODirect
+                return NPYReaderODirect(dataset_type, thread_index, epoch_number)
             else:
-                if _args.odirect == True:
-                    from dlio_benchmark.reader.npy_reader_odirect import NPYReaderODirect
-                    return NPYReaderODirect(dataset_type, thread_index, epoch_number)
-                # Use S3 readers for both S3 and AIStore
-                elif _args.storage_type in (StorageType.S3, StorageType.AISTORE):
-                    storage_library = (getattr(_args, "storage_options", {}) or {}).get("storage_library")
-                    if storage_library in ("s3dlio", "s3torchconnector", "minio"):
-                        from dlio_benchmark.reader.npy_reader_s3_iterable import NPYReaderS3Iterable
-                        return NPYReaderS3Iterable(dataset_type, thread_index, epoch_number)
-                    from dlio_benchmark.reader.npy_reader_s3 import NPYReaderS3
-                    return NPYReaderS3(dataset_type, thread_index, epoch_number)
-                else:
-                    from dlio_benchmark.reader.npy_reader import NPYReader
-                    return NPYReader(dataset_type, thread_index, epoch_number)                           
+                from dlio_benchmark.reader import create_npy_reader
+                return create_npy_reader(dataset_type, thread_index, epoch_number)
         elif type == FormatType.NPZ:
             if _args.data_loader == DataLoaderType.NATIVE_DALI:
                 raise Exception("Loading data of %s format is not supported without framework data loader; please use npy format instead." %type)
+            elif _args.odirect == True:
+                from dlio_benchmark.reader.npz_reader_odirect import NPZReaderODIRECT
+                return NPZReaderODIRECT(dataset_type, thread_index, epoch_number)
             else:
-                if _args.odirect == True:
-                    from dlio_benchmark.reader.npz_reader_odirect import NPZReaderODIRECT
-                    return NPZReaderODIRECT(dataset_type, thread_index, epoch_number)         
-                # Use S3 readers for both S3 and AIStore
-                elif _args.storage_type in (StorageType.S3, StorageType.AISTORE):
-                    storage_library = (getattr(_args, "storage_options", {}) or {}).get("storage_library")
-                    if storage_library in ("s3dlio", "s3torchconnector", "minio"):
-                        from dlio_benchmark.reader.npz_reader_s3_iterable import NPZReaderS3Iterable
-                        return NPZReaderS3Iterable(dataset_type, thread_index, epoch_number)
-                    from dlio_benchmark.reader.npz_reader_s3 import NPZReaderS3
-                    return NPZReaderS3(dataset_type, thread_index, epoch_number)
-                else:
-                    from dlio_benchmark.reader.npz_reader import NPZReader
-                    return NPZReader(dataset_type, thread_index, epoch_number)
+                from dlio_benchmark.reader import create_npz_reader
+                return create_npz_reader(dataset_type, thread_index, epoch_number)
         elif type == FormatType.TFRECORD:
             if _args.odirect == True:
                 raise Exception("O_DIRECT for %s format is not yet supported." %type)
@@ -154,12 +127,38 @@ def get_reader(type, dataset_type, thread_index, epoch_number):
         elif type == FormatType.PARQUET:
             if _args.odirect == True:
                 raise Exception("O_DIRECT for %s format is not yet supported." %type)
+            # s3dlio streaming loader: unified path for all URI schemes
+            # (s3://, file://, direct://, az://, gs://).
+            # Opt-in via storage_options.storage_library: s3dlio.
+            # Two decode modes (set storage_options.decode):
+            #   raw   (default) — no decode; pure I/O measurement
+            #   arrow           — Rust Arrow IPC decode via s3dlio.parquet_get_rg
+            storage_lib = (getattr(_args, "storage_options", {}) or {}).get("storage_library")
+            decode = (getattr(_args, "storage_options", {}) or {}).get("decode", "raw")
+            if storage_lib == "s3dlio":
+                if decode == "arrow":
+                    from dlio_benchmark.reader.parquet_reader_s3dlio_arrow import ParquetReaderS3dlioArrow
+                    return ParquetReaderS3dlioArrow(dataset_type, thread_index, epoch_number)
+                else:
+                    from dlio_benchmark.reader.parquet_reader_s3dlio import ParquetReaderS3dlio
+                    return ParquetReaderS3dlio(dataset_type, thread_index, epoch_number)
             elif _args.storage_type in (StorageType.S3, StorageType.AISTORE):
                 from dlio_benchmark.reader.parquet_reader_s3_iterable import ParquetReaderS3Iterable
                 return ParquetReaderS3Iterable(dataset_type, thread_index, epoch_number)
+            elif _args.storage_type in (StorageType.LOCAL_FS,):
+                # If storage_library=direct, reuse ParquetReaderS3Iterable with direct:// URIs
+                # (s3dlio O_DIRECT reads — bypasses page cache, true parity with S3 path).
+                # Fall back to ParquetReaderFileIterable for storage_library=posix (or unset).
+                if storage_lib == "direct":
+                    from dlio_benchmark.reader.parquet_reader_s3_iterable import ParquetReaderS3Iterable
+                    return ParquetReaderS3Iterable(dataset_type, thread_index, epoch_number)
+                else:
+                    from dlio_benchmark.reader.parquet_reader_file_iterable import ParquetReaderFileIterable
+                    return ParquetReaderFileIterable(dataset_type, thread_index, epoch_number)
             else:
                 from dlio_benchmark.reader.parquet_reader import ParquetReader
                 return ParquetReader(dataset_type, thread_index, epoch_number)
 
+
         else:
             raise Exception("Loading data of %s format is not supported without framework data loader" %type)
diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py
index 26111bba..effecbd4 100644
--- a/dlio_benchmark/utils/config.py
+++ b/dlio_benchmark/utils/config.py
@@ -258,6 +258,7 @@ class ConfigArguments:
     parquet_row_group_size: int = 1024
     parquet_partition_by: Optional[str] = None
     parquet_generation_batch_size: int = 0
+    parquet_use_s3dlio_gen: bool = False
 
     ## dataset: hdf5-only
     num_dset_per_record: int = 1
@@ -394,14 +395,16 @@ def validate(self):
             total_workers = self.read_threads * self.comm_size
             # 512 MB per spawned worker is the minimum observed RSS (framework imports only).
             per_worker_mb = 512
-            BUDGET_MB = 32 * 1024  # 32 GB hard cap regardless of machine size
+            # Use actual installed RAM so large machines aren't blocked (#372).
+            # Spawning more workers than can fit in RAM is still an error.
+            BUDGET_MB = psutil.virtual_memory().total // (1024 * 1024)
             estimated_mb = per_worker_mb * total_workers
             if estimated_mb > BUDGET_MB:
                 max_threads = BUDGET_MB // per_worker_mb // max(1, self.comm_size)
                 raise Exception(
                     f"Memory budget exceeded: reader.read_threads={self.read_threads} "
                     f"x comm_size={self.comm_size} = {total_workers} worker processes, "
-                    f"estimated ~{estimated_mb // 1024} GB (hard cap: 32 GB). "
+                    f"estimated ~{estimated_mb // 1024} GB (host RAM: {BUDGET_MB // 1024} GB). "
                     f"Reduce reader.read_threads to at most {max_threads} for this run."
                 )
             # Also warn if estimated usage exceeds 50% of available RAM on this machine
@@ -479,10 +482,9 @@ def validate(self):
                 )
 
             if storage_library == "s3torchconnector":
-                # s3torchconnector only supports NPZ and NPY data formats for training.
-                # For checkpoint-only runs (train=False), data format doesn't apply.
-                if self.do_train and self.format not in (FormatType.NPZ, FormatType.NPY):
-                    raise Exception(f"For S3 using s3torchconnector, only NPZ or NPY formats are supported. Got format {self.format}")
+                # s3torchconnector supports NPZ, NPY, and Parquet data formats.
+                # Parquet is handled by ParquetReaderS3Iterable (byte-range GETs via
+                # S3ReaderConstructor.range_based()). No format restriction needed.
                 # Validate that s3torchconnector is installed
                 try:
                     from s3torchconnector._s3client import S3Client, S3ClientConfig
@@ -838,8 +840,12 @@ def derive_configurations(self, file_list_train=None, file_list_eval=None):
         self.record_element_dtype = np.dtype(self.record_element_type)
         self.record_element_bytes = self.record_element_dtype.itemsize
 
-        # hdf5 specific derivations
-        self.record_length = np.prod(self.record_dims) * self.record_element_bytes
+        # hdf5 / record_dims specific derivations — only override record_length when
+        # record_dims is explicitly set. When record_length_bytes is used instead
+        # (e.g. NPZ/NPY/Image workloads with variable-size objects), np.prod([]) = 1
+        # would silently overwrite the correct user-supplied value with 1 byte.
+        if self.record_dims:
+            self.record_length = np.prod(self.record_dims) * self.record_element_bytes
 
         self.transformed_record_element_dtype = np.dtype(self.transformed_record_element_type)
 
@@ -1393,6 +1399,8 @@ def LoadConfig(args, config):
                 args.parquet_partition_by = str(pq_cfg['partition_by'])
             if 'generation_batch_size' in pq_cfg:
                 args.parquet_generation_batch_size = int(pq_cfg['generation_batch_size'])
+            if 'use_s3dlio_gen' in pq_cfg:
+                args.parquet_use_s3dlio_gen = bool(pq_cfg['use_s3dlio_gen'])
 
         # hdf5 only config
         if 'hdf5' in config['dataset']:
diff --git a/docs/ArrowIPC_vs_Parquet_26-04-18.md b/docs/ArrowIPC_vs_Parquet_26-04-18.md
new file mode 100644
index 00000000..f30726ff
--- /dev/null
+++ b/docs/ArrowIPC_vs_Parquet_26-04-18.md
@@ -0,0 +1,530 @@
+# Arrow IPC vs Parquet — Format Recommendation and Implementation Plan
+
+**Date:** April 18, 2026  
+**Replaces:** `parquet_pipeline_analysis.md`, `PyArrow-Parquet_Analysis_26-04-18.md`  
+**Decision:** **Do not invest in dgen-rs Parquet encoder. Pivot to Arrow IPC.**
+
+---
+
+## TL;DR — Storage Benchmark Accuracy
+
+Parquet produces an **inaccurate** storage benchmark at high throughput. The CPU decoder saturates around 1–2 GB/s and becomes the bottleneck before the storage system does. You end up measuring the client CPU, not the storage.
+
+Arrow IPC is the correct choice for a storage benchmark — bytes on disk are the in-memory format, so the I/O path is always the bottleneck. Two conditions must hold to keep it honest:
+
+- **S3 / object storage**: inherently cache-bypass. Every `get_batch()` goes over the network. No extra steps needed.
+- **Local filesystem**: the OS page cache will silently serve reads from DRAM after the first epoch. Use `odirect: true` (O_DIRECT, bypasses page cache completely) or size the dataset so it substantially exceeds host RAM. **O_DIRECT must be implemented for Arrow IPC — not raised as an unsupported exception — because it is the primary tool for accurate local storage benchmarking.**
+
+---
+
+## Recommendation Summary
+
+Do **not** spend time adding a Rust Parquet encoder to dgen-rs/dgen-py.
+
+The correct solution is to implement Arrow IPC file support in dlio_benchmark. Arrow IPC eliminates the Parquet generation bottleneck entirely, improves read throughput by orders of magnitude, requires **zero changes to s3dlio or dgen-rs**, and takes roughly 300–400 lines of Python across 3 new files.
+
+---
+
+## Measured Benchmark Results
+
+All measurements on this machine (12 logical CPUs, PyArrow 23.0.1) using 128 rows × 512 KiB = 64 MiB (≈ 67 MB) files — representative of large AI training sample files:
+
+```
+Arrow IPC write:  0.80 GB/s   (67 MB/file)
+Parquet write:    0.04 GB/s   (67 MB/file)
+IPC write speedup: 20x faster to generate
+
+Arrow IPC read:   1338 GB/s   (in-memory; get_batch × 2)
+Parquet read:     0.08 GB/s   (read_row_group × 2)
+IPC read speedup: 15,752x faster to read from memory
+```
+
+The 0.04 GB/s Parquet write figure matches the `parquet_pipeline_analysis.md` measurement exactly. It is a fundamental constraint of PyArrow's Parquet encoder processing `FixedSizeListArray<uint8>` at element granularity rather than block granularity. This cannot be fixed from Python.
+
+The in-memory read speedup is extreme because Arrow IPC `get_batch()` is a direct memory view — no decoding, no decompression. In real S3/network scenarios the read advantage is bounded by network throughput, but the CPU overhead difference is preserved at any bandwidth: Parquet requires decode+decompress per column chunk; Arrow IPC requires only `memcpy`.
+
+---
+
+## Why the dgen-rs Parquet Encoder Is the Wrong Investment
+
+The `parquet_pipeline_analysis.md` proposal was to add ~300 lines of Rust to dgen-rs implementing a parallel Parquet encoder using the `parquet` crate. This would:
+
+- Fix the generation bottleneck for Parquet
+- Require maintaining Rust Parquet crate integration in dgen-rs permanently
+- Still produce Parquet files that readers must decode+decompress under high network load
+- Deliver no improvement at the point that actually matters for production AI training: **read throughput at >10 GB/s storage bandwidth**
+
+Arrow IPC solves both the generation bottleneck (20× faster write) and the read bottleneck (zero CPU decode) in one move, with no Rust changes at all.
+
+---
+
+## Format Comparison
+
+| Property | Parquet | Arrow IPC File |
+|---|---|---|
+| On-disk format | Columnar, encoded, compressed | Raw Arrow buffers (the in-memory format) |
+| Write throughput (PyArrow) | ~0.04 GB/s for large fixed-size arrays | ~0.80 GB/s — 20× faster |
+| Read CPU cost | Decode + decompress per column chunk | `memcpy` only — zero decode overhead |
+| Footer | Row-group metadata + column stats | Record batch byte offsets |
+| Random batch access | `bisect(cumulative_offsets, idx)` | `get_batch(i)` — O(1), exact offset |
+| Compression | gzip, snappy, zstd, lz4, brotli | lz4, zstd optional (default: none) |
+| Bottleneck at >10 GB/s storage | CPU (decode) is the bottleneck | Network / storage is the bottleneck |
+| pyarrow write API | `pq.ParquetWriter` | `pa.ipc.new_file()` |
+| pyarrow read API | `pq.ParquetFile.read_row_group(i)` | `pa.ipc.open_file().get_batch(i)` |
+| dgen-rs changes needed | Would require ~300 new lines | **None** |
+| s3dlio changes needed | None | **None** |
+
+The data scientist's observation — "moving away from Parquet because it is hard to effectively utilize these files" — is precisely the CPU decode bottleneck at high network throughput. Arrow IPC removes it.
+
+---
+
+## Why No New Rust Is Needed
+
+The existing s3dlio Python API is already sufficient:
+- `s3dlio.get_range(uri, offset, length)` → range GET for any backend (S3, GCS, Azure, file, direct)
+- `s3dlio.stat(uri)["size"]` → file size for any backend
+
+The `_S3RangeFile` adapter in `parquet_reader_s3_iterable.py` wraps exactly these two calls. `pa.ipc.open_file()` accepts any seekable file-like object, so the adapter works unchanged for Arrow IPC — it is format-agnostic. The Arrow IPC reader is the Parquet reader with two function names changed.
+
+---
+
+## Benchmark Accuracy: Are We Measuring Storage or CPU/Memory?
+
+This is the right question to ask before implementing any format, and the answer determines how the implementation must be designed.
+
+### The Parquet problem — it makes a poor storage benchmark
+
+At storage throughputs above roughly 1–2 GB/s (easily achievable on modern NVMe or fast S3), PyArrow's Parquet column decoder saturates the CPU before the storage system is saturated. The result: storage throughput could double and benchmark results would not change, because the bottleneck is the client CPU doing decode, not the storage system doing I/O. **Parquet-based workloads do not accurately benchmark storage at high throughput.** This is one of the primary real-world motivations for formats like Arrow IPC: practitioners building fast ML pipelines have observed this exact bottleneck and moved away from Parquet.
+
+### Arrow IPC — correct for storage benchmarking, with two caveats
+
+Arrow IPC bytes on disk are identical to the Arrow in-memory format. `get_batch()` issues one range read and places the result directly into the Arrow buffer pool. There is no column decoder, no dictionary expansion, no decompression. The CPU cost is dominated by the I/O syscall and a single DMA + user-space copy. At any storage throughput below ~50 GB/s (memory bandwidth), the storage read is the bottleneck, not the final copy. This is what a storage benchmark should measure.
+
+However, two conditions must hold for Arrow IPC reads to accurately measure storage rather than DRAM or page cache:
+
+#### Caveat 1: Page cache (local filesystem only)
+
+On a local POSIX filesystem, the OS page cache will retain file data in DRAM after the first read. A second epoch over the same files will return entirely from page cache — measuring DRAM bandwidth (~40–80 GB/s), not NVMe or network storage. This problem exists for every format, but it is **more acute for Arrow IPC** because:
+- Parquet decode is CPU-intensive — the CPU acts as a natural throttle that causes pages to be evicted before the next epoch starts
+- Arrow IPC decode is trivial — the OS has time to cache everything before the next epoch begins
+
+dlio_benchmark already detects this condition and warns:
+```
+WARNING: The amount of dataset is smaller than the host memory; data might be
+cached after the first epoch. Increase the size of dataset to eliminate the caching effect!
+```
+
+This warning should be heeded. But for cases where the dataset cannot be made large enough (e.g., rapid iteration, testing), **O_DIRECT is the correct solution** — it bypasses the page cache entirely and forces every read to go to storage hardware.
+
+The current implementation plan raises `Exception("O_DIRECT not yet supported")` for Arrow IPC. **O_DIRECT support must be implemented, not skipped.** O_DIRECT is the most important mode for accurate local storage benchmarking. See the implementation plan below for the approach.
+
+#### Caveat 2: Page cache (object storage — not a concern)
+
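+For S3/MinIO/object storage via `ArrowIPCReaderS3Iterable`, every `get_batch()` call goes over the network. There is no client-side page cache, so object storage reads bypass the client cache by construction and accurately measure storage throughput with Arrow IPC. No special client-side handling is needed. The one exception is server-side caching: an object server running on the benchmark host, or one whose RAM exceeds the dataset size, can still serve reads from its own memory — see the MinIO row in the table below.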
+
+### Is Arrow IPC a realistic production format?
+
+Yes. Hugging Face `datasets` library stores all datasets in Arrow IPC (Feather v2) format internally. The format is used in production MLOps pipelines at scale. It is also the native exchange format between Apache Arrow producers and consumers (Spark, DuckDB, pandas, Polars). The premise that "real workloads use Parquet" is correct for data warehousing and analytics — but for AI training data ingestion, Arrow IPC is an accurate representation of modern high-throughput pipelines.
+
+### Summary of accuracy requirements
+
+| Storage path | Cache bypass needed | How to achieve it |
+|---|---|---|
+| Local NVMe / SSD | **Yes** | `odirect: true` in YAML (O_DIRECT), OR dataset >> host RAM |
+| NFS / parallel FS | **Yes, if the dataset fits in the client page cache** | Dataset >> host RAM, OR `echo 3 > /proc/sys/vm/drop_caches` between epochs |
+| S3 / object storage | No — always bypassed | Nothing extra needed; every GET goes to the network |
+| MinIO on same machine | Partial — MinIO has its own cache | Use remote MinIO, or size dataset >> MinIO server RAM |
+
+**The benchmark operator's checklist for accurate Arrow IPC results:**
+1. Set `num_files_train` so that `total dataset size >> host RAM` (heed the dlio_benchmark warning)
+2. For local storage: use `odirect: true` or run `echo 3 > /proc/sys/vm/drop_caches` between experiments (requires root / `sudo`)
+3. Discard epoch 1 results if cache state is uncertain; report epoch 2+ as the steady-state storage throughput
+4. For S3: no extra steps — object storage reads are always cache-bypass
+
+### Format verdict from a storage benchmarking perspective
+
+| Format | Bottleneck at >1 GB/s storage | Accurate storage benchmark? |
+|---|---|---|
+| Parquet (compressed) | CPU decode — saturates before storage | **No** — measures client CPU, not storage |
+| Parquet (uncompressed) | CPU decode (lighter) — still ~1–5 GB/s ceiling | **Marginal** — becomes inaccurate above ~2 GB/s |
+| Arrow IPC (uncompressed) | Storage I/O (with cache bypass) | **Yes** — measures storage when cache is bypassed |
+| Arrow IPC + LZ4 | LZ4 decode at ~15 GB/s — above most storage | **Yes** — LZ4 is fast enough to remain storage-bound up to ~12 GB/s |
+
+Arrow IPC uncompressed is the correct choice. Arrow IPC + LZ4 is a valid alternative for benchmarking compressed data ingestion while remaining storage-bound.
+
+---
+
+## Implementation Plan
+
+Six touch points total. Three new files, three small modifications.
+
+Both POSIX/local-filesystem and object-storage (S3/MinIO/GCS/Azure via s3dlio) paths must work, and the reader factory already dispatches on `storage_type` exactly as it does for Parquet. The generator already uses `self.storage.islocalfs()` to choose between a direct file write and a buffer-then-upload path. Arrow IPC follows both patterns identically — only the writer and reader API calls change.
+
+---
+
+### New files
+
+#### `reader/arrow_ipc_reader.py` — POSIX / local filesystem reader
+
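+Used when `storage_type` is `local`, `nfs`, or any non-object-store type. PyArrow's `pa.ipc.open_file()` accepts a plain filesystem path directly, so no adapter is needed — it opens the file and reads the footer in one call. (A string path is opened as an ordinary file, not memory-mapped; pass `pa.memory_map(filename)` as the source if zero-copy mmap reads are wanted.)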
+
+```python
+import pyarrow as pa
+from dlio_benchmark.reader.reader_handler import FormatReader
+from dlio_benchmark.utils.utility import Profile
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+
+dlp = Profile(MODULE_DATA_READER)
+
+class ArrowIPCReader(FormatReader):
+
+    def __init__(self, dataset_type, thread_index, epoch):
+        super().__init__(dataset_type, thread_index)
+        opts = getattr(self._args, "storage_options", {}) or {}
+        self._batch_cache_size = int(opts.get("batch_cache_size", 4))
+        self._batch_cache: dict = {}
+        self._lru: list = []
+
+    @dlp.log
+    def open(self, filename):
+        # pa.ipc.open_file reads only the footer on open — no full file load.
+        reader = pa.ipc.open_file(filename)
+        return (reader, reader.num_record_batches)
+
+    @dlp.log
+    def get_sample(self, filename, sample_index):
+        reader, num_batches = self.open_file_map[filename]
+        # Assumes fixed records_per_batch; num_record_batches divides num_samples evenly.
+        records_per_batch = self._args.num_samples_per_file // num_batches
+        batch_idx = sample_index // records_per_batch
+
+        cache_key = (filename, batch_idx)
+        if cache_key not in self._batch_cache:
+            if len(self._lru) >= self._batch_cache_size:
+                evict = self._lru.pop(0)
+                self._batch_cache.pop(evict, None)
+            # get_batch() issues exactly one range read for the batch buffers.
+            batch = reader.get_batch(batch_idx)
+            self._batch_cache[cache_key] = batch
+            self._lru.append(cache_key)
+        else:
+            batch = self._batch_cache[cache_key]
+
+        dlp.update(image_size=batch.nbytes)
+        return self._args.resized_image
+
+    @dlp.log
+    def close(self, filename):
+        keys = [k for k in self._batch_cache if k[0] == filename]
+        for k in keys:
+            self._batch_cache.pop(k, None)
+            if k in self._lru:
+                self._lru.remove(k)
+        super().close(filename)
+```
+
+Key points:
+- `pa.ipc.open_file(filename)` works with any POSIX path or NFS mount. No wrapper needed. (URI strings such as `file://...` are not parsed — strip the scheme and pass the plain path.)
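+- `reader.num_record_batches` replaces the `bisect(cumulative_offsets)` lookup used in Parquet — Arrow IPC stores batch byte offsets in the footer, so `get_batch(i)` is O(1) and issues one exact range read. The `sample_index // records_per_batch` arithmetic is only correct when every batch holds the same number of rows, i.e. `ipc_record_batch_size` divides `num_samples_per_file` evenly; otherwise samples map to the wrong batch.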
+- The cache eviction and `dlp` telemetry pattern is identical to `ParquetReader`.
+- **O_DIRECT must be supported** (not raise an exception) — it is the primary mechanism for accurate local storage benchmarking. When `odirect: true`, `open()` uses a `_DirectRangeFile` adapter backed by s3dlio's `direct://` URI scheme instead of a plain path. See `reader_factory.py` modifications below.
+
+---
+
+#### `reader/arrow_ipc_reader_s3_iterable.py` — S3 / object-store reader
+
+Used when `storage_type` is `s3` or `aistore`. PyArrow's `pa.ipc.open_file()` accepts any seekable file-like object, which means the existing `_S3RangeFile` adapter from `parquet_reader_s3_iterable.py` works without modification. The s3dlio, MinIO, and s3torchconnector dispatch paths are unchanged.
+
+```python
+import pyarrow as pa
+from dlio_benchmark.reader.parquet_reader_s3_iterable import (
+    _S3RangeFile, _MinioRangeFile, _S3TCRangeFile,
+)
+from dlio_benchmark.reader.reader_handler import FormatReader
+from dlio_benchmark.utils.utility import Profile
+from dlio_benchmark.common.constants import MODULE_DATA_READER
+
+dlp = Profile(MODULE_DATA_READER)
+
+class ArrowIPCReaderS3Iterable(FormatReader):
+
+    def __init__(self, dataset_type, thread_index, epoch):
+        super().__init__(dataset_type, thread_index)
+        opts = getattr(self._args, "storage_options", {}) or {}
+        self._storage_library = opts.get("storage_library", "s3dlio")
+        self._endpoint_url    = opts.get("endpoint_url", "")
+        self._batch_cache_size = int(opts.get("batch_cache_size", 4))
+        self._batch_cache: dict = {}
+        self._lru: list = []
+
+    def _make_range_file(self, uri):
+        """Return the appropriate seekable file-like adapter for this URI."""
+        if self._storage_library == "s3dlio":
+            return _S3RangeFile(uri)
+        elif self._storage_library == "minio":
+            return _MinioRangeFile(uri, self._endpoint_url)
+        elif self._storage_library == "s3torchconnector":
+            return _S3TCRangeFile(uri)
+        else:
+            raise ValueError(f"Unknown storage_library: {self._storage_library!r}")
+
+    @dlp.log
+    def open(self, filename):
+        rf = self._make_range_file(filename)
+        # Two small range GETs: one for the magic/version header, one for the footer.
+        reader = pa.ipc.open_file(rf)
+        return (reader, reader.num_record_batches)
+
+    @dlp.log
+    def get_sample(self, filename, sample_index):
+        reader, num_batches = self.open_file_map[filename]
+        records_per_batch = self._args.num_samples_per_file // num_batches
+        batch_idx = sample_index // records_per_batch
+
+        cache_key = (filename, batch_idx)
+        if cache_key not in self._batch_cache:
+            if len(self._lru) >= self._batch_cache_size:
+                evict = self._lru.pop(0)
+                self._batch_cache.pop(evict, None)
+            # Exactly one range GET for the batch body — no full-file download.
+            batch = reader.get_batch(batch_idx)
+            self._batch_cache[cache_key] = batch
+            self._lru.append(cache_key)
+        else:
+            batch = self._batch_cache[cache_key]
+
+        dlp.update(image_size=batch.nbytes)
+        return self._args.resized_image
+
+    @dlp.log
+    def close(self, filename):
+        keys = [k for k in self._batch_cache if k[0] == filename]
+        for k in keys:
+            self._batch_cache.pop(k, None)
+            if k in self._lru:
+                self._lru.remove(k)
+        super().close(filename)
+```
+
+Key points:
+- `_S3RangeFile` is format-agnostic — it only implements `seek/tell/read` over `s3dlio.get_range` and `s3dlio.stat`. It does not know or care whether it is wrapping a Parquet footer scan or an Arrow IPC footer scan.
+- `pa.ipc.open_file(rf)` reads the IPC file magic (8 bytes) and footer (a small Flatbuffer at the end of the file) using two range GETs, then returns. No record batch data is transferred at open time.
+- `reader.get_batch(batch_idx)` uses the byte offset and length from the footer to issue exactly one range GET. This is the strongest advantage over Parquet at S3 scale: one network round trip per sample batch, not a column-chunk scan.
+
+---
+
+#### `data_generator/arrow_ipc_generator.py` — file generator (POSIX and object storage)
+
+The existing `ParquetGenerator.generate()` already uses `is_local = self.storage.islocalfs()` to dispatch between a direct write (`writer_target = out_path_spec`) and a buffer-then-upload path (`writer_target = pa.BufferOutputStream()` → `self.storage.put_data(...)`). `ArrowIPCGenerator` uses the same pattern with `pa.ipc.new_file()` in place of `pq.ParquetWriter`.
+
+```python
+import os
+import numpy as np
+import pyarrow as pa
+
+from dlio_benchmark.data_generator.data_generator import DataGenerator
+from dlio_benchmark.data_generator.parquet_generator import (
+    ParquetGenerator, _PA_SCALAR_TYPE_MAP, _NP_TYPE_MAP,
+)
+from dlio_benchmark.utils.utility import progress, gen_random_tensor, DLIOMPI
+import dgen_py as _dgen_py
+
+class ArrowIPCGenerator(DataGenerator):
+
+    def __init__(self):
+        super().__init__()
+        opts = getattr(self._args, "storage_options", {}) or {}
+        # ipc_record_batch_size: rows per Arrow record batch.
+        # Must divide num_samples_per_file evenly for O(1) get_batch() indexing.
+        self.record_batch_size = int(
+            opts.get("ipc_record_batch_size",
+                     getattr(self._args, "parquet_row_group_size", 1024))
+        )
+        self.parquet_columns = getattr(self._args, "parquet_columns", [])
+
+    def _build_schema(self, legacy_elem_size=None):
+        # Reuse the same schema logic as ParquetGenerator — the Arrow IPC file
+        # format stores this schema verbatim in the file header and footer.
+        pg = ParquetGenerator.__new__(ParquetGenerator)
+        pg._args = self._args
+        pg.parquet_columns = self.parquet_columns
+        return pg._build_schema(legacy_elem_size=legacy_elem_size)
+
+    def generate(self):
+        super().generate()
+
+        np.random.seed(self.BASE_SEED + self.my_rank)
+        rng = np.random.default_rng(seed=self.BASE_SEED + self.my_rank)
+        dim = self.get_dimension(self.total_files_to_generate)
+        is_local = self.storage.islocalfs()
+
+        write_opts = pa.ipc.IpcWriteOptions(compression=None)  # zero-decode on read
+
+        for i in range(self.my_rank, int(self.total_files_to_generate), self.comm_size):
+            progress(i + 1, self.total_files_to_generate, "Generating Arrow IPC Data")
+
+            out_path_spec = self.storage.get_uri(self._file_list[i])
+            dim_raw = dim[2 * i]
+            if isinstance(dim_raw, list):
+                dim1 = int(dim_raw[0]); dim2 = int(dim_raw[1]) if len(dim_raw) > 1 else 1
+            else:
+                dim1 = int(dim_raw); dim2 = int(dim[2 * i + 1])
+            elem_size = dim1 * dim2
+
+            schema = self._build_schema(legacy_elem_size=elem_size)
+
+            # ── Choose write target ───────────────────────────────────────
+            if is_local:
+                parent_dir = os.path.dirname(out_path_spec)
+                if parent_dir:
+                    os.makedirs(parent_dir, exist_ok=True)
+                writer_target = out_path_spec          # direct filesystem write
+            else:
+                writer_target = pa.BufferOutputStream()  # buffer → put_data below
+
+            num_batches = (self.num_samples + self.record_batch_size - 1) // self.record_batch_size
+
+            with pa.ipc.new_file(writer_target, schema, options=write_opts) as writer:
+                for batch_idx in range(num_batches):
+                    batch_start = batch_idx * self.record_batch_size
+                    batch_end   = min(batch_start + self.record_batch_size, self.num_samples)
+                    cur_rows    = batch_end - batch_start
+
+                    if self.parquet_columns:
+                        # Column-schema mode — reuse ParquetGenerator helpers
+                        pg = ParquetGenerator.__new__(ParquetGenerator)
+                        pg._args = self._args
+                        pg.parquet_columns = self.parquet_columns
+                        columns = pg._generate_batch_columns(cur_rows, rng)
+                    else:
+                        # Legacy uint8 mode — same dgen path as ParquetGenerator
+                        flat = gen_random_tensor(shape=(cur_rows * elem_size,),
+                                                 dtype=np.uint8, rng=rng)
+                        arrow_flat = pa.array(flat, type=pa.uint8())
+                        arrow_data = pa.FixedSizeListArray.from_arrays(arrow_flat, elem_size)
+                        columns = {'data': arrow_data}
+
+                    batch = pa.RecordBatch.from_arrays(
+                        list(columns.values()), schema=schema
+                    )
+                    writer.write_batch(batch)
+
+            # ── Upload if object storage ──────────────────────────────────
+            if not is_local:
+                self.storage.put_data(out_path_spec, writer_target.getvalue().to_pybytes())
+
+        np.random.seed()
+```
+
+Key points:
+- `is_local` dispatch is identical to `ParquetGenerator` — `pa.ipc.new_file()` accepts either a filesystem path or a `BufferOutputStream` and PyArrow handles both transparently.
+- For POSIX/local: the IPC file is written directly to disk with `os.makedirs` pre-created, no staging buffer needed.
+- For object storage: the entire file is buffered in a `BufferOutputStream`, then uploaded via `self.storage.put_data()` — exactly the same pattern as Parquet. The file is typically 64–512 MB so this is acceptable; streaming multipart upload could be added later if needed.
+- `write_opts = IpcWriteOptions(compression=None)` is the default and should stay the default. Enabling `lz4` or `zstd` compression is possible but defeats the zero-decode advantage.
+- The dgen-py streaming pool path (used by `ParquetGenerator` for sub-32 MB batches) can be ported directly from `parquet_generator.py` once the basic path is validated.
+
+---
+
+### Modifications
+
+#### `common/enumerations.py`
+```python
+# In FormatType enum — add after PARQUET
+ARROW_IPC = 'arrow_ipc'
+
+# In FormatType.get_enum() — add the elif branch
+elif FormatType.ARROW_IPC.value == value:
+    return FormatType.ARROW_IPC
+```
+
+#### `reader/reader_factory.py`
+
+The dispatch mirrors the existing Parquet block exactly: S3/AIStore → range-GET reader, everything else → POSIX reader. Unlike Parquet, **O_DIRECT must be supported** — it is the primary means of ensuring accurate local storage benchmarking. When `odirect: true`, the local reader uses s3dlio's `direct://` URI scheme, which opens the file with `O_DIRECT` and returns DMA-aligned buffers. The S3 reader is inherently cache-bypass and ignores the flag.
+
+```python
+elif type == FormatType.ARROW_IPC:
+    if _args.storage_type in (StorageType.S3, StorageType.AISTORE):
+        # S3/object storage is always cache-bypass — odirect flag is irrelevant.
+        from dlio_benchmark.reader.arrow_ipc_reader_s3_iterable import ArrowIPCReaderS3Iterable
+        return ArrowIPCReaderS3Iterable(dataset_type, thread_index, epoch_number)
+    else:
+        # For local/NFS: ArrowIPCReader respects odirect=True via direct:// URI.
+        # Raising NotImplemented here (as other formats do) would make accurate
+        # local storage benchmarking impossible — do not do this.
+        from dlio_benchmark.reader.arrow_ipc_reader import ArrowIPCReader
+        return ArrowIPCReader(dataset_type, thread_index, epoch_number)
+```
+
+O_DIRECT implementation in `ArrowIPCReader.open()`:
+
+```python
+@dlp.log
+def open(self, filename):
+    if getattr(self._args, 'odirect', False):
+        # Rewrite the path as a direct:// URI so s3dlio opens with O_DIRECT.
+        # s3dlio.get_range('direct:///path/to/file', offset, length) returns
+        # DMA-aligned Bytes, bypassing the page cache entirely.
+        import s3dlio
+        uri = 'direct://' + filename if not filename.startswith('direct://') else filename
+        # Wrap in _DirectRangeFile (same interface as _S3RangeFile but uses
+        # s3dlio direct:// backend for O_DIRECT reads).
+        rf = _DirectRangeFile(uri)
+        reader = pa.ipc.open_file(rf)
+    else:
+        reader = pa.ipc.open_file(filename)
+    return (reader, reader.num_record_batches)
+```
+
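+`_DirectRangeFile` is identical to `_S3RangeFile` with `uri = 'direct://' + posix_path`. `posix_path` must be absolute so that the result has the form `direct:///path/to/file`; resolve relative paths with `os.path.abspath()` first. Since s3dlio's `direct://` backend already handles aligned reads, no additional buffer alignment code is needed in Python.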
+
+#### `data_generator/generator_factory.py`
+
+The generator handles both POSIX and object storage internally via `self.storage.islocalfs()`, so a single factory entry covers both:
+
+```python
+elif type == FormatType.ARROW_IPC:
+    from dlio_benchmark.data_generator.arrow_ipc_generator import ArrowIPCGenerator
+    return ArrowIPCGenerator()
+```
+
+---
+
+### Example YAML — local filesystem
+
+```yaml
+dataset:
+  format: arrow_ipc
+  storage_type: local
+  storage_root: /mnt/nvme/training-data
+  num_samples_per_file: 1024
+  num_files_train: 500
+  storage_options:
+    batch_cache_size: 4
+    ipc_record_batch_size: 256   # must divide num_samples_per_file evenly
+```
+
+No `storage_library` key is needed for local storage — `ArrowIPCReader` opens files directly with `pa.ipc.open_file(filename)`.
+
+### Example YAML — S3 / object storage via s3dlio
+
+```yaml
+dataset:
+  format: arrow_ipc
+  storage_type: s3
+  storage_root: my-bucket
+  num_samples_per_file: 1024
+  num_files_train: 500
+  storage_options:
+    storage_library: s3dlio
+    endpoint_url: http://127.0.0.1:9000
+    batch_cache_size: 4
+    ipc_record_batch_size: 256   # must divide num_samples_per_file evenly
+```
+
+---
+
+## What to Keep from the Parquet Work
+
+The existing `ParquetReader`, `ParquetReaderS3Iterable`, and `ParquetGenerator` should remain in the codebase. Parquet is a widely used format and the existing implementation is correct and production-quality. The recommendation is not to remove Parquet support — it is to add Arrow IPC as the preferred format for new workloads, particularly those running against high-throughput storage (>10 GB/s).
+
+For existing Parquet datasets, the current readers continue to work. For new datasets, Arrow IPC is the better choice.
diff --git a/docs/Batch-API-Design-Analysis.md b/docs/Batch-API-Design-Analysis.md
new file mode 100644
index 00000000..ef55e56b
--- /dev/null
+++ b/docs/Batch-API-Design-Analysis.md
@@ -0,0 +1,156 @@
+# Batch API Design Analysis: GIL Overhead and Batch-Level Delivery
+
+*Date: May 12, 2026*
+
+---
+
+## Context
+
+`PyBytesAsyncDataLoader.items()` (s3dlio) returns a per-item synchronous iterator of
+`PyObjectItem` values. Each `__next__()` call releases the GIL while waiting for the
+next completed GET from Rust/Tokio's `buffer_unordered(N)` sliding window, then
+reacquires it to build and return a `PyObjectItem { uri, data }`.
+
+The question: **is GIL traversal overhead massive at ~1,640 completions/sec per worker,
+and is a higher-level batch API worth building?**
+
+---
+
+## GIL Traversal Overhead — Is It Massive?
+
+### Worker Architecture
+
+DLIO's readers run inside **PyTorch `DataLoader` subprocess workers**
+(`num_workers=read_threads`, with `multiprocessing_context`). Each worker is a
+**separate OS process** with its **own independent GIL**. GIL *contention* across
+workers is architecturally impossible — no two workers share the same interpreter.
+
+### Cost Per GIL Crossing
+
+A `__next__()` call from Python into a PyO3 `#[pyclass]` iterator involves:
+- Releasing GIL: `py.allow_threads()` (releases the mutex)
+- Blocking recv: `rx.blocking_recv()` (blocks OS thread, no CPU burn)
+- Reacquiring GIL: mutex re-lock
+- Building `PyObjectItem`: struct allocation + `uri: String` copy
+
+Total cost: **~1–3 µs** per crossing on a modern CPU (cache-warm, no contention).
+
+### Actual Crossing Rate
+
+At s3-ultra loopback (8 µs latency, ~10 GB/s peak):
+- 315 KB per object → ~39 µs per GET (transfer + latency)
+- With 64 in-flight: **~1,640 completions/sec per worker**
+
+GIL overhead per worker: `1,640 × 2 µs = 3.3 ms/sec = 0.33%`
+
+### `get_sample()` Cost in DLIO
+
+The image reader's `get_sample()` per item:
+```python
+def get_sample(self, filename, sample_index):
+    byte_count = self._object_cache.get(filename, 0)   # dict lookup
+    dlp.update(image_size=byte_count)                  # telemetry counter
+    dft_ai.update(image_size=byte_count)               # telemetry counter
+```
+~1–5 µs total. Including the ~2–3 µs GIL crossing above, that is ~3–8 µs of Python
+work per item; at 1,640 items/sec: **5–13 ms/sec = 0.5–1.3%** overhead per worker. All
+Python work per item (dict lookups, cache update, telemetry, batch counter) is within this range.
+
+### Bottom Line
+
+**GIL overhead is real but not massive.** Total Python-side cost per worker: ~1–1.5%
+of wall time. This will not show up as a hotspot in any profile. The concern is
+understandable but the math does not support the "massive overhead" framing.
+
+---
+
+## Is a Batch-Level API Worth Building?
+
+Yes — but for different reasons than GIL reduction.
+
+### Reason 1: Reduced Python Object Churn
+
+Per-item `items()` incurs, per completion:
+- One `PyObjectItem` Rust struct
+- One Python `str` for `uri` (heap allocation, reference counted)
+- One Python dict lookup round-trip
+
+A `collect_batch(n)` returning `List[PyObjectItem]` amortizes allocator overhead:
+Rust builds a `Vec<(String, Bytes)>`, converts it once to a Python list, and hands
+it over in one GIL acquisition. Python iterates a plain Python list — no `__next__()`
+dispatch per item.
+
+### Reason 2: Path to True Zero-Copy Batch Delivery *(the genuinely interesting angle)*
+
+For workloads with **fixed-size objects** (e.g., RetinaNet JPEGs at ~315 KB each),
+Rust could:
+
+1. Pre-allocate (or reuse) a contiguous `[u8; batch_size × max_file_size]` buffer
+2. Fill it from `buffer_unordered(N)` as items complete — in-place, no copies
+3. Wrap it as a numpy array via the buffer protocol
+4. Return `(numpy_array, List[str] uris, List[int] actual_sizes)`
+
+Python receives a `(batch_size, max_file_size)` numpy array with **zero Python-side
+memory allocation and zero copying**. The batch is already in contiguous memory
+ready for the decode pipeline. This is the path that no existing Python data loader
+(PyTorch included) takes.
+
+### Reason 3: Tail-Latency Concern (downside)
+
+A `collect_batch(n)` approach requires Rust to wait for all `n` items to complete
+before returning. With `buffer_unordered(64)` and `batch_size=16`, the batch
+completion latency is driven by the **slowest of 16** concurrent GETs. At 8 µs
+loopback latency the variance is tiny, but at real S3 latencies (20–50 ms WAN), waiting for
+the 16th item adds measurable P95 tail latency. The per-item `items()` approach
+avoids this — Python sees items as they arrive.
+
+### Reason 4: `num_samples_per_file` Alignment
+
+DLIO's concept of a "batch" is `batch_size` **samples**, not files. For NPZ with
+`num_samples_per_file=4` and `batch_size=16`: 1 batch = 4 files. Rust only knows about files, not
+sample structure within files. A file-level `collect_batch(n)` API requires the
+caller to compute `n = batch_size // num_samples_per_file`. Manageable, but the
+API must operate on files; Python converts files → samples.
+
+---
+
+## Design Options
+
+### Option A — `collect_batch(n) -> List[PyObjectItem]` (simple, immediate)
+
+Rust collects `n` items into a `Vec`, wraps as Python list. One GIL crossing.
+Python iterates a native Python list — no iterator protocol overhead per item.
+Easy to implement on top of the existing channel.
+
+### Option B — `fill_buffer(numpy_array, n) -> List[str]` (zero-copy, advanced)
+
+Python pre-allocates `numpy.empty((n, max_file_size), dtype=np.uint8)` once and
+reuses it each batch. Rust writes fetched bytes directly into the numpy buffer
+in-place via the buffer protocol. Returns the URI list. Zero Python-side memory
+allocation per batch. Requires `max_file_size` known upfront.
+
+### Option C — Keep `items()`, batch in Python (no Rust changes)
+
+```python
+import itertools
+batch = list(itertools.islice(loader.items(), batch_size))
+```
+
+Still `n` GIL crossings — each `islice` element is a `__next__()` call. Does not
+reduce overhead, just changes control flow structure.
+
+---
+
+## Recommendation
+
+The GIL overhead at these rates with subprocess workers does **not** need fixing for
+correctness or performance. If pursuing a batch API, the target should be
+**Option B** — the zero-copy numpy path. That is the genuinely impactful optimization
+for fixed-size-object workloads (RetinaNet, CosmoFlow), eliminating per-item Python
+object creation and memory allocation entirely.
+
+Option A is a good stepping stone toward Option B and easy to add without touching
+the existing `items()` API.
+
+Both live alongside `items()` — the per-item iterator remains the right primitive for
+variable-size workloads (NPZ, HDF5, Parquet) where a buffer cannot be pre-sized.
diff --git a/docs/DLIO_IO_Issues-Executive_Summary_2026-03-28.md b/docs/DLIO_IO_Issues-Executive_Summary_2026-03-28.md
deleted file mode 100644
index 74ddc851..00000000
--- a/docs/DLIO_IO_Issues-Executive_Summary_2026-03-28.md
+++ /dev/null
@@ -1,159 +0,0 @@
-# DLIO Benchmark I/O Issues — Executive Summary
-
-**Date:** 2026-03-28  
-**Full technical document:** [DLIO_IO_Issues-Proposal_2026-03-28.md](DLIO_IO_Issues-Proposal_2026-03-28.md)  
-**Audience:** Engineering leads, project owners, and decision-makers who need to understand the scope of issues and the investment required to address them — without implementation details.
-
----
-
-## What This Review Found
-
-A code review of the `dlio_benchmark` codebase identified thirteen distinct issues across data generation, data loading, checkpointing, configuration management, and benchmark correctness. The most significant finding is that **results produced by the current codebase for local-filesystem and object-storage workloads are not directly comparable to each other**, because the two backend paths perform different amounts of CPU work even when given identical data. This calls into question a class of published comparisons.
-
-The issues range from critical correctness bugs to structural inefficiencies. All are actionable. None require redesigning the benchmark's overall architecture.
-
----
-
-## Critical Issues (Affect Correctness of Results)
-
-### 1. File and Object Storage Backends Are Not Measuring the Same Thing
-
-The object-storage readers were written to skip all data decoding — they read raw bytes, record the byte count, and discard the bytes, because DLIO returns a pre-allocated random tensor to the training loop regardless of what was read. The local-filesystem readers were not updated to match: they fully decode every JPEG file (using PIL), fully load every NPY array (using NumPy), and fully inflate compressed HDF5 datasets — all of which is then discarded.
-
-**Consequence:** A local-filesystem JPEG benchmark spends 70–99% of training-step time on CPU image decoding, not on I/O. An equivalent object-storage benchmark spends near 0% on decoding. The same storage hardware running the same data through the two paths can produce benchmark numbers that differ by 5–20× due entirely to this CPU overhead difference, not actual storage performance differences.
-
-**Decision required:** Bring local-filesystem readers up to the standard already implemented in the S3 iterable readers. This is a code-only change and does not affect the storage I/O being measured. Until this is done, cross-backend comparisons in benchmark reports are not internally consistent.
-
-→ Full analysis: [Section 13](DLIO_IO_Issues-Proposal_2026-03-28.md#13-file-vs-object-workload-asymmetry--closing-the-performance-gap)
-
-### 2. Data Generation Is Slower Than It Needs to Be by Orders of Magnitude
-
-JPEG and PNG data generation is CPU-bottlenecked on image compression, not on storage write throughput. At typical image sizes, generating an ImageNet-scale dataset (1.28 million files) takes approximately 80 minutes per rank for JPEG, and over 4 hours per rank for PNG. The actual storage write takes roughly 16 seconds per rank. Generation time is 300–1000× longer than storage write time, dominated entirely by compression work that has no bearing on the storage being benchmarked.
-
-For the most common benchmark configurations (non-DALI data loaders), JPEG and PNG files do not need to be valid image files, because the reader never decodes them. The generator can write raw random bytes directly, collapsing generation overhead from ~30 milliseconds per file to under 0.01 milliseconds — a 2000–4000× speedup. This applies to all configurations except those using NVIDIA DALI, which calls a real image decoder and therefore requires valid JPEG bitstreams.
-
-**Decision required:** Update JPEG and PNG generators to detect the configured data loader and skip image encoding when the reader does not decode. For DALI configurations, accept the encoding cost as unavoidable and document it as a known constraint.
-
-→ Full analysis: [Section 9g](DLIO_IO_Issues-Proposal_2026-03-28.md#9g-jpeGpng-do-files-need-to-be-actually-valid-images), [Section 9d](DLIO_IO_Issues-Proposal_2026-03-28.md#9d-where-time-actually-goes-in-an-end-to-end-jpeg-benchmark-run)
-
-### 3. TFRecord / Iterative Sampler Reads the Wrong Files on Non-Zero Ranks
-
-A file-index tracking bug in `build_sample_map_iter()` causes MPI rank 1 and above to read from the wrong portion of the dataset when using the iterative data sampler (standard for TFRecord workloads). The first file read per rank is correct; all subsequent reads revert to iterating from the beginning of the file list. Both rank 0 and rank 1 end up reading the same overlapping set of files while the upper half of the dataset is never read by any rank.
-
-**Consequence:** Any TFRecord benchmark result using more than one MPI rank double-counts data from the lower half of the dataset and misses the upper half entirely. Reported throughput is inflated and not reproducible by other means.
-
-**Decision required:** Fix the file-index counter in `build_sample_map_iter()`. The PyTorch index sampler does not have this bug.
-
-→ Full analysis: [Section 2b](DLIO_IO_Issues-Proposal_2026-03-28.md#2b-tf--iterative-path--build_sample_map_iter-used-when-data_loader_sampler--iterative), [Section 6e](DLIO_IO_Issues-Proposal_2026-03-28.md#6e-build_sample_map_iter-bug--concrete-description)
-
----
-
-## High-Priority Issues (Significantly Affect Benchmark Quality)
-
-### 4. `read_threads` Is Hardcoded at a Value That Is Wrong at Scale
-
-The thread count for parallel I/O is set as a fixed integer in each YAML config file and is never adjusted for the actual deployment topology. For JPEG/PNG workloads, storage throughput scales directly with the number of concurrent open requests. With the default value, a typical NFS deployment uses less than 10% of its available bandwidth — not because the storage is slow, but because the benchmark is not issuing enough concurrent requests. The correct value varies by an order of magnitude depending on how many MPI ranks share a node.
-
-**Decision required:** Support an `auto` setting for `read_threads` that resolves at runtime based on the actual MPI topology. Keep the integer form for reproducible runs. Update default configs to a higher starting value.
-
-→ Full analysis: [Section 11](DLIO_IO_Issues-Proposal_2026-03-28.md#11-read_threads--fixed-yaml-value-vs-runtime-adaptive-sizing)
-
-### 5. Deduplicating Storage Systems Will Produce Meaningless Results Without Unique File Content
-
-Every generated file must contain content that is byte-unique across the entire dataset. Storage systems from major enterprise vendors (NetApp, Pure Storage, Vast Data, and many object stores) apply inline deduplication by default. If multiple files share identical byte content, the storage system physically stores only one copy and the benchmark measures deduplication throughput rather than storage write throughput. Results can appear orders of magnitude higher than the system's actual sustainable ingestion rate.
-
-The codebase correctly uses a unique random seed per file via dgen-py; however, any shortcut that pre-computes one serialized blob and copies it across files — for any format — would silently produce deduplicated data. This constraint must be treated as non-negotiable for any benchmark run on production storage.
-
-→ Full analysis: [Section 9e](DLIO_IO_Issues-Proposal_2026-03-28.md#9e-the-non-negotiable-constraint-every-file-must-contain-unique-bytes)
-
-### 6. Storage Reader CPU Overhead Contaminates Training-Step Timing
-
-Even apart from the file/object asymmetry described in Issue 1, all local-filesystem readers include CPU decode time inside the training-step latency window. The benchmark reports this combined time as if it were pure storage access time. For JPEG workloads, 71–99% of the reported per-sample time is CPU decoding, not storage I/O.
-
-→ Full analysis: [Section 9c](DLIO_IO_Issues-Proposal_2026-03-28.md#9c-reader-overhead-by-format-local-filesystem-path), [Section 9d](DLIO_IO_Issues-Proposal_2026-03-28.md#9d-where-time-actually-goes-in-an-end-to-end-jpeg-benchmark-run)
-
----
-
-## Structural Issues (Reduce Maintainability and Reproducibility)
-
-### 7. Forty-Nine Configuration Files for a Small Orthogonal Matrix
-
-The `configs/dlio/workload/` directory contains 49 YAML files covering a matrix of approximately 7 models × 4 storage backends × 2–3 phases. The file count grows multiplicatively with every new backend or model. Files share 90–95% identical content; the differing fields are storage backend name, bucket name, and endpoint URL. The endpoint URLs hard-code a specific lab IP address, making every object-storage config file non-portable outside that lab.
-
-Hydra, the configuration framework already in use, supports config composition through config groups. Adopting it reduces the 49 files to approximately 13 (7 model configs plus 3 shared storage templates plus 3 workflow configs), with connection details supplied at runtime rather than baked into files.
-
-→ Full analysis: [Section 7](DLIO_IO_Issues-Proposal_2026-03-28.md#7-yaml-config-proliferation-analysis), [Section 8](DLIO_IO_Issues-Proposal_2026-03-28.md#8-proposed-yaml-config-architecture)
-
-### 8. `multiprocessing_context` Must Match the Storage Library or Hangs Silently
-
-The fork-vs-spawn setting for DataLoader workers must be `spawn` for object-storage libraries that maintain background threads (s3dlio, s3torchconnector). If a user copies a local-filesystem YAML and adds an object-storage backend without changing `multiprocessing_context`, all object-storage reads will silently hang with no error message. The constraint is documented only in YAML comments, not enforced in code.
-
-→ Full analysis: [Section 6c](DLIO_IO_Issues-Proposal_2026-03-28.md#6c-multiprocessing_context-couples-to-storage_library-but-lives-in-reader)
-
-### 9. `storage_library` Config Schema Is Inconsistent
-
-The `storage_library` field lives in an inconsistent location across the YAML schema, dataclass, and validation code. This creates ambiguity in how CLI overrides are expressed and silently returns `None` in any code path that accesses the field outside the standard load sequence.
-
-→ Full analysis: [Section 6a](DLIO_IO_Issues-Proposal_2026-03-28.md#6a-storage_library-promotion-inconsistency)
-
----
-
-## Lower-Priority Issues (Operational Efficiency)
-
-### 10. No Intra-Rank Parallelism for Data Generation
-
-Each MPI rank generates files sequentially. On multi-core nodes, all cores beyond the one doing the generation loop sit idle during what is usually the longest phase of a benchmark run. Adding thread-level parallelism within each rank would multiply generation throughput by the available core count.
-
-→ Full analysis: [Section 5, Item 2](DLIO_IO_Issues-Proposal_2026-03-28.md#5-specific-improvement-opportunities), [Section 12e, Item 3](DLIO_IO_Issues-Proposal_2026-03-28.md#12e-recommendations)
-
-### 11. Object Store Generation Has No Async Pipeline
-
-Each file is generated and uploaded synchronously. Generation and upload cannot overlap, meaning each rank waits for the upload acknowledgment before generating the next file. An async upload pipeline would allow the CPU to generate the next file while the network transfers the previous one.
-
-→ Full analysis: [Section 5, Item 4](DLIO_IO_Issues-Proposal_2026-03-28.md#5-specific-improvement-opportunities)
-
-### 12. MPI Topology Is Collected but Not Used for Resource Planning
-
-DLIO already collects per-node rank counts and node indices at startup, but does not use this information to auto-size thread counts, assign file-locality by node, or report topology in benchmark output. All three uses are straightforward given the existing data.
-
-→ Full analysis: [Section 12](DLIO_IO_Issues-Proposal_2026-03-28.md#12-mpi-multi-host-topology--available-infrastructure-missing-integration)
-
-### 13. No Settle-Time Guard After Generation on Eventual-Consistency Systems
-
-After data generation completes, the benchmark immediately begins listing the generated files. On object stores with eventual-consistency semantics or NFS with attribute caching, newly written objects may not be visible to a listing immediately. If the listing returns fewer files than expected, the benchmark aborts with an error rather than retrying.
-
-→ Full analysis: [Section 6f](DLIO_IO_Issues-Proposal_2026-03-28.md#6f-no-barrier-before-directory-walk-in-initialize)
-
----
-
-## Recommended Prioritization
-
-| Priority | Issue | Effort | Impact |
-|---|---|---|---|
-| **Immediate** | File vs. object reader asymmetry (Issue 1) | Medium | Invalidates cross-backend comparisons |
-| **Immediate** | TFRecord iterative sampler bug (Issue 3) | Low | Invalidates multi-rank TFRecord results |
-| **High** | JPEG/PNG generator skips encoding for non-DALI (Issue 2) | Medium | Reduces generation from hours to seconds |
-| **High** | Unique-bytes constraint enforcement (Issue 5) | Low | Prevents meaningless results on dedup storage |
-| **High** | Auto-size `read_threads` (Issue 4) | Low | Unlocks full storage bandwidth at scale |
-| **Medium** | Derive `multiprocessing_context` automatically (Issue 8) | Low | Prevents silent hangs on config copy/paste |
-| **Medium** | YAML config composition with Hydra (Issue 7) | High | Reduces maintenance burden by ~70% |
-| **Medium** | Intra-rank generation parallelism (Issue 10) | Medium | Reduces generation wall-clock time proportionally |
-| **Low** | Async object-store upload pipeline (Issue 11) | Medium | Marginal throughput improvement |
-| **Low** | Node-local file affinity and topology logging (Issue 12) | Low | Improves NFS locality and result reproducibility |
-| **Low** | Post-generation settle time (Issue 13) | Low | Prevents spurious failures on object stores |
-
----
-
-## What Is Already Working Well
-
-The following design decisions in the current codebase are correct and should be preserved:
-
-- **dgen-py for data generation**: the zero-copy Rust-backed PRNG is the right foundation for all format generators. It is fast enough to never be the bottleneck and produces genuinely unique content per file.
-- **S3 iterable readers**: the skip-decode architecture is correct and complete. The task is to apply the same pattern to local-filesystem readers, not to change the object-storage path.
-- **Per-rank checkpoint files**: the distributed checkpointing design (each rank writes its own file, no serialization, barriers only at epoch boundaries) is correct for the workload being simulated.
-- **MPI topology collection in DLIOMPI**: the infrastructure to make topology-aware decisions is already present. It only needs to be wired into resource planning.
-- **TFRecord reader**: already returns the pre-allocated tensor without touching file bytes — the correct behaviour that all other readers need to adopt.
-
----
-
-*Full technical analysis, code examples, and implementation details are in [DLIO_IO_Issues-Proposal_2026-03-28.md](DLIO_IO_Issues-Proposal_2026-03-28.md).*
diff --git a/docs/DLIO_IO_Issues-Proposal_2026-03-28.md b/docs/DLIO_IO_Issues-Proposal_2026-03-28.md
deleted file mode 100644
index 854d53d5..00000000
--- a/docs/DLIO_IO_Issues-Proposal_2026-03-28.md
+++ /dev/null
@@ -1,1052 +0,0 @@
-# MPI Sharding & Parallelism Investigation: `dlio_benchmark`
-
-**Date:** 2026-03-28
-
----
-
-## 1. Data Generation
-
-**File:** `dlio_benchmark/data_generator/data_generator.py`
-
-**Sharding strategy — `_generate_files()`:**
-```python
-for i in range(self.my_rank, int(self.total_files_to_generate), self.comm_size):
-    ...
-```
-Classic rank-stride sharding. Rank `r` owns files at global indices `r, r+comm_size, r+2*comm_size, …`. File paths are pre-computed in `self._file_list[i]`, which distributes them across `num_subfolders_train` round-robin. This is correct and reproducible.
-
-**Seed handling:** `BASE_SEED + my_rank` for the per-rank RNG. File-level seeds are derived from a flowing `rng.integers(0, 2**63)` — no adjacent-seed correlation. Reproducible across runs.
-
-**Directory creation bottleneck:** Only rank 0 creates directories (correct, but means all other ranks idle during the `create_node` loop for subfolders). On slow NFS with many subfolders, this is measurable latency.
-
-**Intra-rank parallelism:** **None.** Each rank generates files in a serial loop. No threading. For large datasets on fast storage, each rank is I/O-bound writing one file at a time.
-
-**Object store path:** After each file, `storage.put_data(path, bytes_value)` is called synchronously. No pipelining or async upload.
-
----
-
-## 2. Data Loading (Training)
-
-**Files:** `dlio_benchmark/utils/config.py` · `dlio_benchmark/data_loader/torch_data_loader.py` · `dlio_benchmark/reader/reader_handler.py`
-
-### 2a. PyTorch path — `get_global_map_index()` (used when `data_loader_sampler == INDEX`)
-```python
-samples_per_proc = ceil(total_samples / comm_size)
-start_sample = my_rank * samples_per_proc
-end_sample   = (my_rank + 1) * samples_per_proc - 1
-# ...
-file_index = floor(global_sample_index / num_samples_per_file)
-abs_path = file_list[file_index]
-```
-**Correct.** Each rank gets a contiguous slice of the global sample space. File-to-sample mapping is done via global index, so rank `r` naturally reads a contiguous block of files. The custom `dlio_sampler` pre-computes `[start_sample, end_sample]` and yields indices from that range.
-
-Thread-level parallelism comes from `read_threads` (PyTorch `DataLoader` `num_workers`) with `multiprocessing_context` and `prefetch_factor`. Each worker independently reads samples.
-
-### 2b. TF / iterative path — `build_sample_map_iter()` (used when `data_loader_sampler == ITERATIVE`)
-```python
-files_per_rank = (num_files // comm_size) % num_files
-file_index = my_rank * files_per_rank          # ← initial offset
-for sample in sample_list:                      # sample_list is global-indexed
-    abs_path = file_list[file_index]
-    sample_index += 1
-    file_index = (sample_index // num_samples_per_file) % num_files  # ← LOCAL counter
-```
-**Bug:** The initial `file_index` (rank-aware offset) is applied only to the **first** sample. After that, `file_index` is driven by a LOCAL `sample_index` that starts at 0 regardless of rank. For rank 1 with `sppf=500`, rank 1 reads `file[2]` (correct), then immediately falls back to reading `file[0], file[0], …, file[1], …`.
-
-This means the TF iterative path does **not** correctly shard files across ranks — it reads from mostly the wrong files for all non-rank-0 ranks. The PyTorch index path does not have this bug.
-
-### 2c. Cross-rank file distribution pattern
-All ranks share the **same flat global file list** built by rank 0 (via `storage.walk_node()` + sort). There is no per-rank subdirectory affinity. With `num_subfolders_train > 0` the files are distributed across subfolders, but each rank reads from any subfolder in the list — there is no "this rank owns this directory" concept.
-
----
-
-## 3. Checkpointing
-
-**Files:** `dlio_benchmark/checkpointing/base_checkpointing.py` · `dlio_benchmark/checkpointing/pytorch_checkpointing.py`
-
-**Per-rank files:** Each checkpointing rank writes to `checkpoint_folder/global_epoch{E}_step{S}/model_states-{rank}.pt` independently — no rank serialization. Standard distributed checkpoint pattern.
-
-**Who checkpoints:** Controlled by `zero_stage`, `tensor_parallelism`, `pipeline_parallelism`, and `data_parallelism`. With `zero_stage=0`, only ranks `< model_parallelism` actually write (data-parallel copies are deduplicated). This is correct.
-
-**In-rank parallelism (checkpoint read):** `_get_streaming()` creates a `StreamingCheckpointing` instance with `num_parallel_readers=4`, `chunk_size=32MiB`. This parallelizes the read within a single rank's file. Writes happen via a single sequential stream.
-
-**Memory model:** `_SizePlaceholder` (no actual tensor allocation) + `_compute_state_bytes()` → correct byte count passed to the streaming backend. No RAM proportional to model size is used during save/load.
-
-**Barriers:** `comm.barrier()` after each checkpoint step in `_checkpoint_write()` / `_train()`. Optional `checkpoint_rank_sync` adds an extra barrier after every individual checkpoint. No barrier between individual layer writes within a rank.
-
-**Layer writes are serial:** Within a rank, layers are saved in a `for layer_index in range(start_layer, end_layer+1)` loop — no threading across layers.
-
----
-
-## 4. Summary Table
-
-| Component | MPI Sharding | Intra-rank Threads | Key Issue |
-|---|---|---|---|
-| Data generator | ✅ stride `range(rank, N, size)` | ❌ None (serial) | No parallel file writes; slow for large datasets |
-| Data loading (PyTorch) | ✅ contiguous sample slice, correct file mapping | ✅ `read_threads` workers | No per-rank directory affinity |
-| Data loading (TF/iter) | ⚠️ Bug: only first file uses rank offset | ✅ `read_threads` | `build_sample_map_iter()` file_index resets to 0 after first sample |
-| Checkpointing (write) | ✅ each rank writes its own file | ❌ layers written serially | No parallel layer writes per rank |
-| Checkpointing (read) | ✅ each rank reads its own file | ✅ 4 parallel readers | Only parallelized on the read path |
-
----
-
-## 5. Specific Improvement Opportunities
-
-1. **Per-rank subdirectory ownership during generation and loading**: Set `num_subfolders_train = comm_size` and have rank `r` exclusively write to (and read from) `train/{r:04d}/`. This eliminates namespace contention on NFS/Lustre and makes the I/O pattern far more realistic for distributed storage. Today `num_subfolders_train` partitions files into folders but without rank affinity.
-
-2. **Parallel intra-rank file generation**: Wrap the `_generate_files()` loop in a `ThreadPoolExecutor(max_workers=N)` — each thread writes an independent file (already uniquely seeded). This would N× generation throughput per rank on fast storage (NVMe, object store).
-
-3. **Fix `build_sample_map_iter()` file index tracking**: The local `sample_index` counter should be replaced with the global sample index for the file lookup, matching the logic in `get_global_map_index()`. Currently rank 1+ in TF mode reads wrong files.
-
-4. **Async object store upload**: In `_generate_files()`, the `storage.put_data(path, bytes)` call is synchronous. A bounded async queue (e.g., `asyncio` or `ThreadPoolExecutor`) would pipeline data generation and upload.
-
-5. **Parallel checkpoint layer writes per rank**: The inner `for layer_index in range(start_layer, end_layer+1)` loop in `save_checkpoint()` is serial. Since each layer writes to an independent file, these could be parallelized with threads — especially relevant for large models with many layers.
-
-6. **Read-ahead / file pinning**: The `read_threads` workers in PyTorch mode all operate on the global file list. Adding an optional `prefetch_list` derived from each rank's assigned file range (pinning files to DRAM via `mmap`) before training starts would eliminate open-file latency in tight training loops.
-
----
-
-## 6. Additional Issues Identified on Second Review
-
-### 6a. `storage_library` Promotion Inconsistency
-
-**File:** `dlio_benchmark/utils/config.py` — `LoadConfig()` (line ~1075) and `validate()` (line ~368)
-
-The YAML schema places `storage_library` as a top-level key under `storage:`:
-```yaml
-storage:
-  storage_type: s3
-  storage_library: s3dlio      # ← top-level in YAML
-  storage_options:
-    endpoint_url: https://...
-```
-
-But `validate()` reads it from inside `storage_options`:
-```python
-storage_library = (self.storage_options or {}).get("storage_library")
-```
-
-This only works because `LoadConfig()` performs an explicit "promotion" — it detects `config['storage']['storage_library']` and injects it into `args.storage_options['storage_library']`. So the YAML schema and the dataclass schema are inconsistent: `ConfigArguments` has no top-level `storage_library` field, and `validate()` assumes it has been promoted into `storage_options`.
-
-**Risk:** Any code path that evaluates `storage_library` before or outside `LoadConfig()` (e.g., a custom runner that builds `ConfigArguments` by hand) will see `None`. Additionally, the Hydra CLI override path is ambiguous — both `++workload.storage.storage_library=s3dlio` (promoted by LoadConfig) and `++workload.storage.storage_options.storage_library=s3dlio` (direct) work, but neither is documented clearly, and users who pass the wrong one get an opaque `None` check failure.
-
-**Recommendation:** Add `storage_library: str = ""` as a first-class field on `ConfigArguments`, read it directly in `validate()` from `self.storage_library`, and have `LoadConfig()` populate it without the promotion workaround.
-
-### 6b. `validate()` Called Before File List Is Available
-
-`derive_configurations()` (which calls `validate()`) is called twice:
-1. During `DataGenerator.__init__()` with no file lists (the generator-only early path)
-2. During `DLIOBenchmark.initialize()` after the storage walk
-
-On the first call, credential checks and `storage_library` validation run even when the run is a pure file-system operation. More importantly, some validation branches (e.g., NPZ reader import checks) are exercised before it is clear whether object storage will actually be used. This is harmless when it works but adds unnecessary error surface for misconfigured environments.
-
-**Recommendation:** Separate `validate_storage()` (called early, storage-type-aware) from `validate_workload()` (called after file lists are known). Only run credential checks when `storage_type == StorageType.S3`.
-
-### 6c. `multiprocessing_context` Couples to `storage_library` But Lives in `reader:`
-
-The `multiprocessing_context` key lives under `reader:` but its correct value depends entirely on the storage backend:
-
-| Storage library | Required `multiprocessing_context` | Reason |
-|---|---|---|
-| `local_fs` / `minio` | `fork` (default) | No async runtime in worker |
-| `s3dlio` | `spawn` | Tokio runtime destroyed by fork |
-| `s3torchconnector` | `spawn` | Background S3 threads destroyed by fork |
-
-This coupling is currently enforced only through comments in the YAML files. If a user copies a file-backend YAML and adds an s3dlio storage section without updating the reader section, all S3 reads will silently hang (the Tokio runtime is dead in the forked child). There is no runtime warning or error.
-
-**Recommendation:** In `derive_configurations()`, after `storage_library` is known, automatically set `self.multiprocessing_context = "spawn"` if the library is `s3dlio` or `s3torchconnector`, with a warning if the YAML had explicitly set `fork`. This makes the constraint self-enforcing rather than documentation-dependent.
-
-### 6d. Hardcoded Endpoint URIs in YAML Files
-
-The lab IP `https://172.16.1.40:9000` appears hardcoded in every object-storage YAML:
-```yaml
-storage_options:
-  endpoint_url: https://172.16.1.40:9000
-```
-
-This makes every object-storage YAML file **environment-specific** — they fail immediately in any other environment (CI, cloud, different lab). It also means the same model config cannot be shared across teams without edits.
-
-**Recommendation:** Use environment variable resolution for all connection properties. Hydra supports `${oc.env:AWS_ENDPOINT_URL}` interpolation. Alternatively, treat `endpoint_url` as a required CLI override with no default, so the YAML template contains a clearly-marked placeholder:
-```yaml
-storage_options:
-  endpoint_url: ???   # Required: set via ++workload.storage.storage_options.endpoint_url=...
-```
-
-### 6e. `build_sample_map_iter()` Bug — Concrete Description
-
-For completeness, here is the exact effect of the file-index tracking bug (Section 2b) with a worked example:
-
-Given 8 files, 2 ranks, 4 files per rank, `num_samples_per_file=1`:
-- Rank 1: `files_per_rank = (8 // 2) % 8 = 4`. `file_index` starts at `1 * 4 = 4` (correct, pointing to file[4]).
-- First iteration: reads `file[4]` ✅
-- After first sample: `sample_index = 1`, `file_index = (1 // 1) % 8 = 1` — now pointing to `file[1]` ❌ (should be `file[5]`)
-- All subsequent samples for rank 1 iterate through `file[1], file[2], file[3], …` — the same file range as rank 0.
-
-Both ranks read overlapping files, meaning the benchmark double-counts throughput from the same data and misses the upper half of the dataset entirely. The PyTorch index path (`get_global_map_index()`) does not have this bug. TFRecord workloads using the `ITERATIVE` sampler are affected.
-
-### 6f. No Settle Delay or Retry Before Directory Walk in `initialize()`
-
-In `DLIOBenchmark.initialize()`, when `generate_data=True`, all ranks barrier after generation:
-```python
-self.data_generator.generate()
-self.comm.barrier()   # ← correct
-```
-
-But then rank 0 immediately proceeds to `storage.walk_node()` inside the same `initialize()` call (after the barrier) to build `file_list_train`. On object stores with eventual-consistency semantics (or NFS with attribute caching), newly written objects may not yet be visible to a listing. There is no retry or wait logic before the walk. If the walk returns fewer files than expected, a hard exception is raised.
-
-**Recommendation:** Add a configurable `post_generation_settle_seconds` (default 0) with a rank-0 sleep + broadcast before the walk when `storage_type != local_fs`.
-
----
-
-## 7. YAML Config Proliferation Analysis
-
-### 7a. Current State
-
-The `configs/dlio/workload/` directory contains **49 YAML files** for what is effectively a small matrix of orthogonal dimensions:
-
-| Dimension | Values |
-|---|---|
-| Model / workload | unet3d, resnet50, cosmoflow, llama3_8b, dlrm, flux, retinanet |
-| Storage backend | local_fs, s3+minio, s3+s3dlio, s3+s3torchconnector |
-| Phase | datagen only, train only, checkpoint only, train+checkpoint |
-| Scale | a100, h100, b200, mi355, 1t, 405b, 70b, 8b |
-
-The current approach creates one YAML per _combination_. For a single model (unet3d h100), this already produces 7 files:
-
-```
-unet3d_h100.yaml                 ← file, train
-unet3d_h100_minio.yaml           ← minio, train
-unet3d_h100_minio_datagen.yaml   ← minio, datagen
-unet3d_h100_s3dlio.yaml          ← s3dlio, train
-unet3d_h100_s3dlio_datagen.yaml  ← s3dlio, datagen
-unet3d_h100_s3torch.yaml         ← s3torchconnector, train
-unet3d_h100_s3torch_datagen.yaml ← s3torchconnector, datagen
-```
-
-Similarly, llama3_8b generates 4 files; cosmoflow, resnet50, flux, retinanet, dlrm create additional duplicates. This pattern scales as `O(models × libraries × phases)`.
-
-### 7b. What Differs Between Files — and What Doesn't
-
-Comparing the three unet3d-h100 training variants (minio / s3dlio / s3torch), **the only fields that differ** are:
-```yaml
-# Differs:
-storage.storage_library:    minio | s3dlio | s3torchconnector
-storage.storage_root:       mlp-minio | mlp-s3dlio | mlp-s3torch
-storage_options.endpoint_url:  # same IP, but separate bucket implies separate data staging
-reader.multiprocessing_context:  spawn  # same for all three object store variants
-# s3dlio only:
-storage_options.s3_force_path_style: true
-# minio only:
-storage_options.secure: false
-```
-
-**Everything else is identical**: model definition, framework, dataset sizes, record lengths, train epochs, computation time, batch size, read threads, shuffle settings, metric target AU.
-
-The datagen variants differ from the train variants only in:
-```yaml
-workflow.generate_data: True   # vs False
-workflow.train: False          # vs True
-```
-
-### 7c. Root Causes of the Explosion
-
-1. **No config composition**: Hydra supports config groups (sub-directories with named YAML slices that can be composed), but the current setup uses a flat directory of monolithic files. There is no `defaults:` list or group structure.
-
-2. **Storage connection params are baked in**: The endpoint URL and bucket name are specific to a single lab, making every file non-portable. Portable configs require parameterizing these, which currently gets done by forking.
-
-3. **`workflow.generate_data` / `workflow.train` are toggled by file, not CLI**: Users fork the YAML to change phase rather than passing `++workload.workflow.generate_data=True` on the command line.
-
-4. **`storage_library` is not a CLI-first parameter**: The library choice (minio vs s3dlio vs s3torchconnector) is buried inside the YAML, requiring a separate file per library instead of a single override on the command line.
-
----
-
-## 8. Proposed YAML Config Architecture
-
-### 8a. Principle: Separate What Changes from What Doesn't
-
-The YAML files should capture stable model/workload facts (architecture, dataset sizes, target AU, epoch count, computation time). Storage backend and connection details should be supplied at runtime via CLI overrides or a small environment-local override file.
-
-### 8b. Recommended Directory Structure (Hydra Config Groups)
-
-```
-configs/dlio/
-  config.yaml          ← top-level Hydra config with defaults list
-  workload/
-    models/            ← config group: model + dataset + training params
-      unet3d_h100.yaml
-      resnet50_a100.yaml
-      cosmoflow_a100.yaml
-      llama3_8b.yaml
-      dlrm_b200.yaml
-      flux_b200.yaml
-      retinanet_b200.yaml
-    storage/           ← config group: storage backend templates
-      file.yaml        ← local_fs, no credentials required
-      s3_minio.yaml    ← s3 + minio SDK, endpoint_url = ???
-      s3_s3dlio.yaml   ← s3 + s3dlio, endpoint_url = ???
-      s3_s3torch.yaml  ← s3 + s3torchconnector, endpoint_url = ???
-    workflow/          ← config group: what phases to run
-      train.yaml       ← generate_data: False,  train: True
-      datagen.yaml     ← generate_data: True,   train: False
-      checkpoint.yaml  ← generate_data: False,  train: False, checkpoint: True
-      full.yaml        ← generate_data: True,   train: True
-```
-
-A model file (`models/unet3d_h100.yaml`) would contain only stable facts:
-```yaml
-# configs/dlio/workload/models/unet3d_h100.yaml
-model:
-  name: unet3d
-  type: cnn
-  model_size: 499153191
-
-framework: pytorch
-
-dataset:
-  data_folder: test-run/unet3d     # relative path within bucket or filesystem root
-  format: npz
-  num_files_train: 168
-  num_samples_per_file: 1
-  record_length_bytes: 146600628
-  record_length_bytes_stdev: 68341808
-  record_length_bytes_resize: 2097152
-
-reader:
-  data_loader: pytorch
-  batch_size: 7
-  read_threads: 4
-  file_shuffle: seed
-  sample_shuffle: seed
-
-train:
-  epochs: 5
-  computation_time: 0.323
-
-checkpoint:
-  checkpoint_folder: checkpoints/unet3d
-  checkpoint_after_epoch: 5
-  epochs_between_checkpoints: 2
-
-metric:
-  au: 0.90
-```
-
-A storage template (`storage/s3_s3dlio.yaml`) would contain backend facts with required fields explicitly marked:
-```yaml
-# configs/dlio/workload/storage/s3_s3dlio.yaml
-storage:
-  storage_type: s3
-  storage_library: s3dlio
-  storage_root: ???                  # Required: bucket name, set via CLI
-  storage_options:
-    endpoint_url: ???                # Required: set via ++workload.storage.storage_options.endpoint_url=
-    region: us-east-1
-    s3_force_path_style: true
-
-reader:
-  multiprocessing_context: spawn     # Required for s3dlio — Tokio is fork-unsafe
-```
-
-### 8c. Command-Line Patterns for Runtime Switching
-
-With this structure, switching backends requires only CLI overrides — no new YAML files:
-
-**File-backend training:**
-```bash
-dlio_benchmark \
-  workload=models/unet3d_h100 \
-  ++workload.storage.storage_type=local_fs \
-  ++workload.storage.storage_root=/mnt/scratch/dlio-data \
-  ++workload.workflow.generate_data=False \
-  ++workload.workflow.train=True
-```
-
-**Object storage with s3dlio:**
-```bash
-dlio_benchmark \
-  workload=models/unet3d_h100 \
-  ++workload.storage.storage_type=s3 \
-  ++workload.storage.storage_library=s3dlio \
-  ++workload.storage.storage_root=mlp-s3dlio \
-  ++workload.storage.storage_options.endpoint_url=${AWS_ENDPOINT_URL} \
-  ++workload.workflow.train=True
-```
-
-**Switch to minio on the same command, same model:**
-```bash
-# Change only storage_library and storage_root (bucket name)
-... ++workload.storage.storage_library=minio \
-    ++workload.storage.storage_root=mlp-minio \
-    ++workload.reader.multiprocessing_context=fork
-```
-
-**Datagen-only, then train:**
-```bash
-# Step 1: generate
-dlio_benchmark workload=models/unet3d_h100 \
-  ++workload.storage.storage_type=s3 \
-  ++workload.storage.storage_library=s3dlio \
-  ++workload.storage.storage_root=mlp-s3dlio \
-  ++workload.storage.storage_options.endpoint_url=${AWS_ENDPOINT_URL} \
-  ++workload.workflow.generate_data=True \
-  ++workload.workflow.train=False
-
-# Step 2: train (identical flags, flip workflow)
-dlio_benchmark workload=models/unet3d_h100 \
-  ... \
-  ++workload.workflow.generate_data=False \
-  ++workload.workflow.train=True
-```
-
-### 8d. Environment-Local Override File (Alternative to Shell Functions)
-
-For teams with a fixed endpoint, a local override file can be sourced by Hydra without committing credentials to the repo:
-
-```yaml
-# configs/dlio/local.yaml  (gitignored)
-defaults:
-  - override storage: s3_s3dlio
-
-storage:
-  storage_root: my-bucket
-  storage_options:
-    endpoint_url: https://my-minio.internal:9000
-```
-
-Then run:
-```bash
-dlio_benchmark +local=local workload=models/unet3d_h100 ++workload.workflow.train=True
-```
-
-### 8e. Impact on File Count
-
-Under the proposed structure, the 7 unet3d-h100 files collapse to 1 model file plus 3 reusable storage templates (shared by all models). Across the full matrix of 7 models × 3 object libraries × 2 phases (42 possible combinations, of which ~30 object-storage YAML files exist today), the set collapses to 7 model files + 3 storage templates + 3 workflow files = **13 files total** — a ~55% reduction from the ~30 existing files (~70% relative to the full 42-file matrix), and all storage templates are shared across models.
-
-### 8f. Short-Term Mitigation (No Refactor Required)
-
-If the full Hydra config-group refactor is not immediately feasible, the proliferation can be stopped without changing existing files:
-
-1. **Stop adding `_minio.yaml`, `_s3dlio.yaml`, `_s3torch.yaml` variants.** Document the override pattern in `README_S3DLIO_CONFIGS.md` instead.
-2. **Remove hardcoded IPs** from existing YAML files. Replace with `???` (Hydra's "required, no default" sentinel) and add `endpoint_url` to the run instructions.
-3. **Add a shared `storage/` config group** with the three library templates. New models only need a model YAML; storage is composed at runtime.
-4. **Derive `multiprocessing_context`** automatically from `storage_library` in `derive_configurations()` to remove the hidden coupling.
-5. **Phase switches via CLI**: Add one-line documentation showing `++workload.workflow.generate_data=True` so users stop forking YAML files to change only the phase.
-
----
-
-## 9. The Core Principle: This Is a Storage Benchmark, Not a Data Processing Benchmark
-
-### 9a. The Design Intent Is Correct — But the Implementation Is Incomplete
-
-The codebase already recognises that decoded data has no value. In `reader_handler.py`, every read path ends with:
-
-```python
-# FormatReader.next()  and  FormatReader.read_index()
-...
-self.get_sample(filename, sample_index)   # reads + decodes file
-self.preprocess()
-return self._args.resized_image           # ← decoded data is THROWN AWAY here
-```
-
-`resized_image` is a **single random tensor**, allocated once at startup in `derive_configurations()`:
-
-```python
-self.resized_image = gen_random_tensor(
-    shape=self.transformed_record_dims,
-    dtype=self.transformed_record_element_dtype, rng=rng)
-```
-
-Every reader, every format, every sample in every epoch returns this same pre-allocated buffer. The content of what was read from storage is irrelevant by design. The benchmark measures how fast the storage can deliver bytes — not what those bytes mean.
-
-The TFRecord reader already honours this principle fully: `_parse_image()` returns `self._resized_image` without touching the raw bytes at all. The S3 iterable readers (`image_reader_s3_iterable.py`, `hdf5_reader_s3_iterable.py`, `tfrecord_reader_s3_iterable.py`) store only byte counts for telemetry, never decoded arrays.
-
-**The problem is that for local-filesystem readers and all generators, the code does substantial CPU-intensive data transformation work whose only output is a buffer that is immediately discarded.** Every CPU cycle spent on JPEG entropy coding, PIL decoding, protobuf serialization, or zlib compression is overhead injected into a storage benchmark that doesn't need it.
-
-### 9b. Generator Overhead by Format
-
-| Format | Generation work | Relevant to storage? | CPU cost |
-|---|---|---|---|
-| JPEG | `gen_random_tensor` → `PIL.fromarray` → `img.save(format='JPEG')` (DCT + quantize + Huffman) | ❌ | High: 10–60 ms/file |
-| PNG | `gen_random_tensor` → `PIL.fromarray` → `img.save(format='PNG')` (Deflate lossless) | ❌ | Very high: 30–200 ms/file |
-| NPY | `gen_random_tensor` (dgen-py) → `np.save()` (raw binary dump) | ✅ Near-minimal | Low: < 1 ms/file |
-| NPZ (no compression) | `gen_random_tensor` → `np.savez()` (ZIP container, stored mode) | ✅ Near-minimal | Low |
-| NPZ (zip compression) | `gen_random_tensor` → `np.savez_compressed()` (ZIP+Deflate) | ❌ | Medium–high: zlib per file |
-| HDF5 (no compression) | `gen_random_tensor` → h5py metadata + raw dataset write | Mostly ✅ | Low–medium |
-| HDF5 (gzip) | + GZIP compression per dataset | ❌ | Medium–high |
-| TFRecord | `gen_random_tensor` → `tf.train.Example` → `SerializeToString()` per sample | ❌ partial | Medium: protobuf serialize |
-| CSV | `gen_random_tensor` → `pd.DataFrame.to_csv()` (text encode + float formatting) | ❌ | Medium: text serialization |
-| IndexedBinary | `gen_random_tensor` → MPI-IO raw byte write | ✅ Minimal | Low |
-| Synthetic | single integer written as UTF-8 string | ✅ Minimal | Negligible |
-
-**JPEG and PNG are the worst offenders** because the encoder is CPU-bound and irreversibly entangled in the format: there is no way to construct a valid JPEG or PNG without running the compression algorithm, because the file format *is* the compressed output.
-
-### 9c. Reader Overhead by Format (Local Filesystem Path)
-
-| Format | Reader `open()` / `get_sample()` work | Decoded data used? | CPU cost |
-|---|---|---|---|
-| JPEG/PNG (`ImageReader`) | `PIL.Image.open()` + `np.asarray()` — full entropy decode | ❌ Discarded | High: 5–20 ms/file |
-| NPY (`NPYReader`) | `np.load()` — mmap or full array load | ❌ Discarded | Low–medium |
-| NPZ (`NPZReader`) | `np.load()['x']` — ZIP inflate + array load | ❌ Discarded | Medium |
-| HDF5 (`HDF5Reader`) | `h5py.File()` + `dataset[sample_index]` — HDF5 chunk read + numpy convert | ❌ Discarded | Low–medium |
-| TFRecord (`TFReader`) | raw bytes streamed by tf.data, `_parse_image()` returns `resized_image` directly | ✅ Already bypassed | None |
-| S3 iterable readers | raw bytes fetched, byte count stored for telemetry | ✅ Already bypassed | None |
-
-The S3 iterable readers represent the correct pattern. They are documented explicitly:
-
-> *"No PIL or numpy decode is performed. DLIO's FormatReader.next() yields a pre-allocated random tensor regardless of file contents; only the byte count is needed for the image_size telemetry metric."*
-> — `image_reader_s3_iterable.py` docstring
-
-The local-filesystem equivalents do not apply the same logic.
-
-### 9d. Where Time Actually Goes in an End-to-End JPEG Benchmark Run
-
-For a single 224×224 JPEG file on a local NFS filesystem:
-
-**Generation (once):**
-```
-dgen_py random bytes:    ~0.01 ms   (fast Rust PRNG, zero-copy)
-PIL.fromarray():         ~0.5 ms    (copies bytes into PIL Image object)
-img.save(JPEG):         ~20–40 ms  (DCT + quantization + Huffman coding)
-write() syscall:         ~0.1 ms    (kernel buffer, NFS async)
-Total per file:         ~21–41 ms  — 98% is the JPEG encoder
-```
-
-**Reading (every training step, every epoch):**
-```
-open() syscall:          ~1 ms      (NFS RTT)
-read() syscall:          ~0.01 ms   (115 KB at 10 GiB/s)
-PIL.Image.open():        ~5–15 ms   (JPEG entropy decode + YCbCr→RGB)
-np.asarray():            ~0.5 ms    (copy into numpy)
-resized_image returned:  decoded array discarded
-Total per file:         ~7–17 ms  — storage I/O (~1 ms) is only ~6–15% of total time
-```
-
-The storage benchmark is spending more time on JPEG decode during reading than on actual I/O. The encode during generation (~20–40 ms) is 200–400× the storage write time (~0.1 ms).
-
-### 9e. The Non-Negotiable Constraint: Every File Must Contain Unique Bytes
-
-Before discussing any optimisation, one constraint must be stated explicitly:
-
-**Every generated file must contain content that is unique across the entire dataset. Reusing the same byte sequence across multiple files is a fundamental correctness error for a storage benchmark.**
-
-Modern storage systems — enterprise NAS arrays (NetApp, Vast Data, Pure Storage), object stores, and distributed file systems — routinely apply inline deduplication and compression. If two files have identical byte content, a deduplicating storage system stores only one physical copy, regardless of how many logical files are created. A benchmark that writes N files containing identical bytes is not measuring how fast the storage can absorb N files of unique data — it is measuring how fast the dedup engine can detect and discard duplicates. The measured throughput may be orders of magnitude higher than true storage write throughput, producing completely meaningless results.
-
-**The template-clone approach described in an earlier draft of this document was categorically wrong and has been withdrawn.** Writing the same pre-encoded JPEG bytes to every file would collapse 1.28 million "distinct" training images to a single unique 115 KB block in any deduplicating storage system. That is not a storage benchmark.
-
-The same logic applies to any "pre-compute one serialized blob and copy it N times" shortcut for any format. The byte content of every file must be independently unique.
-
-### 9f. dgen-py: The Correct Foundation for All Data Generation
-
-The correct solution to the CPU overhead problem is already present in the codebase: `gen_random_tensor()` backed by **dgen-py**, a zero-copy Rust-backed PRNG library written specifically for this project.
-
-Key properties that matter here:
-
-- **Speed**: ~155× faster than NumPy random generation. For a 224×224×3 uint8 array (150,528 bytes), dgen-py generates the raw bytes in < 0.01 ms, versus ~1.5 ms for NumPy.
-- **Uniqueness**: every call with a different seed produces a statistically independent, non-repeating byte stream. Since `_generate_files()` uses a flowing RNG that advances per file (`seed = int(rng.integers(0, 2**63))`), every file gets a unique seed → unique bytes.
-- **Zero-copy**: dgen-py returns a `BytesView` implementing the buffer protocol. `np.frombuffer(bytesview, dtype=dtype)` consumes it without an intermediate allocation.
-- **Scalability**: because the bytes are generated in Rust with SIMD, generation throughput exceeds 50 GiB/s on modern CPUs — faster than any storage device can accept data.
-
-**dgen-py must be used for all new data generation, for all formats, without exception.** It is already wired into `gen_random_tensor()` and therefore already active for every format that calls it. The critical requirement is that no code path reuses byte content across file boundaries.
-
-For the formats where generation work is proportional to storage size (NPY, IndexedBinary, HDF5 without compression), the pipeline is already correct:
-
-```
-dgen-py (unique bytes, < 0.01 ms per file) →  write() syscall to storage
-```
-
-dgen-py is the bottleneck only if the benchmark needs to generate faster than ~50 GiB/s per core, which exceeds every real storage system's ingestion bandwidth.
-
-### 9g. JPEG/PNG: Do Files Need to Be ACTUALLY Valid Images?
-
-The short answer: **it depends entirely on which data loader is configured.**
-
-This is the key question for generation cost. If files do not need to be valid JPEG/PNG bitstreams, the generator can write raw dgen-py bytes directly — no PIL, no DCT, no Huffman coding — reducing generation from ~20–40 ms/file to < 0.01 ms/file. That is a 2000–4000× speedup.
-
-#### When valid JPEG/PNG is required: DALI and NATIVE_DALI data loaders
-
-`dali_image_reader.py` constructs a DALI pipeline that calls:
-
-```python
-images = fn.decoders.image(images, device='cpu')   # line 80
-```
-
-`fn.decoders.image()` is NVIDIA's GPU/CPU image decoder. It requires a syntactically valid JPEG or PNG bitstream. It will throw an error on random bytes, even if preceded by a correct-looking header. When `data_loader_type` is `dali` or `native_dali`, files MUST be valid images and PIL encoding is unavoidable.
-
-#### When valid JPEG/PNG is NOT required: all other data loaders
-
-The S3 iterable readers (`ImageReaderS3Iterable`) already prove this. They fetch raw bytes from object storage, record `len(raw_bytes)` for telemetry, and never call `PIL.Image.open()`. The benchmark runs correctly with files that contain arbitrary bytes — the format name attached to those bytes is irrelevant because the reader never decodes them.
-
-After the Section 9h fix (replacing `PIL.Image.open()` with `open(filename, 'rb').read()` in the local-filesystem `ImageReader`), the same is true for all non-DALI paths:
-
-| Data loader | Reader decodes image? | Files must be valid JPEG/PNG? |
-|---|---|---|
-| `pytorch` / `tensorflow` (local FS, current) | YES — `PIL.Image.open()` | YES (current) |
-| `pytorch` / `tensorflow` (local FS, after 9h fix) | NO — raw byte read | **NO** |
-| any (S3 iterable readers, already shipped) | NO — raw byte read | **NO** |
-| `dali` / `native_dali` | YES — `fn.decoders.image()` | **YES, always** |
-
-#### The consequence for generators: branch on `data_loader_type`
-
-For non-DALI paths, `JPEGGenerator` and `PNGGenerator` can write raw dgen-py bytes directly, with no PIL pipeline at all:
-
-```python
-def _write(i, dim_, dim1, dim2, file_seed, rng, out_path_spec, is_local, output):
-    if self._args.data_loader_type in (DataLoaderType.DALI, DataLoaderType.NATIVE_DALI):
-        # DALI pipeline calls fn.decoders.image() — must produce valid JPEG
-        records = gen_random_tensor(shape=(dim1, dim2), dtype=np.uint8, rng=rng)
-        img = PIL.Image.fromarray(np.clip(records, 0, 255).astype(np.uint8))
-        img.save(output, format='JPEG', quality=75)
-    else:
-        # Reader reads raw bytes and discards them — any bytes work
-        raw = gen_random_tensor(shape=(dim1 * dim2 * 3,), dtype=np.uint8, rng=rng)
-        output.write(raw.tobytes())
-```
-
-For the non-DALI branch the generation pipeline collapses to:
-
-```
-dgen-py (unique bytes, < 0.01 ms) → write() syscall to storage
-```
-
-This is identical to NPY generation. The "irreducible cost" of JPEG/PNG format disappears entirely for non-DALI configurations.
-
-#### File size note
-
-Raw dgen-py bytes for a 224×224×3 uint8 image = 150,528 bytes (~150 KB). A real JPEG of the same image is typically 50–115 KB (roughly 1.3:1–3:1 compression). The raw format therefore produces LARGER files than real JPEGs (about 1.3–3× the size). For a storage benchmark, larger files per sample means more I/O per batch — a more conservative (harder) test. This is acceptable. The `record_length` field in the benchmark config controls expected size; if exact size matching is needed, the raw write can be padded or truncated to `record_length` bytes.
-
-#### Remaining mitigations for the DALI path
-
-When `data_loader_type: dali` is configured, PIL encoding is unavoidable. The applicable mitigations are:
-
-1. **Lower JPEG quality.** `quality=10` encodes at 3–5× speed compared to `quality=75`. Files are still valid, unique JPEG bitstreams.
-2. **Parallel intra-rank encoding via `ThreadPoolExecutor`.** PIL's JPEG encoder releases the Python GIL, so using 4–8 threads per rank reduces wall-clock time roughly proportionally.
-3. **Use NPY or HDF5 for pure storage benchmarks.** DALI supports NPY input natively. If the goal is to measure storage bandwidth/IOPS rather than to simulate a specific vision training pipeline, switch formats. NPY generation is already fast and the benchmark result is equivalent.
-
-**YAML warning recommendation for any JPEG/PNG config that uses `data_loader_type: dali`:**
-
-```yaml
-# WARNING: DALI data loader requires valid JPEG files (fn.decoders.image() is a real decoder).
-# Generation cost: ~20-40ms/file (PIL JPEG encode). For faster generation with equivalent
-# storage I/O measurement, use data_loader_type: pytorch with NPY format instead.
-```
-
-### 9h. Reader Overhead: The Fix That Is Already Half-Done
-
-The S3 iterable readers already apply the correct pattern: fetch raw bytes, record the byte count for telemetry, discard the bytes, return `resized_image`. The local-filesystem `ImageReader` does not; it decodes the full JPEG via PIL.
-
-The raw-byte-read fix for `ImageReader.open()` is valid and does not introduce any deduplication concern — the storage read is still a real read of the on-disk file (unique bytes are fetched); only the subsequent CPU decode is skipped:
-
-```python
-# Proposed replacement for ImageReader.open()
-def open(self, filename):
-    with open(filename, 'rb') as f:
-        raw = f.read()
-    return len(raw)   # byte count for telemetry, like ImageReaderS3Iterable
-
-def get_sample(self, filename, sample_index):
-    byte_count = self.open_file_map[filename]
-    dlp.update(image_size=byte_count)
-    dft_ai.update(image_size=byte_count)
-```
-
-This eliminates 5–20 ms of PIL decode overhead per sample from the training-step timing. The storage I/O — the thing being measured — is unchanged.
-
-**NPZ/HDF5 with Compression:**
-The same principle applies to read decompression. When `compression=gzip` or `compression=zip` is enabled, the reader spends significant CPU time inflating data that is then discarded. These settings should default to `none`:
-
-```
-WARNING at startup when compression != NONE:
-"compression=<X> is enabled. Benchmark will include CPU decompression 
-in timings, not pure storage bandwidth. Set compression=none for 
-accurate storage performance measurement."
-```
-
-### 9i. Summary of Corrected Recommendations
-
-| Issue | Correct Action | Incorrect Action (Do Not Do) |
-|---|---|---|
-| JPEG/PNG generation with non-DALI data loaders | Write raw dgen-py bytes directly — no PIL, no DCT, no Huffman; generation drops from ~30 ms/file to < 0.01 ms/file | Always run PIL encode regardless of whether the reader decodes the file |
-| JPEG/PNG generation with DALI / NATIVE_DALI | PIL encode is unavoidable (`fn.decoders.image()` is a real image decoder); use `quality=10` + `ThreadPoolExecutor` | Treat DALI path the same as non-DALI and write raw bytes — DALI will throw an error on invalid bitstream |
-| TFRecord per-sample protobuf serialization | Use dgen-py for each sample's raw bytes (already done); accept protobuf overhead as format cost | Pre-compute one `Example` blob and replicate it — produces N logically distinct but physically identical records |
-| `ImageReader.open()` decodes JPEG to discard | Read raw bytes, store byte count (like `ImageReaderS3Iterable`) | Skip the storage read entirely — would produce an I/O-free benchmark |
-| NPZ/HDF5 compression adds CPU overhead | Default `compression: none`; warn at startup when enabled | Add compression without warning — benchmark silently measures CPU, not storage |
-| CSV format for storage benchmarking | Document as not recommended; prefer NPY/IndexedBinary | Add multi-format CSV confusion |
-| JPEG/PNG for large-scale storage benchmarks with DALI | Document as "inherently generation-slow on DALI path"; recommend NPY/HDF5 for pure I/O testing | Use JPEG/PNG + DALI for billion-file benchmarks where generation time dominates |
-| All data generation must use dgen-py | `gen_random_tensor()` via `_generate_files()` already does this — enforce as mandatory, no exceptions | Use `np.zeros`, `np.ones`, or any repeated constant — these produce identical content across files |
-
----
-
-## 10. Small-File Workload Pathologies (JPEG / PNG)
-
-### 10a. What "Small File" Means Here
-
-JPEG and PNG formats always store exactly one sample per file (`num_samples_per_file = 1`). Typical sizes:
-
-| Workload | Image size | File size |
-|---|---|---|
-| ImageNet-1K (resnet50) | 224 × 224 × 3 | ~50–150 KB |
-| CIFAR-10 | 32 × 32 × 3 | ~2–5 KB |
-| Custom satellite / medical | 512 × 512 × 1 | ~100–500 KB |
-
-Unlike TFRecord, HDF5, or NPZ — which pack hundreds or thousands of samples into one file, amortising open/stat/read latency across many samples — every JPEG/PNG access is a full open → read → decode → close cycle for a single sample. This makes the number of IOPS required proportional to the sample count, not the batch count.
-
-### 10b. Data Generation Bottleneck
-
-`_generate_files()` in `data_generator.py` drives every format generator. Its core loop is:
-
-```python
-for i in range(self.my_rank, int(self.total_files_to_generate), self.comm_size):
-    write_fn(i, dim_, dim1, dim2, ...)   # serial within rank
-```
-
-There is no thread pool, no `asyncio`, no `concurrent.futures`. Each call to `write_fn` must complete before the next begins.
-
-For JPEG and PNG, `write_fn` is:
-
-```python
-# jpeg_generator.py
-img = Image.fromarray(arr.astype('uint8'), mode='RGB')
-img.save(output, format='JPEG')       # CPU-bound encode, ~10–60 ms
-
-# png_generator.py
-img = Image.fromarray(arr.astype('uint8'), mode='RGB')
-img.save(output, format='PNG')        # CPU-bound lossless encode, ~30–200 ms
-```
-
-PIL's JPEG and PNG encoders are single-threaded inside each call. JPEG encode at quality 75 typically runs 15–40 ms for a 224×224×3 image on a modern core; PNG is 2–5× slower due to lossless compression.
-
-**Concrete example — ImageNet-scale dataset (1.28 M files) with NP=8:**
-
-| Metric | Value |
-|---|---|
-| Total files | 1,280,000 |
-| Files per rank (`N / np`) | 160,000 |
-| Encode time (JPEG, 30 ms/file) | 160,000 × 0.030 s ≈ **80 min per rank** |
-| Encode time (PNG, 100 ms/file) | 160,000 × 0.100 s ≈ **4.4 hours per rank** |
-| Storage write time (100 KB, 1 GiB/s NFS) | 160,000 × 0.0001 s ≈ **16 s** — negligible |
-
-The bottleneck is not I/O bandwidth — it is pure CPU time for compression. Because each rank is serial, adding more MPI ranks scales generation linearly, but the per-rank CPU time remains unchanged. Doubling NP from 8 to 16 halves the wall-clock time, but only by adding 8 more processes. There is no intra-rank parallelism to exploit the spare CPU cores that sit idle while one thread encodes.
-
-**Contrast with `hdf5_generator.py` and `npy_generator.py`:** NumPy native binary format saves raw memory-mapped arrays at speeds limited only by storage bandwidth (often 1–5 GiB/s per rank). JPEG/PNG generation is an order of magnitude slower for the same logical data volume.
-
-### 10c. Data Reading Bottleneck
-
-`image_reader.py` uses PIL to read files:
-
-```python
-def open(self, filename):
-    # called once per sample, from a DataLoader worker process
-    img = Image.open(filename)
-    data = np.asarray(img)
-    self.open_file_map[filename] = data
-```
-
-Each call is a separate system-level open → read → JPEG decode → numpy conversion. There is no read-ahead, no batch opening, and no memory pooling across calls.
-
-**Throughput ceiling for `read_threads=1` (the default):**
-
-On NFS (RTT ~1 ms, bandwidth ~10 GiB/s), each file fetch is dominated by per-request latency:
-
-- Per-file time ≈ RTT + file_size/bandwidth = 1 ms + (115 KB / 10 GiB/s) ≈ 1.01 ms
-- Maximum IOPS ≈ 990 files/sec
-- Throughput ≈ 990 × 115 KB ≈ **114 MiB/s** — with 10 GiB/s of available bandwidth **98.9% idle**
-
-With `read_threads=8`:
-
-- 8 concurrent opens → 8 simultaneous RTTs → IOPS ≈ 7,920 → **912 MiB/s** — still only 9% of NFS bandwidth
-
-With `read_threads=32`:
-
-- 32 concurrent opens → IOPS ≈ 31,680 → **3.6 GiB/s** — 36% of NFS bandwidth
-
-The practical takeaway: **IOPS, not bandwidth, is the binding constraint for small-file JPEG/PNG reading**. The optimal `read_threads` value is `ceil(target_throughput / (file_size / bandwidth) + RTT * target_IOPS)`, which for typical deployments means 16–64 threads per rank, not the default of 1.
-
-### 10d. No Aggregated-Access Path
-
-Frameworks such as WebDataset, FFCV, and TFRecord address this problem by grouping many samples into sequential tar or binary shards. A single large sequential read then yields many samples, converting the random-IOPS problem into a streaming-bandwidth problem. DLIO has no sharding path for JPEG or PNG: every benchmark run, at every scale, reads each sample as an individual file. This is by design for the benchmark (measuring actual per-file I/O cost), but it means:
-
-1. Any benchmark result with JPEG/PNG and `read_threads` < 16 is almost certainly I/O starved, not representative of storage peak capability.
-2. Results should always report `read_threads × comm_size` (total concurrent I/O streams) alongside throughput.
-
-### 10e. Sub-folder Namespace
-
-`num_subfolders_train` distributes files across sub-directories, reducing directory listing time on large NFS servers. It does not change the fundamental one-file-per-open access pattern. For datasets with > 100 K JPEG/PNG files, sub-folders are necessary to avoid NFS `readdir` stalls, but are not sufficient to close the throughput gap.
-
----
-
-## 11. `read_threads` — Fixed YAML Value vs. Runtime-Adaptive Sizing
-
-### 11a. Current Behaviour
-
-`read_threads` is defined in `ConfigArguments`:
-
-```python
-read_threads: int = 1     # dlio_benchmark/utils/config.py
-```
-
-It is set at YAML-load time (before MPI is initialized) and passed verbatim to PyTorch `DataLoader(num_workers=read_threads)`. The only runtime check is in `validate()`:
-
-```python
-if self.read_threads > 1:
-    cores_available = len(psutil.Process().cpu_affinity())
-    if cores_available < self.read_threads:
-        self.logger.warning(...)    # logs a warning, zero action taken
-```
-
-Validation checks the pinned CPU set of the current process, not the actual core count divided by ranks per node. It never modifies `read_threads`, caps it, or auto-computes a value. DLIOMPI's `npernode()` and `nnodes()` are never consulted from `config.py`.
-
-The `prefetch_factor` fed to PyTorch DataLoader is:
-
-```python
-prefetch_factor = math.ceil(self._args.prefetch_size / self._args.read_threads)
-```
-
-This means that changing `read_threads` without correspondingly adjusting `prefetch_size` silently changes prefetch aggressiveness, which affects memory consumption and training-step hide latency in ways that are not visible in the YAML.
-
-### 11b. Thread Budget Analysis Across Deployment Scales
-
-When `read_threads = 8` is hardcoded (as in `resnet50_a100.yaml`), the total DataLoader worker processes per node is `read_threads × ranks_per_node`:
-
-| Deployment | ranks/node | read_threads | DataLoader workers/node | Total processes/node | 128-core utilisation |
-|---|---|---|---|---|---|
-| NP=1, 1 node | 1 | 8 | 8 | 9 | 7% |
-| NP=8, 1 node | 8 | 8 | 64 | 72 | 56% |
-| NP=8, 8 nodes | 1 | 8 | 8 | 9 | 7% |
-| NP=64, 8 nodes | 8 | 8 | 64 | 72 | 56% |
-
-The same YAML sets the same thread count regardless of whether one or eight ranks share a node. On high-rank-density nodes (NP=8/node), `read_threads=8` allocates 64 reader processes per node and may saturate the NFS client connection pool or cause CPU thrashing. On single-rank nodes, `read_threads=8` leaves most cores idle while I/O is the bottleneck.
-
-**The correct thread budget formula is:**
-
-```
-read_threads_per_rank = max(1, floor(available_cores / ranks_per_node / cpu_per_io_thread))
-# For I/O-bound NFS: cpu_per_io_thread ≈ 0.5 (threads mostly sleep on syscalls)
-# For CPU-bound JPEG decode: cpu_per_io_thread ≈ 1.0
-# Practical range: [2, 64]
-```
-
-DLIOMPI can provide all the inputs (`npernode()`, via `MPI.COMM_TYPE_SHARED`), and `os.cpu_count()` or `psutil.cpu_count()` gives the core total. The computation is straightforward but requires MPI to be initialized before validation, which conflicts with the current order of operations (see Section 6b).
-
-### 11c. The Fixed-vs-Auto Design Decision
-
-**Arguments for keeping `read_threads` as a fixed YAML integer:**
-- Reproducibility: same YAML, same thread count, same result regardless of hardware.
-- Simplicity: no implicit logic; user controls the knob directly.
-- Explicit: reported clearly in output logs.
-
-**Arguments for auto-sizing:**
-- The "correct" value differs by an order of magnitude between single-node and multi-node deployments of the same YAML.
-- The default of 1 is severely under-threaded for any network storage workload.
-- Users who do not know to raise `read_threads` will see misleadingly low throughput that is not representative of storage capability.
-
-**Recommendation:** Support `read_threads: auto` as a special sentinel value. When set to `auto`, compute at runtime:
-
-```python
-import os
-ppn = DLIOMPI.get_instance().npernode()
-total_cores = os.cpu_count() or 8
-# Reserve 1 core per MPI rank for compute; divide remainder among I/O threads
-io_threads = max(1, min(64, (total_cores - ppn) // ppn))
-self.read_threads = io_threads
-```
-
-Log the resolved value at the start of the run so it appears in benchmark results. Keep the integer form working unchanged for reproducible benchmark runs.
-
----
-
-## 12. MPI Multi-Host Topology — Available Infrastructure, Missing Integration
-
-### 12a. What DLIOMPI Already Tracks
-
-`DLIOMPI.initialize()` uses `MPI.COMM_TYPE_SHARED` to discover per-node topology at startup:
-
-```python
-split_comm = MPI.COMM_WORLD.Split_type(MPI.COMM_TYPE_SHARED)
-local_ppn = split_comm.size          # ranks sharing this node
-self.mpi_local_rank = split_comm.rank
-# Gather ppn across all nodes via leader communicator
-self.mpi_ppn_list = COMM_WORLD.bcast(ppn_list, root=0)
-self.mpi_nodes = len(self.mpi_ppn_list)
-self.mpi_node  = <node index for this rank>
-```
-
-The public API is:
-
-| Method | Returns |
-|---|---|
-| `rank()` | Global MPI rank (0…comm_size-1) |
-| `size()` | Total MPI world size |
-| `local_rank()` | Rank within this node (0…ppn-1) |
-| `npernode()` | Ranks on this node (can vary per node) |
-| `nnodes()` | Total node count |
-| `node()` | Node index for this rank |
-
-This is a complete node-topology picture. It is used in `statscounter.py` (for the benchmark summary) and in `base_checkpointing.py` (line 424: cross-node checkpoint read offset). It is **not used** in `data_generator.py` or `config.py`.
-
-### 12b. Scaling Formulas as NP and HOST Vary
-
-The training sample distribution is:
-
-```
-samples_per_proc = ceil(total_samples / comm_size)
-training_steps   = ceil(total_samples / batch_size / comm_size)
-```
-
-where `comm_size = NP * HOST` (total ranks). These scale correctly with the product, but they contain no node-level term. The formulas do not distinguish between:
-
-- 64 ranks on 1 node (NP=64, HOST=1): all ranks share the same NFS mount, causing ~64× connection multiplexing
-- 64 ranks on 64 nodes (NP=1, HOST=64): each node has a dedicated NFS mount, maximally parallelising metadata operations
-
-**For JPEG/PNG reading**, effective storage throughput scales as:
-
-```
-IOPS_total = ranks_total × read_threads × (1 / per_open_latency)
-```
-
-where `per_open_latency` includes NFS RTT, kernel VFS overhead, and JPEG decode time. This throughput grows with both axes (ranks and threads), but the per-node NFS mount bandwidth caps growth when all ranks share one mount. The benchmark currently cannot express or control which axis scales which way.
-
-**Concrete scale-up table (JPEG, 115 KB/file, NFS RTT=1ms, BW=10 GiB/s/node):**
-
-| NP | HOST | comm_size | read_threads | IOPS_total | Throughput |
-|---|---|---|---|---|---|
-| 1 | 1 | 1 | 1 | 990 | 114 MiB/s |
-| 4 | 1 | 4 | 8 | 15,840 | 1.8 GiB/s |
-| 8 | 1 | 8 | 8 | 31,680 | 3.6 GiB/s → NFS BW cap (10 GiB/s single mount) |
-| 4 | 8 | 32 | 8 | 126,720 | 14.6 GiB/s → 8 × NFS BW cap |
-| 8 | 32 | 256 | 8 | 1,013,760 | 116 GiB/s |
-
-The key insight: **scale-out across hosts is much more effective than adding ranks per node**, because each new host brings a fresh NFS connection budget and independent bandwidth. DLIO's fixed `read_threads` value in YAML does not guide the user toward this topology insight.
-
-### 12c. File Distribution and Node Locality
-
-Data generation currently assigns files via:
-
-```python
-for i in range(my_rank, total_files, comm_size):
-    write_fn(file_list[i], ...)
-```
-
-This is a round-robin stride across the global rank space. With `num_subfolders_train > 1`, the file-to-subfolder assignment is:
-
-```python
-subfolder = i % num_subfolders_train
-```
-
-Both mappings are rank-indexed, not node-indexed. If `num_subfolders_train = num_nodes`, the intent might be to give each node its own subfolder for locality, but the actual assignment distributes files from all nodes into all subfolders (because `i % comm_size` spans all ranks, not just the ranks on one node). Ranks on node 0 produce files in all subfolders, as do ranks on node 1, etc.
-
-For read locality on distributed file systems with per-directory locking (some NFS and Lustre configurations), concentrating each node's reads into its "own" subfolder can reduce contention. The current round-robin prevents this. A node-local assignment would be:
-
-```python
-node_idx = DLIOMPI.get_instance().node()
-subfolder = node_idx % num_subfolders_train
-```
-
-This is not currently implemented.
-
-### 12d. What Is Missing
-
-| Gap | Current state | Impact |
-|---|---|---|
-| `read_threads` not scaled by `npernode()` | Hardcoded YAML integer | Over-commits per-node CPU when ranks/node is high; under-commits on single-rank nodes |
-| No intra-rank generation parallelism | Serial `_generate_files()` loop | JPEG/PNG generation CPU-bottlenecked; idle cores cannot be exploited |
-| Node-local file affinity not implemented | Round-robin across all ranks | No NFS namespace locality; all nodes contend on all subfolders |
-| Benchmark output does not report `npernode()` | `num_hosts` reported, `ppn` not | Cannot reconstruct per-node concurrency from published benchmark results |
-| `read_threads` is set before MPI init | Load-time YAML evaluation | Auto-sizing using `npernode()` requires a post-MPI-init resolve step |
-
-### 12e. Recommendations
-
-1. **Log MPI topology in benchmark header**: At rank 0, emit `nnodes()`, `npernode()`, and `read_threads` so that any published result has sufficient information to reproduce the I/O concurrency.
-
-2. **Auto-size `read_threads` post-MPI-init**: If `read_threads: auto` (or `read_threads: 0` as a sentinel), resolve to `max(1, min(64, (os.cpu_count() - npernode()) // npernode()))` after `DLIOMPI.initialize()`. This requires moving the resolution step out of YAML parse and into `derive_configurations()`, which already runs inside the main process after MPI init.
-
-3. **Add intra-rank concurrency for JPEG/PNG generation**: Wrap the `_generate_files()` loop in a `concurrent.futures.ThreadPoolExecutor`. PIL's JPEG encoder releases the GIL during its C extension work; threads genuinely parallelise the CPU encode. A pool of `min(read_threads, 8)` workers per rank would reduce ImageNet-scale generation from hours to minutes without requiring any MPI changes.
-
-4. **Node-indexed subfolder assignment**: When `num_subfolders_train == nnodes()`, assign `subfolder = node()` per rank so that all reads for a given training step from one node hit one subfolder. This concentrates hot NFS metadata into per-node directories, reducing cross-node directory contention.
-
-5. **Document the NP vs HOST scaling trade-off**: Add a section to the benchmark README explaining that for JPEG/PNG workloads, scaling HOST outperforms scaling NP for the same `comm_size`, because each new host brings independent NFS bandwidth. Provide a concrete example using the IOPS formula above.
-
----
-
-## 13. File vs. Object Workload Asymmetry — Closing the Performance Gap
-
-### 13a. The Problem: Two Classes of Benchmark with Different Overhead Profiles
-
-The S3 iterable readers introduced for object storage were built with a correct understanding of DLIO's design principle: the benchmark measures storage throughput, not data transformation throughput. As a result, every S3 iterable reader — `ImageReaderS3Iterable`, `NPYReaderS3Iterable`, `HDF5ReaderS3Iterable`, `TFRecordReaderS3Iterable` — does the following:
-
-1. Fetch raw bytes from the storage system (the I/O operation being measured).
-2. Record the byte count for telemetry (`image_size` metric).
-3. Return `self._args.resized_image` (the pre-allocated random tensor).
-4. Never decode, decompress, or numpy-convert the fetched bytes.
-
-The local-filesystem readers — `ImageReader`, `NPYReader`, `HDF5Reader` — do NOT apply this principle. `ImageReader` calls `PIL.Image.open()` and `np.asarray()` on every sample. `NPYReader` calls `np.load()`. `HDF5Reader` performs a full HDF5 chunk read and numpy conversion. All of this CPU work happens inside the training-step timing window, and all of it produces output that is immediately discarded.
-
-**The result is that the same workload, with the same files, produces fundamentally different benchmark numbers depending solely on whether the storage backend is local FS or object storage.** An object-storage run with `ImageReaderS3Iterable` and a local-FS run with `ImageReader` are not measuring the same thing — even if the physical data is identical.
-
-### 13b. Quantified Impact of the Asymmetry
-
-For a JPEG workload at 224×224×3 image size, the per-sample overhead difference:
-
-| Reader | Storage I/O time | CPU decode time | Total per sample | CPU fraction of total |
-|---|---|---|---|---|
-| `ImageReaderS3Iterable` (object) | ~1–5 ms net fetch | 0 ms | ~1–5 ms | 0% |
-| `ImageReader` (local FS) | ~0.01 ms read | 5–20 ms PIL decode | ~5–21 ms | 71–99% |
-
-A benchmark using `ImageReader` on a fast NVMe filesystem can show **5–20× lower per-sample throughput than a benchmark using `ImageReaderS3Iterable` on the same data served from an object store** — not because the object store is faster, but because the local-FS reader does far more CPU work. Published benchmark comparisons between the two backend types are therefore not valid without correcting for this asymmetry.
-
-The same asymmetry exists at generation time: object store YAML configs typically target fewer total files or use NPY/HDF5 format (avoiding JPEG), while local FS YAML configs often use JPEG with no awareness of the PIL encode cost. This is an accident of how the configs evolved, not a deliberate design choice.
-
-### 13c. Why the Asymmetry Exists
-
-The object-store readers were written later, after the design principle (Section 9a) was understood. The local-filesystem readers predate that understanding and have not been updated. The S3 iterable reader docstrings explicitly document why decoding is wrong:
-
-> *"Calling `PIL.Image.open(BytesIO(raw))` on JPEG/PNG data is pure CPU overhead. DLIO's `FormatReader.next()` yields a pre-allocated random tensor regardless of file contents; only the byte count is needed for the image_size telemetry metric."*
-
-The same rationale applies to `ImageReader`, but that file contains no equivalent comment and no equivalent implementation. The optimization was applied to the new path and never back-ported to the original one.
-
-For data generators, the object-store configs incidentally avoid the worst-case formats (JPEG/PNG with PIL encode) because they were configured for network-storage scale testing where generation cost is more visible. The local-FS configs retain JPEG/PNG as the default for historical reasons.
-
-### 13d. The Rationalization Proposal
-
-The fix is to bring local-filesystem readers up to the standard already established by the S3 iterable readers. This is a code change only — no format changes, no YAML changes, no protocol changes. The storage I/O (the measured operation) is unchanged in every case.
-
-**Reader rationalization targets (by priority):**
-
-| Reader | Current behaviour | Rationalized behaviour | Change required |
-|---|---|---|---|
-| `ImageReader` (local FS JPEG/PNG) | PIL decode + numpy convert | Raw byte read, byte count for telemetry | Replace `PIL.Image.open()` with `open(rb).read()` |
-| `NPYReader` (local FS NPY/NPZ) | `np.load()` — allocates full array | Raw byte read, byte count for telemetry | Replace `np.load()` with `open(rb).read()` |
-| `HDF5Reader` (local FS HDF5) | `h5py.File()` + dataset slice | `os.stat()` for byte count (HDF5 does not expose raw bytes cleanly) | Use file size from stat, skip h5py decode |
-| `TFReader` (TFRecord) | Already returns `resized_image`, no decode | No change needed | ✅ Already correct |
-| S3 iterable readers | Already raw byte read | No change needed | ✅ Already correct |
-
-For `HDF5Reader`, full raw-byte skipping is complicated because HDF5 files contain many datasets and the per-sample byte cost is embedded inside the HDF5 container format. The pragmatic fix is to record the total file size (via `os.stat()`, which is already a real syscall) and use `ceil(file_size / num_samples_per_file)` as the per-sample byte count. This avoids `h5py` decoding while still exercising real storage I/O.
-
-**Generator rationalization targets:**
-
-The same data-loader-aware branch described in Section 9g applies to generation. For non-DALI data loaders, JPEG and PNG generators must write raw dgen-py bytes rather than running PIL encode. This produces files that the rationalized `ImageReader` reads correctly (raw bytes, byte count for telemetry). For the DALI path, PIL encode remains necessary and the DALI reader is already correct.
-
-### 13e. Validation: How to Confirm the Fix Works
-
-After rationalizing the local-FS readers, a correctly implemented benchmark should satisfy:
-
-1. **A file-backend and object-backend run of the same workload with the same dataset produce statistically equivalent samples/sec and MiB/s numbers**, adjusted for storage latency and bandwidth differences between the two systems. CPU overhead should not be a confounding variable.
-
-2. **The fraction of training-step time attributed to I/O wait (as reported in `dlp` traces) should be the dominant fraction (> 80%)** for both backends, for all formats, on any storage system faster than the benchmark's prefetch queue can drain.
-
-3. **Generator throughput for JPEG/PNG on non-DALI configurations should match NPY generator throughput** (within 2×), because both should be bottlenecked on storage write bandwidth, not CPU encoding.
-
-If any of these properties does not hold after rationalizing the readers, it indicates a remaining source of CPU overhead that has not been identified or removed.
-
-### 13f. Configuration-Level Rationalization
-
-Beyond code changes, the YAML configs should be audited to eliminate format choices that reflect historical defaults rather than deliberate workload simulation decisions:
-
-1. **Local-FS configs that use JPEG/PNG for non-imaging workloads** (e.g., testing batch read throughput of random data) should be migrated to NPY or HDF5 with compression disabled. This eliminates generation overhead that is independent of the format rationalization.
-
-2. **Object-store configs that use NPY/HDF5 while local-FS configs use JPEG/PNG for the "same" workload** create an implicit apples-to-oranges comparison. If a workload is defined as JPEG-format vision training, both its local-FS and object-store variants should use identical format settings. The storage backend is the variable; the format should be held constant.
-
-3. **The `multiprocessing_context` coupling** (Section 6c) means that a rationalized file-backend config and its object-store counterpart must differ in at least one reader setting (`fork` vs `spawn`). This is unavoidable given the Tokio runtime constraint, but should be the ONLY difference between the two, and should be auto-derived from `storage_library` rather than manually set.
-
-### 13g. Summary of the Rationalization Requirement
-
-The core requirement is simple: **every reader, for every format, for every storage backend, must behave consistently.** The S3 iterable readers already implement the correct behaviour. The local-filesystem readers must be updated to match. Until that update is made, no published DLIO benchmark result comparing local-filesystem and object-storage throughput can be considered internally consistent, because the benchmarks are not measuring the same thing on both backends.
diff --git a/docs/DLIO_Issues_Resolution_26-04-18.md b/docs/DLIO_Issues_Resolution_26-04-18.md
new file mode 100644
index 00000000..0c31b516
--- /dev/null
+++ b/docs/DLIO_Issues_Resolution_26-04-18.md
@@ -0,0 +1,41 @@
+# DLIO Benchmark — Issue Resolution Summary
+
+**Date:** April 18, 2026  
+**Replaces:** `DLIO_IO_Issues-Proposal_2026-03-28.md`, `DLIO_IO_Issues-Executive_Summary_2026-03-28.md`, `DLIO_PR_Plan_26-04-13.md`, `DLIO_PR_Status-26-04-12.md`
+
+---
+
+## All Non-DALI Issues — Fully Resolved
+
+| Issue | Description | Resolution |
+|-------|-------------|------------|
+| 1 / 6 | File vs object reader asymmetry; local readers incurring full CPU decode (PIL, NumPy, h5py) while S3 readers did not | `reader/_local_fs_iterable_mixin.py` added: parallel prefetch via `ThreadPoolExecutor`, byte count only, no decode. Affects `ImageReader`, `NPYReader`, `HDF5Reader`, `NPZReader`. |
+| 2 | JPEG/PNG generator 300–1000× slower than necessary due to PIL encoding | `jpeg_generator.py` and `png_generator.py` now detect non-DALI loader and write raw bytes, skipping PIL encode. DALI path still produces valid encoded bitstreams. Confirmed: JPEG 3×, PNG 27× speedup. |
+| 3 | TFRecord iterative sampler file-index bug: non-zero ranks read wrong files | `config.py` line 719: rank offset is now carried forward through all iterations via `my_rank * files_per_rank + sample_index // num_samples_per_file`. |
+| 4 | `read_threads` hardcoded at 1, under-utilizing storage bandwidth | Auto-sized to `min(cpu_count // ranks_per_node, 8)` when user leaves default. Explicit YAML values respected. |
+| 5 | Deduplication — files must be byte-unique | Already correct; `data_generator.py` uses `rng.integers(0, 2**63)` per file with `BASE_SEED + my_rank`. No code change needed. |
+| 7 | 49 YAML config files with hardcoded lab IPs | External: mlp-storage now supplies endpoint/bucket/library via env vars and CLI overrides. Remaining S3 configs use `localhost` placeholder. |
+| 8 | `multiprocessing_context` must be `spawn` for object-storage libraries | `config.py` auto-derives `spawn` when `storage_library` is `s3dlio` or `s3torchconnector`. Dataclass default changed from `fork` to `spawn`. |
+| 9 | `storage_library` not wired to standard env vars; poor standalone usability | `config.py` `_apply_env_overrides()` now reads `DLIO_STORAGE_LIBRARY`, `DLIO_BUCKET`, `DLIO_STORAGE_TYPE`, `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, `AWS_ENDPOINT_URL`, `AWS_REGION`, and an optional `.env` file. |
+| 10 / 11 | Data generation serial per rank; object-store uploads blocking | `data_generator.py`: seeds pre-derived in main thread (preserving determinism), then writes dispatched to `ThreadPoolExecutor`. New `write_threads` config field, auto-sized via `ranks_per_node`. |
+| 12 | `comm_size` used as thread denominator — wrong on multi-node runs | `DLIOMPI.ranks_per_node()` added (`MPI_Allgather` of hostnames). `read_threads` and `write_threads` auto-sizing now uses `cpu_count // ranks_per_node`. |
+| 13 | No settle-time guard after generation on eventual-consistency stores | `post_generation_settle_seconds: float = 0.0` field added. When the value is non-zero and the storage type is not LOCAL_FS, rank 0 sleeps for that duration and then all ranks synchronize before continuing. Default 0.0 — no behavior change for existing configs. |
+| 6b | `validate()` called before file lists available | Investigated: not a real bug. `derive_configurations()` does not call `validate()`; the only `validate()` call is in `main.py` after the file-list walk. Resolved with a clarifying comment in `data_generator.py`. |
+
+---
+
+## Outstanding Issues — DALI Only (Deferred: No GPU Available)
+
+These were identified but not implemented because they require GPU hardware for validation:
+
+| Issue | Description | State |
+|-------|-------------|-------|
+| DALI-1 | `shard_id` never passed in `dali_image_reader.py`, `dali_npy_reader.py`, `dali_tfrecord_reader.py` — all multi-rank DALI runs read shard 0 only | Branch `fix/dali-correctness` exists locally. Critical correctness bug — **do not use DALI with `comm_size > 1`** until fixed. |
+| DALI-2 | `fn.python_function` callbacks re-introduce the GIL into DALI pipeline; full C++ JPEG decode is done and then discarded | Branch `feat/dali-modernization` exists locally. |
+| DALI-3 | DALI 2.0 dynamic executor not adopted; `exec_dynamic=False` still in use | Deferred with DALI-2. |
+
+---
+
+## Minor Remaining Note
+
+`storage_library` is still accessed via `(self.storage_options or {}).get("storage_library")` rather than as a first-class `ConfigArguments` dataclass field. Functionally correct — the env-var path from Issue 9 populates `storage_options['storage_library']` properly. The dataclass promotion (adding `storage_library: Optional[str] = None` directly) was not done; it is cosmetic and low risk.
diff --git a/docs/DLRM-Parquet-S3-Optimization-Plan.md b/docs/DLRM-Parquet-S3-Optimization-Plan.md
new file mode 100644
index 00000000..6b85145a
--- /dev/null
+++ b/docs/DLRM-Parquet-S3-Optimization-Plan.md
@@ -0,0 +1,303 @@
+# DLRM Parquet S3 Optimization Plan
+
+**Date:** May 5, 2026  
+**Status:** Planning — awaiting implementation start  
+**Target:** 10 GiB/s aggregate throughput (from 402 MiB/s baseline)
+
+---
+
+## Problem Recap
+
+| Stage | Throughput | Bottleneck |
+|-------|-----------|-----------|
+| Baseline | 93 MiB/s | Fjall LSM read per GET + 40 HTTP GETs/RG |
+| After s3-ultra + merged GET | 288 MiB/s | 16M Python `read_index()` calls × utcnow() overhead |
+| After prefetch + fast `read_index()` | 402 MiB/s | 16M Python calls × 0.5 µs = 8s hard Python floor |
+| **Target** | **~10,000 MiB/s** | I/O bound at ~0.33s (1,968 RG fetches, ~1.7 MiB each) |
+
+The remaining 25× gap is **pure Python overhead** — `read_index()` is called once per *sample* (16 million times per worker), but only 1,968 actual I/O operations are needed. Closing this gap requires reducing Python call count from 16M → ~1,968.
+
+---
+
+## Key Architectural Insight: s3dlio DataLoader Extension
+
+s3dlio already has a production-quality `DataLoader` framework in `src/data_loader/`:
+
+```
+src/data_loader/
+  dataset.rs             ← Dataset trait (get, len, as_stream, keys)
+  dataloader.rs          ← DataLoader<D: Dataset> generic wrapper
+  async_pool_dataloader.rs ← AsyncPoolDataLoader + UnifiedDataLoader
+  options.rs             ← LoaderOptions (prefetch, max_inflight_parts, num_workers, part_size, …)
+  s3_bytes.rs            ← S3BytesDataset implements Dataset<Item=Bytes>
+  fs_bytes.rs            ← FileSystemBytesDataset
+  directio_bytes.rs      ← DirectIOBytesDataset
+  prefetch.rs            ← Prefetch queue infrastructure (already bounded)
+  sampler.rs             ← Sharding / worker splitting
+```
+
+The Python-facing API already works:
+```python
+loader = s3dlio.create_async_loader("s3://bucket/prefix/", {"prefetch": 16})
+for item_bytes in loader:          # each item = one file's raw bytes today
+    process(item_bytes)
+```
+
+**The extension**: add a new `ParquetRowGroupDataset` that implements `Dataset<Item=Bytes>` where **each item = one row group's selected column bytes** rather than one whole file. Once this exists, the current `create_async_loader` Python API works unchanged — the caller just adds `"format": "parquet"` to the options.
+
+This approach:
+- Re-uses all existing `DataLoader` prefetch, sharding, and concurrency infrastructure
+- Eliminates Items #2 (bounded prefetch queue) and #3 (standalone Rust function) from the original plan — they're already handled
+- Requires only two new files in s3dlio plus a small DLIO reader wrapper
+
+---
+
+## Revised Three-Item Plan
+
+### Priority 1 (prerequisite): `ParquetRowGroupDataset` in s3dlio
+
+**Where:** `s3dlio/src/data_loader/parquet_rg.rs` (new file)
+
+**What it does:**
+1. At construction: lists all `.parquet` files under the prefix
+2. Fetches each file's Parquet footer once (range GET of last `footer_cap` bytes)  
+3. Parses each footer using `ParquetMetaDataReader::decode_metadata(buf)` → `ParquetMetaData`  
+4. Caches footer metadata in `Arc<Vec<ParquetMetaData>>` (never re-fetched during epoch)
+5. Builds a flat index: `[(file_idx, rg_idx)]` of length = total RGs across all files
+
+**`Dataset` implementation:**
+```rust
+impl Dataset for ParquetRowGroupDataset {
+    type Item = Bytes;                    // raw merged column bytes for one row group
+
+    fn len(&self) -> Option<usize> {
+        Some(self.rg_index.len())         // 64 × 123 = 7,872 for DLRM
+    }
+
+    async fn get(&self, global_idx: usize) -> Result<Bytes, DatasetError> {
+        let (file_idx, rg_idx) = self.rg_index[global_idx];
+        let (start, length) = self.compute_extent(file_idx, rg_idx)?;  // pure CPU
+        let uri = &self.file_uris[file_idx];
+        self.store.get_range(uri, start, Some(length)).await
+            .map_err(DatasetError::from)
+    }
+}
+```
+
+**`compute_extent` logic (pure CPU, using parquet crate):**
+```rust
+fn compute_extent(&self, file_idx: usize, rg_idx: usize) -> Result<(u64, u64)> {
+    let meta = &self.file_metadata[file_idx];
+    let rg = meta.row_group(rg_idx);
+    let cols = self.col_indices_for_file(file_idx);     // defaults to all columns
+    
+    let mut start = u64::MAX;
+    let mut end = 0u64;
+    for ci in cols {
+        let col = rg.column(ci);
+        let col_start = col.dictionary_page_offset()
+            .unwrap_or_else(|| col.data_page_offset()) as u64;
+        let col_end = col_start + col.compressed_size() as u64;
+        start = start.min(col_start);
+        end = end.max(col_end);
+    }
+    Ok((start, end - start))
+}
+```
+
+**Parquet crate API confirmed working (parquet 58.2.0):**
+- `ParquetMetaDataReader::decode_metadata(buf: &[u8]) -> Result<ParquetMetaData>` — parse raw footer bytes  
+- `meta.row_group(i)` → `RowGroupMetaData`  
+- `rg.column(i)` → `ColumnChunkMetaData`  
+- `.dictionary_page_offset()` → `Option<i64>`  
+- `.data_page_offset()` → `i64`  
+- `.compressed_size()` → `i64`  
+
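+**Footer byte range:** `size - footer_cap` to `size`. Use `footer_cap = 4 MiB` (covers all 64 DLRM files, avg footer = 2.66 MiB). The final 8 bytes of the file (4-byte little-endian metadata length followed by the `PAR1` magic) are parsed with `ParquetMetaDataReader::decode_footer(slice: &[u8; 8])` to obtain the actual metadata length; the Thrift metadata is the `metadata_len` bytes immediately preceding that 8-byte footer, i.e. `buf[buf.len() - 8 - metadata_len .. buf.len() - 8]`.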
+
+**Cargo.toml addition (optional feature, no impact on existing builds):**
+```toml
+[features]
+parquet = ["dep:parquet-crate"]
+
+[dependencies]
+parquet-crate = { package = "parquet", version = "58", default-features = false, optional = true }
+```
+
+**Minimal deps with `default-features = false`:** only thrift, bytes, chrono, half, hashbrown, num-bigint, num-traits. No arrow, no datafusion. Build time: ~14s.
+
+---
+
+### Priority 2: Python `create_async_loader` format routing
+
+**Where:** `s3dlio/src/python_api/python_aiml_api.rs`
+
+**Change:** In `create_async_loader`, detect `opts["format"] == "parquet"` and return a `PyBytesAsyncDataLoader` backed by `ParquetRowGroupDataset` instead of `S3BytesDataset`.
+
+```rust
+// In create_async_loader():
+let format = opts.get("format").and_then(|v| v.extract::<String>().ok());
+if format.as_deref() == Some("parquet") {
+    let col_indices = parse_col_indices_from_opts(&opts)?;
+    let footer_cap = opts.get("footer_cap").and_then(|v| v.extract::<usize>().ok())
+        .unwrap_or(4 * 1024 * 1024);
+    let dataset = ParquetRowGroupDataset::new(uri, col_indices, footer_cap, &loader_opts)?;
+    return Ok(PyBytesAsyncDataLoader { dataset: PyDataset::ParquetRG(dataset), opts: loader_opts });
+}
+```
+
+**Python API (unchanged from user perspective):**
+```python
+import s3dlio
+
+loader = s3dlio.create_async_loader(
+    "s3://mlp-flux/data/dlrm/train/",
+    {
+        "format": "parquet",
+        "columns": list(range(40)),      # column indices to fetch
+        "prefetch": 16,                  # RGs to keep in flight (uses existing DataLoader infra)
+        "footer_cap": 4194304,           # 4 MiB footer window
+    }
+)
+
+for rg_bytes in loader:
+    # rg_bytes = raw merged column-chunk bytes for one row group
+    # Parse with pyarrow once per RG (1,968 calls vs 16M)
+    pass
+```
+
+**Note on `PyDataset` enum:** `python_aiml_api.rs` will need a new `ParquetRG` variant, or `ParquetRowGroupDataset` can be wrapped behind the existing `PyDataset`/`Arc<dyn …>` pattern — whichever is cleaner in context.
+
+---
+
+### Priority 3: DLIO reader wrapper
+
+**Where:** `dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3_rg.py` (new file)
+
+**What it does:**
+1. At `open(filename)`: calls `s3dlio.create_async_loader(prefix, parquet_opts)` — returns loader for ALL files
+2. Yields one whole row group per Python call to the loader
+3. Inner sample loop (Python `yield` for each row) uses `pyarrow` to parse the RG bytes once
+4. Registered via `reader_factory.py` as format `parquet_rg`
+
+**Call count comparison:**
+
+| Operation | Current reader | New reader |
+|-----------|---------------|-----------|
+| `read_index()` Python calls | 16,000,000 | — (eliminated) |
+| `next()` on RG loader | — | 1,968 per worker (7,872 total) |
+| pyarrow parse per RG | 64 (open time) | 1,968 per worker (7,872 total) |
+| Python `yield` per sample | 16,000,000 | 16,000,000 |
+| `s3dlio.get_range()` calls | 1,968 | 0 (Rust internal) |
+
+The Python `yield` per sample is unavoidable in a DLIO `Dataset.__getitem__`-style interface. However, with Priorities 1+2 implemented:
+
+- **RG fetch overhead moves to Rust** — no Python involvement in extent calculation or scheduling
+- **Prefetch depth controlled by `LoaderOptions.prefetch`** — no custom bounded queue needed
+- **Sharding across workers handled by `LoaderOptions.worker_id / num_workers`** — no custom logic needed
+
+**Throughput estimate after Priority 1+2+3:**
+
+| Component | Time (per worker) |
+|-----------|------------------|
+| I/O: 1,968 RG fetches × 1.74 MiB @ 10 GiB/s | 0.33s |
+| Rust overhead: extent compute + dispatch | <0.01s |
+| Python: 16M yield calls × 0.5 µs | 8.0s |
+| **Total** | **~8.3s → ~403 MiB/s per worker** |
+
+Note that the Python `yield` loop is still the wall: even with 4 workers this yields only **~1,612 MiB/s aggregate.**
+
+**To break the 16M yield wall requires Priority 4 (see below).**
+
+---
+
+### Priority 4 (optional, maximum throughput): Arrow batch return from DataLoader
+
+To escape the 16M Python-yield wall, the `ParquetRowGroupDataset` must return **structured batches** (Arrow RecordBatch) that PyTorch DataLoader can consume without per-row Python iteration.
+
+**Architecture:**
+```
+ParquetRowGroupDataset::get(global_rg_idx) → RecordBatch (8,192 rows, 40 cols)
+  ↓
+Python sees: loader[rg_idx] = numpy array (8192, 40) via __array__
+  ↓
+DLIO reader: yields batch (8192 samples) per Python call → 1,968 calls total per worker
+  ↓
+PyTorch: sees 1,968 batches of 8192 rows = 16M samples total ✓
+```
+
+**Time and throughput estimate with Priority 4:**
+
+| Component | Time |
+|-----------|------|
+| I/O at 10 GiB/s | 0.33s |
+| Python: 1,968 `next()` calls × 1 µs | 0.002s |
+| NumPy/Arrow overhead per batch | ~0.01s |
+| **Total** | **~0.34s → ~9,850 MiB/s per worker** |
+
+**Tradeoff:** Requires adding an `arrow` feature to s3dlio (the `arrow` crate from apache/arrow-rs, ~120 deps). Currently s3dlio has no arrow dependency — this was a deliberate architectural decision. Priority 4 should be a gated, separate `arrow-parquet` feature that does not affect existing builds.
+
+---
+
+## Implementation Sequence
+
+```
+Week 1:  Priorities 1+2 — ParquetRowGroupDataset in s3dlio
+           → New src/data_loader/parquet_rg.rs
+           → Cargo.toml: parquet feature flag
+           → python_aiml_api.rs: format routing
+           → build_pyo3.sh: add parquet to features
+           → cargo test + cargo clippy (zero warnings)
+
+Week 2:  Priority 3 — DLIO reader wrapper
+           → dlio_benchmark/reader/parquet_reader_s3_rg.py
+           → reader_factory.py registration
+           → dlio config: format: parquet_rg
+           → Benchmark: measure 1,612 MiB/s aggregate (4 workers)
+
+Week 3+: Priority 4 (if needed) — Arrow batch return
+           → Gated behind separate arrow-parquet feature
+           → Benchmark: measure ~9,850 MiB/s per worker
+```
+
+---
+
+## What the DataLoader Extension Replaces
+
+Comparing original plan vs. revised plan:
+
+| Original plan item | Revised plan | Reason |
+|-------------------|-------------|--------|
+| Item #2: Bounded prefetch queue (custom) | **Eliminated** | `LoaderOptions.prefetch` + `prefetch.rs` already handles this |
+| Item #3: standalone `read_parquet_rg_extent()` fn | **Replaced by `ParquetRowGroupDataset`** | Dataset trait plugs into existing DataLoader, sampler, sharding infra |
+| Item #1: IterableDataset (Python) | **Becomes Priority 3 wrapper** | Thinner — just wraps `create_async_loader`, no custom threading |
+
+Net change: Less code, better architecture. All prefetch scheduling, concurrency control, and worker sharding re-use battle-tested s3dlio infrastructure instead of being re-implemented in Python.
+
+---
+
+## Files to Create/Modify
+
+**s3dlio (Priority 1+2):**
+- **NEW** `src/data_loader/parquet_rg.rs` — `ParquetRowGroupDataset`
+- **MODIFY** `src/data_loader/mod.rs` — add `pub mod parquet_rg`; re-export
+- **MODIFY** `Cargo.toml` — add `parquet` optional dep + feature flag
+- **MODIFY** `src/python_api/python_aiml_api.rs` — format routing in `create_async_loader`
+- **MODIFY** `build_pyo3.sh` — add `--features parquet` when building with parquet support
+
+**dlio_benchmark (Priority 3):**
+- **NEW** `dlio_benchmark/reader/parquet_reader_s3_rg.py` — DLIO reader wrapper
+- **MODIFY** `dlio_benchmark/reader/reader_factory.py` — register `parquet_rg` format
+
+**Configs:**
+- New DLIO config `mlp-storage/configs/dlrm_parquet_rg.yaml` with `format: parquet_rg`
+
+---
+
+## Constraints
+
+- `parquet` crate: always `default-features = false` (avoids arrow/datafusion pull-in)
+- s3dlio: new code behind `parquet` feature flag — existing `cargo build` unaffected
+- Zero warnings policy: `cargo clippy` must pass before any benchmark run
+- Do NOT git commit/push without explicit user approval
+- Do NOT modify other projects (dl-driver, sai3-bench, warpio) during this work
diff --git a/docs/DLRM-Parquet-S3-Throughput-Analysis.md b/docs/DLRM-Parquet-S3-Throughput-Analysis.md
new file mode 100644
index 00000000..edfa4c94
--- /dev/null
+++ b/docs/DLRM-Parquet-S3-Throughput-Analysis.md
@@ -0,0 +1,251 @@
+# DLRM Parquet S3 I/O Throughput Analysis
+
+**Date:** May 2026  
+**Benchmark:** MLPerf Storage DLRM training workload  
+**Storage backend:** s3-ultra (fake S3 server, in-memory parquet footer cache)  
+**Reader:** `ParquetReaderS3Iterable` with s3dlio byte-range GETs  
+**Host:** Single machine, loopback TCP (no network), 47 GiB RAM  
+
+---
+
+## 1. File Format Parameters
+
+| Parameter | Value |
+|-----------|-------|
+| Files | 64 |
+| Size per file | ~1 GiB |
+| Row groups per file | 123 |
+| Rows per row group | 8,192 |
+| Total rows per file | 1,000,000 |
+| Total columns | 200 |
+| Columns read (projection) | 40 |
+| Compressed size, 40 cols, 1 RG | ~1,721 KiB |
+| Byte span of 40-col GET per RG | ~1,743 KiB |
+| Parquet footer size | ~2.66 MiB |
+
+---
+
+## 2. Throughput Progression
+
+Each fix below was implemented and benchmarked sequentially.
+
+| Version | Change | Throughput | Speedup vs baseline |
+|---------|--------|-----------|---------------------|
+| v1 — Original | `pf.read_row_group()` — 40 HTTP GETs/RG, `get_parquet_footer` Fjall read every GET | 93 MiB/s | baseline |
+| v2 — Merged GET | Single merged GET per RG (min–max column span), s3-ultra in-memory footer cache, early-out for non-footer GETs | 288 MiB/s | 3.1× |
+| v3 — Prefetch + fast read_index | Background ThreadPoolExecutor (32 workers) prefetches all RGs at file open; `read_index()` override removes per-sample `utcnow()` | 402 MiB/s | 4.3× |
+
+Target: **10,240 MiB/s (10 GiB/s)**. Still ~25× below target at v3.
+
+---
+
+## 3. Root Cause Analysis — Why 10 GiB/s Is Not Achieved
+
+### 3.1 Per-sample Python call overhead is the hard bottleneck
+
+The PyTorch `Dataset.__getitem__` protocol calls `read_index(image_idx, step)` once per **sample**. With 1,000,000 samples/file × 16 files/worker = **16,000,000 Python calls per worker per epoch**.
+
+Even with all I/O eliminated (cache-hit path), each call costs ~3.6 µs:
+
+| Overhead source | Cost per call | Total (16M calls) |
+|----------------|---------------|-------------------|
+| `FormatReader.read_index()` base — `datetime.now().strftime()` called twice (debug log f-strings evaluate unconditionally) | ~3 µs | ~48 s |
+| Bisect + dict lookup | ~0.5 µs | ~8 s |
+| `dlp.update()` | ~0.1 µs | ~2 s |
+| **Total Python overhead** | **~3.6 µs** | **~58 s** |
+
+Actual S3 I/O (1,968 row group GETs × ~4.9 ms serial, or ~15 ms for 10 in parallel with 32 threads) consumes only **~3–15 s** per worker. The benchmark runs 131 s — Python overhead dominates by ~4:1.
+
+### 3.2 How much actual data moves per worker
+
+```
+16 files × 123 row groups × 1,743 KiB/RG = 3,350 MiB per worker
+```
+
+At 10 GiB/s that takes **0.33 s**. Python overhead is ~180× the I/O time.
+
+### 3.3 The threshold: row-group granularity
+
+If Python were called once per **row group** instead of once per **sample**:
+
+```
+16 files × 123 row groups = 1,968 Python calls  (vs 16,000,000)
+Reduction: 8,130×
+Python overhead: 1,968 × 3.6 µs = 0.007 s
+I/O time @ 10 GiB/s: 0.33 s
+Theoretical throughput: 3,350 MiB / 0.337 s = 9,940 MiB/s ≈ 10 GiB/s ✓
+```
+
+Row-group granularity is the **exact inflection point** that makes 10 GiB/s achievable.
+
+---
+
+## 4. Memory Budget for Row-Group Prefetch
+
+| Scope | Size |
+|-------|------|
+| 1 RG in flight (40 cols, span) | ~1.7 MiB |
+| 32 RGs in flight (prefetch window) | ~54 MiB |
+| All 123 RGs of 1 file fully buffered | ~209 MiB |
+| 4 workers × full file | **~838 MiB** |
+
+Against 47 GiB host RAM, 838 MiB is ~1.8% — completely benign.
+
+---
+
+## 5. Three-Level Optimization Roadmap
+
+### Level 1 — Row-group IterableDataset (achievable now, within DLIO)
+
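+**Impact: removes I/O and `datetime.now()` from the per-sample path → ~1,600 MiB/s aggregate (the initial ~6 GiB/s estimate does not survive the arithmetic below)**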
+
+Replace the `Dataset` (map-style, `__getitem__` per sample) with a PyTorch `IterableDataset`. Each iteration yields one batch sourced from a single row-group fetch. Python I/O calls: **1,968** instead of 16,000,000.
+
+```python
+# Pseudocode: IterableDataset approach
+def __iter__(self):
+    for filename in self.file_list:
+        pf, rf, offsets = self.open(filename)
+        for rg_idx in range(pf.metadata.num_row_groups):
+            compressed = self._fetch_row_group_single_get(rf, pf, rg_idx)  # 1 HTTP GET
+            for _ in range(pf.metadata.row_group(rg_idx).num_rows):
+                dlp.update(image_size=compressed)
+                yield self._args.resized_image  # pure Python, no I/O
+```
+
+The inner `yield` loop (16M iterations) is pure Python with no I/O: ~0.5 µs/iter = ~8 s.  
+Total estimate: 8 s Python + 0.33 s I/O = **~8.3 s → ~404 MiB/s per worker × 4 = ~1,600 MiB/s**.
+
+Note: this works out to only ~1,600 MiB/s aggregate, not ~6 GiB/s, because the inner yield loop is still 16M Python calls per worker. The gain is that those calls no longer perform I/O or call `datetime.now()`.
+
+To reach 10 GiB/s, the inner loop must also be eliminated. See Level 3.
+
+**True ceiling with IterableDataset but Python inner loop:**
+- Python inner loop: 16M × 0.5 µs = 8 s dominant
+- Still 4× better than current 131 s → ~1,600 MiB/s aggregate
+
+### Level 2 — Async prefetch pipeline (production standard)
+
+**Impact: Overlaps I/O with compute, eliminates stalls**
+
+Current prefetch submits all RGs as futures at file-open time. Improvement: use a bounded queue (e.g. depth=8) so memory is capped and the next file's prefetch starts while the current file's later RGs are being consumed.
+
+```
+[File opener thread] → [RG fetch pool, 32 threads] → [bounded queue, depth 8] → [main loop]
+```
+
+While GPU processes batch N, threads fetch RGs N+1 through N+8. Loopback latency (~4 ms/GET) is fully hidden. On real network (10 GiB/s storage array), this is critical — the fetch pool must be wide enough to keep N parallel streams open.
+
+This is already partially implemented (prefetch executor). The bounded queue is the missing piece to prevent memory spikes on large epoch sizes.
+
+### Level 3 — Native Rust parquet reader in s3dlio (long-term, biggest impact)
+
+**Impact: Eliminates Python from the hot path entirely → 10+ GiB/s achievable**
+
+Move footer parsing + column range computation + RG fetch into a Rust function exposed via PyO3:
+
+```rust
+// s3dlio API addition (conceptual)
+pub fn read_parquet_rg_extent(
+    uri: &str,
+    footer_bytes: &[u8],   // already cached in s3-ultra / s3dlio
+    rg_idx: usize,
+    col_indices: &[usize],
+) -> Result<usize>  // returns compressed_bytes, discards data
+```
+
+Python calls this **once per row group** — 1,968 calls/worker. The inner per-sample loop disappears entirely from Python. Rust computes the byte extent from the Thrift footer, issues the GET, discards the bytes, returns the byte count. No GIL-holding Python loop.
+
+**Theoretical ceiling:**
+- Rust calls: 1,968 × (Rust overhead ~1 µs) = 2 ms
+- I/O: 0.33 s @ 10 GiB/s
+- Total: **~0.332 s → 10,060 MiB/s per worker**
+
+This is the only path to full 10 GiB/s with the existing per-file/per-row-group format.
+
+**Where does it belong?** See Section 6.
+
+---
+
+## 6. Architecture Decision: Where Does the Rust Parquet Reader Live?
+
+### Option A: Inside s3dlio
+
+**Pros:**
+- s3dlio already has: URI parsing, credential resolution, S3/GCS/Azure/file backends, PyO3 bindings, range GET primitives
+- `read_parquet_rg_extent(uri, footer_bytes, rg_idx, columns)` fits naturally alongside `get_range()`
+- Single dependency for downstream tools (dl-driver, sai3-bench, warpio)
+- Footer bytes already cached in s3-ultra's `parquet_footer_cache` and can be served via `get_range(uri, size-footer_len, footer_len)`
+
+**Cons:**
+- Adds parquet Thrift parsing to s3dlio (new dependency: `parquet` crate)
+- s3dlio becomes opinionated about file formats (currently format-agnostic)
+
+### Option B: Standalone `parquet-s3-reader` crate / Python package
+
+**Pros:**
+- Clean separation: s3dlio stays format-agnostic
+- Can be used without s3dlio (e.g. with boto3 backend)
+- Easier to publish independently
+
+**Cons:**
+- Another dependency to manage
+- Duplicates URI/credential logic already in s3dlio
+- More friction for dlio_benchmark integration
+
+### Option C: Inside dlio_benchmark as a compiled extension (`dlio_parquet_ext`)
+
+**Pros:**
+- Scoped to the benchmark — no library API surface to maintain
+- Ships with dlio_benchmark wheel
+
+**Cons:**
+- Embedded in a Python project — harder to maintain Rust build chain
+- Not reusable by dl-driver, sai3-bench, etc.
+
+### Recommendation
+
+**Option A (s3dlio) is the right choice** for this ecosystem. s3dlio is already the storage abstraction layer used by all tools. Adding `read_parquet_rg_extent()` makes it a complete "storage + parquet I/O" primitive, which is the natural evolution given that parquet-on-object-storage is the dominant AI/ML training format. The `parquet` crate is mature and the Thrift footer parsing is ~200 lines of Rust.
+
+---
+
+## 7. Implementation Priority Order
+
+| Priority | Work | Expected result |
+|----------|------|----------------|
+| 1 (highest) | Level 3: `s3dlio::read_parquet_rg_extent()` Rust API + PyO3 binding | ~10 GiB/s |
+| 2 | Level 1: IterableDataset reader in dlio_benchmark using new s3dlio API | Unlock Level 3 gain |
+| 3 | Level 2: Bounded prefetch queue (depth=8) replacing unbounded futures dict | Stable memory, full overlap |
+
+---
+
+## 8. Current State (as of May 2026)
+
+- `parquet_reader_s3_iterable.py` — merged GET per RG + 32-thread prefetch executor + fast `read_index()` override: **402 MiB/s**
+- s3-ultra — in-memory `parquet_footer_cache` (DashMap), early-out for non-footer GETs, release binary: s3 server CPU < 1 core at full DLRM load
+- Hard ceiling without Level 3: **~1,600 MiB/s** (Python inner loop dominates)
+- Hard ceiling with Level 3: **~10,060 MiB/s** (I/O bound)
+
+---
+
+## 9. Three-Mode Read Benchmark (May 2026, local NVMe, s3dlio v0.9.100)
+
+**Test configuration:** 4 files × 8 row groups × 200 float32 cols = 32 RGs total, 35.6 MB on disk (Snappy compressed)  
+**Storage:** local NVMe via `file://` URIs (best-case; S3 latency would widen the gaps further)
+
+| Mode | Reader | Workers | Time | Decoded MB | Throughput |
+|------|--------|---------|------|------------|------------|
+| 1 — s3dlio raw + DISCARD | `parquet_get_rg(decode="raw")` → `len(bytes(bv))` | serial | 0.085s | 34.9 MB (compressed) | **412 MB/s** |
+| 2 — PyArrow native (baseline) | `pq.ParquetFile.read_row_group()` | serial | 0.170s | 27.0 MB | 159 MB/s |
+| 3 — s3dlio arrow + IPC decode | `parquet_get_rg(decode="arrow")` + `pa.ipc.open_stream()` | 4 | 0.219s | 26.2 MB | 120 MB/s |
+| 3 — s3dlio arrow + IPC decode | same | 8 | 0.159s | 26.2 MB | **164 MB/s** |
+| 3 — s3dlio arrow + IPC decode | same | 16 | 0.174s | 26.2 MB | 151 MB/s |
+| 3 — s3dlio arrow + IPC decode | same | 32 | 0.168s | 26.2 MB | 156 MB/s |
+
+**Key observations:**
+
+- **Mode 1 (raw + discard)** is 2.6× faster than PyArrow native — pure I/O with zero decode overhead. This is the storage benchmark mode used in production throughput tests.
+- **Mode 2 (PyArrow native)** is the baseline any default PyTorch DataLoader achieves. Serial, blocking, one HTTP GET per `read_row_group()` call.
+- **Mode 3 (s3dlio arrow, 8 workers)** at 164 MB/s edges the PyArrow serial baseline. On local NVMe decode CPU cost dominates; on S3 with real network latency the concurrent pipeline advantage grows substantially — s3dlio holds a warm connection pool and overlaps I/O with Rust-side Parquet→Arrow decode entirely off the GIL.
+- Sweet spot is 8 workers for 32 RGs (one worker per 4 RGs). Beyond that, thread-switching overhead on small tasks flattens the curve.
+- `configure_tokio_threads()` is now called in both `ParquetReaderS3dlio.__init__` and `ParquetReaderS3dlioArrow.__init__` so the Tokio thread budget is MPI-aware from the first call.
diff --git a/docs/DLRM-Python-Per-Sample-Architecture-Fatal-Flaws.md b/docs/DLRM-Python-Per-Sample-Architecture-Fatal-Flaws.md
new file mode 100644
index 00000000..9ac1f960
--- /dev/null
+++ b/docs/DLRM-Python-Per-Sample-Architecture-Fatal-Flaws.md
@@ -0,0 +1,389 @@
+# DLRM Parquet S3 Reader: Fatal Architectural Flaws
+
+**Date**: May 6, 2026  
+**Author**: Benchmark Engineering  
+**Status**: BLOCKED — architecture must change
+
+---
+
+## Executive Summary
+
+Empirical testing with an I/O simulator reveals two independent fatal flaws in the
+current DLIO Parquet-S3 reader design for the DLRM workload. Either flaw alone
+prevents achieving the ≥400 MB/s target at NP=1. Together they make the current
+approach fundamentally unworkable.
+
+1. **Python per-sample call overhead** is the hard ceiling: iterating 64M
+   `read_index()` calls per epoch consumes ~300 s of wall-clock time with **zero I/O**,
+   making ≥400 MB/s mathematically impossible.
+
+2. **Coalesced byte-range GETs are equivalent to full-object GETs**: the plan
+   decomposes each Parquet file into 16 sequential, abutting byte-range GETs that
+   together span the entire object — identical in data volume to a single `GET object`
+   but with 16× the round-trip and connection overhead.
+
+---
+
+## Workload Parameters
+
+| Parameter | Value |
+|-----------|-------|
+| Files | 64 × Parquet (`img_00_of_64.parquet` … `img_63_of_64.parquet`) |
+| File size | ~1,014 MiB each |
+| Samples per file | 1,000,000 |
+| Row groups per file | 123 |
+| Row group size (compressed) | ~7.9 MiB |
+| Total dataset | ~63.4 GiB |
+| `batch_size` | 12,288 |
+| `coalesce_rgs` | 8 |
+| Coalesced GET size | ~63.4 MiB |
+| GETs per file (coalesced) | 16 |
+| Total GETs per epoch | 1,024 |
+| `read_index` calls per epoch | 64,000,000 |
+| Pipeline window | 32 in-flight GETs |
+| Peak in-flight memory | ~2 GiB |
+
+---
+
+## Flaw 1: Python Per-Sample Call Overhead Is the Hard Ceiling
+
+### Methodology
+
+An I/O simulator mode was implemented in `parquet_reader_s3dlio.py`. When
+`simulate_io=True`:
+
+- All real S3 GETs are replaced with sentinel tuples (no network I/O).
+- `read_index()` performs only Python dict/set lookups to classify each call as
+  `HIT`, `DONE`, or `FALLBACK`.
+- Every call is logged to a zstd-compressed TSV file.
+
+This measures the **irreducible Python overhead** with the minimum possible work per
+call.
+
+### Simulation Results (NP=1, epoch 1)
+
+**Run directory**: `results/dlrm/training/dlrm/run/20260506_214624/`
+
+| Metric | Value |
+|--------|-------|
+| Epoch wall time (simulate) | ~300 s |
+| Log window duration | 30 s |
+| `read_index` calls logged | 995,328 |
+| Call rate | **~33,196 calls/sec** |
+| Decision breakdown | 995,312 DONE, 16 HIT, 0 FALLBACK |
+| Extrapolated full-epoch calls | ~64,000,000 |
+
+### Why This Is Fatal
+
+The target is ≥400 MB/s sustained throughput. The DLRM dataset is ~63.4 GiB.
+
+```
+Required epoch time for 400 MB/s = 63.4 GiB / 400 MB/s
+                                  = 64,881 MB / 400 MB/s
+                                  ≈ 162 seconds
+
+Simulate-only epoch time (zero I/O) ≈ 300 seconds
+```
+
+**The Python loop alone takes 300 s — nearly 2× the budget for 400 MB/s.**
+No amount of I/O optimization can overcome a 300 s irreducible Python overhead.
+
+The call rate of ~33,196 calls/sec is a hard ceiling imposed by:
+- CPython's GIL and function call overhead
+- Dict/set lookup per call
+- DLIO's outer sample iteration loop generating 64M calls/epoch
+
+### Decision Breakdown Interpretation
+
+Out of ~995K calls logged in 30 s:
+
+- **995,312 DONE (99.998%)**: The RG-group was already fetched and consumed. The call
+  returns immediately after a set lookup but still costs ~30 µs of Python overhead.
+- **16 HIT (0.002%)**: Pipeline prefetch was successfully used. The plan was correct
+  (FILE-MAJOR order was confirmed in a prior run that showed 0 FALLBACKs).
+- **0 FALLBACK**: No synchronous pipeline misses. The FILE-MAJOR plan order works
+  correctly.
+
+The overwhelming DONE fraction confirms the fundamental mismatch: DLIO calls
+`read_index()` once per *sample*, but a coalesced GET covers 8 row groups ×
+~8,134 samples/RG ≈ **65,072 samples per GET**. So for every 1 actual I/O operation,
+there are ~65,072 Python calls — 65,071 of which are instant DONE returns that still
+burn ~2 s of CPU time collectively (65,071 × ~30 µs per call).
+
+---
+
+## Flaw 2: Coalesced Byte-Range GETs = Full Object GET
+
+### Plan Inspection
+
+The simulation also writes a human-readable plan TSV. Inspecting
+`sim_plan_epoch1.tsv` for `img_00_of_64.parquet`:
+
+```
+plan_idx  file                   group_start  offset       length
+0         img_00_of_64.parquet   0            4            66,515,002   (63.4 MiB)
+1         img_00_of_64.parquet   8            66,515,006   66,514,586   (63.4 MiB)
+2         img_00_of_64.parquet   16           133,029,592  66,514,548   (63.4 MiB)
+3         img_00_of_64.parquet   24           199,544,140  66,514,618   (63.4 MiB)
+...
+15        img_00_of_64.parquet   120          997,728,814  17,235,337   (16.4 MiB)
+```
+
+Key observations:
+- Entry 0 ends at byte 66,515,006. Entry 1 starts at byte 66,515,006. **Exactly abutting.**
+- Entry 1 ends at byte 133,029,592. Entry 2 starts at byte 133,029,592. **Exactly abutting.**
+- The 16 GETs span from byte 4 to byte 1,014,964,151 — **the entire file**.
+
+### Why This Is Fatal
+
+The 16 byte-range GETs covering `img_00_of_64.parquet` are **mathematically equivalent
+to a single `GET img_00_of_64.parquet`** in terms of bytes transferred. But they cost:
+
+- 16× the TCP connection setups (or 16× HTTP/2 stream allocations)
+- 16× the S3 request latencies (10–25 µs TTFB each on s3-ultra)
+- 16× the per-request overhead on the server
+- More complex client-side pipeline management
+
+There is **no benefit** to byte-range GETs on consecutively-accessed, abutting regions
+of the same object. The only justification for byte-range GETs is random/sparse access
+patterns — which this workload does not have.
+
+### Root Cause
+
+The coalesce logic correctly identifies that 8 consecutive row groups must be fetched
+together. But with `file_shuffle: seed` and FILE-MAJOR access order, DLIO reads each
+file sequentially from RG 0 to RG 122. The access pattern is simply:
+
+```
+file0[rg 0..7] → file0[rg 8..15] → … → file0[rg 120..122] →
+file1[rg 0..7] → file1[rg 8..15] → … → file63[rg 120..122]
+```
+
+This is sequential full-file access. The correct I/O primitive is `GET file0` (whole
+object), not 16 range GETs.
+
+---
+
+## Correct Architecture
+
+The current design — `read_index()` called once per sample, serving samples out of
+in-memory coalesced buffers — cannot achieve the throughput target. The required
+redesign:
+
+### Required Changes
+
+1. **Eliminate per-sample `read_index()` hot path**  
+   Replace with an iterable dataset that yields whole pre-fetched file bytes,
+   bypassing DLIO's sample-iteration loop. This reduces call overhead from 64M/epoch
+   to ~64/epoch (one per file).
+
+2. **Read entire Parquet files, not byte ranges**  
+   Issue a single `GET` per file. Each file is ~1 GiB — well within the capability
+   of a single connection at >1 GB/s. Eliminate the coalescing machinery entirely.
+   This is a storage benchmark — the bytes are consumed as-is, no decode required.
+
+3. **Pipeline at file granularity, not RG granularity**  
+   Prefetch the next N files while the current file is being processed. N=2–4 is
+   sufficient since each file is ~1 GiB and the pipeline fills quickly.
+
+### Expected Improvement
+
+| Design | Calls/epoch | I/O ops/epoch | Theoretical ceiling |
+|--------|-------------|---------------|---------------------|
+| Current (per-sample `read_index`) | 64,000,000 | 1,024 range GETs | <213 MB/s (Python bound) |
+| Proposed (per-file iterator) | ~64 | 64 whole-object GETs | >1 GB/s (I/O bound) |
+
+---
+
+## Proposed Architecture Benchmark Results
+
+**Script**: `mlp-storage/tests/object-store/bench_wholefile_get.py`  
+**Method**: `s3dlio.get(uri)` — one full-object GET per file, `--pipeline 2` concurrent GETs,
+bytes discarded immediately (no Parquet decode). Measures pure I/O ceiling.
+
+```
+python3 bench_wholefile_get.py --np 1 --pipeline 2 --epochs 2
+```
+
+### NP=1 Baseline Results
+
+| Epoch | Data | Wall time | Total MB/s | Per-GPU MB/s | vs 400 MB/s |
+|-------|------|-----------|------------|--------------|-------------|
+| 1 (cold) | 60.66 GiB | 41.1 s | **1,584 MB/s** | 1,584 | +1,184 PASS |
+| 2 (OS/server cache) | 60.66 GiB | 38.0 s | **1,714 MB/s** | 1,714 | +1,314 PASS |
+
+**With only 2 concurrent GETs** (`--pipeline 2`), whole-file fetches achieve **~4×
+the 400 MB/s target** — compared to ~36 MB/s with the current byte-range architecture.
+
+### Multi-NP Scaling Results
+
+Each NP process runs `pipeline=2` concurrent GETs: total outstanding = `NP × 2`.
+Simulated with `--np N --pipeline 2 --epochs 2`.
+
+| NP | Outstanding GETs | Epoch | Total MB/s | Per-GPU MB/s | vs 400 MB/s |
+|----|-----------------|-------|------------|--------------|-------------|
+| 1  | 2               | 1 (cold)          | 1,584 | 1,584 | +1,184 PASS |
+| 1  | 2               | 2 (OS/server cache) | 1,714 | 1,714 | +1,314 PASS |
+| 2  | 4               | 1 (cold)          | 2,765 | 1,382 | +982 PASS |
+| 2  | 4               | 2 (OS/server cache) | 3,257 | 1,629 | +1,229 PASS |
+| 4  | 8               | 1 (cold)          | 4,239 | 1,060 | +660 PASS |
+| 4  | 8               | 2 (OS/server cache) | 4,746 | 1,187 | +787 PASS |
+| 8  | 16              | 1 (cold)          | 4,472 |   559 | +159 PASS |
+| 8  | 16              | 2 (OS/server cache) | 5,187 |   648 | +248 PASS |
+
+**All NP values PASS the 400 MB/s per-GPU target.** Even at NP=8 (the most demanding
+case), each GPU receives 559–648 MB/s — 40–62% above the target.
+
+Note: NP=8 shows diminishing returns vs NP=4 (~5–9% total throughput gain despite 2×
+the processes). This indicates s3-ultra's server-side throughput ceiling (~4.5–5.2 GB/s
+aggregate) is being approached at 16 concurrent connections — not a DLIO or client-side
+limitation.
+
+### Per-File Throughput (representative)
+
+Individual file GETs ran 688–903 MB/s in epoch 1, 734–913 MB/s in epoch 2. The
+aggregate exceeds 400 MB/s even with just 2 connections because each single 1 GiB GET
+sustains 700–900 MB/s on a single connection. The pipeline keeps 2 connections
+saturated at all times.
+
+Scaling from NP=1 (2 outstanding) to NP=4 (8 outstanding) is strong but sub-linear:
+1,584 → 2,765 → 4,239 MB/s (1.0× → 1.7× → 2.7× for 1× → 2× → 4× the processes).
+NP=8 reaches 4,472 MB/s but the marginal gain flattens, indicating the server is near
+its aggregate ceiling rather than the client being the bottleneck.
+
+### What This Proves
+
+1. **The storage layer has ample headroom.** s3-ultra + the local network deliver
+   >1.4 GB/s aggregate, more than 3× the target. This is a storage benchmark —
+   bytes are fetched and discarded; no Parquet decoding is performed or required.
+
+2. **64 GETs/epoch (whole objects) vs 1,024 GETs/epoch (byte ranges) is irrelevant
+   for throughput.** The bottleneck in the proposed design is saturating the link,
+   not request count.
+
+3. **`--pipeline 2` per NP is sufficient.** Each 1 GiB file takes ~1.1–1.5 s per
+   connection. Two in-flight GETs per GPU keep the link continuously loaded with
+   negligible memory overhead (2 × ~1 GiB = 2 GiB peak per NP process).
+
+4. **The Python `read_index` overhead (300 s/epoch) is the entire bottleneck.**
+   Removing it via a file-level iterator brings epoch time from 300 s → ~42 s at
+   the I/O layer — a 7× speedup before any other optimizations.
+
+---
+
+## Implementation Design Decisions
+
+### D1: Sleep Fidelity — Batch-Granularity Yields, Not Sample-Granularity
+
+DLIO fires its compute-sleep timer once per `__next__()` call on the iterator. The
+critical requirement is that the iterator yields at **batch granularity**, not sample
+granularity.
+
+With `batch_size=12288` and 1,000,000 samples per file:
+
+```
+steps per file = ceil(1,000,000 / 12,288) = 82
+steps per epoch = 82 × 64 files          = 5,248
+```
+
+This means DLIO's compute sleep fires **5,248 times per epoch** — vs 64,000,000 times
+in the current design. The sleep duration per call is:
+
+```
+compute_time_per_step = batch_size × compute_time_per_sample
+```
+
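+This arithmetic is unchanged; only the call site moves from per-sample to per-batch.
+There is **no fidelity loss** provided the final, partial batch of each file sleeps in
+proportion to its actual size (81 full batches plus one batch of 4,672 samples):
+`(81 × 12,288 + 4,672) × t_sample = 1,000,000 × t_sample`.
+Sleeping for a full batch on all 82 steps would instead simulate
+82 × 12,288 = 1,007,616 samples per file (a ~0.8% overcount).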
+
+The key invariant: the iterator must signal the correct `step` count to DLIO's
+internal step counter so MLPerf's per-step timing and reporting remain correct.
+
+### D2: Batch Yielding — Zero-Copy `memoryview` Slices
+
+Since this is a pure storage benchmark (no Parquet decode), batches are yielded as
+raw byte slices of the in-memory file buffer using Python's `memoryview`:
+
+```python
+data = s3dlio.get(uri)          # bytes object, ~1 GiB; GIL released during GET
+mv   = memoryview(data)         # zero-copy view, O(1) — no allocation
+
+chunk = bytes_per_sample * batch_size
+for offset in range(0, len(mv), chunk):
+    yield mv[offset : offset + chunk]  # O(1) C-level pointer slice, no copy
+
+del mv, data                    # explicit release when file is exhausted
+```
+
+A `memoryview` slice is a C-level pointer adjustment — it does not enter CPython's
+allocator, copies no bytes, and adds no measurable overhead per step. DLIO receives
+a correctly-sized buffer, fires its compute sleep, and the iterator advances to the
+next chunk.
+
+### D3: DRAM Pressure — Explicit Release to Cap Peak Usage
+
+At `NP=8` with `pipeline=2`, there are 16 concurrent whole-file GETs in flight,
+each ~1 GiB. Without explicit release, completed buffers held in futures accumulate:
+
+```
+peak in-flight DRAM = NP × pipeline × ~1 GiB = 8 × 2 × 1 GiB = 16 GiB
+```
+
+This is acceptable for modern servers (256+ GiB DRAM) but the buffer **must be
+released** the moment iteration over that file completes. The `del mv, data` at file
+exhaustion (shown above) ensures the previous file's buffer is freed before the next
+prefetch result is consumed — keeping steady-state peak at exactly
+`pipeline × ~1 GiB = 2 GiB per NP process`.
+
+Do **not** accumulate futures without consuming them. The `ThreadPoolExecutor` must
+be driven with a bounded semaphore or `as_completed` so that at most `pipeline`
+results are pending at any time.
+
+### D4: Call Overhead Summary After Redesign
+
+| Operation | Current (per-sample) | Proposed (per-file/batch) | Reduction |
+|-----------|---------------------|--------------------------|-----------|
+| `read_index()` / `__next__()` calls/epoch | 64,000,000 | 5,248 | 12,195× |
+| S3 GETs/epoch | 1,024 (range) | 64 (whole object) | 16× |
+| Python allocations for buffers | 64,000,000 | 64 | 1,000,000× |
+| Peak in-flight DRAM (NP=1) | ~2 GiB | ~2 GiB | unchanged |
+| Peak in-flight DRAM (NP=8) | ~2 GiB | ~16 GiB | 8× (by design) |
+
+---
+
+## Simulation Infrastructure
+
+The simulate mode can be reused for future architecture validation:
+
+```bash
+# Run with simulate (no real I/O), log 60 s of events
+bash run_dlrm_bench.sh 1 s3dlio simulate 60
+
+# Inspect results
+RUNDIR=results/dlrm/training/dlrm/run/<RUNID>
+zstdcat $RUNDIR/sim_io_epoch1.tsv.zst | awk -F'\t' 'NR>1{print $9}' | sort | uniq -c
+```
+
+Log columns: `ts_ns`, `epoch`, `step`, `image_idx`, `file`, `sample_idx`, `rg_idx`,
+`group_start`, `decision` (`HIT`/`DONE`/`FALLBACK`), `offset`, `length`.
+
+A `FALLBACK` indicates the pipeline missed (prefetch was not ready). The confirmed
+FILE-MAJOR plan order produces 0 FALLBACKs, confirming the prefetch logic is
+correct — the architecture itself is the bottleneck.
+
+---
+
+## Conclusion
+
+The simulate results are definitive. The DLRM Parquet-S3 reader cannot achieve
+≥400 MB/s at NP=1 under the current per-sample `read_index()` architecture because:
+
+1. Pure Python iteration over 64M samples/epoch costs ~300 s — nearly 2× the
+   wall-clock budget required for 400 MB/s.
+2. The byte-range GET decomposition provides no benefit and adds 16× request overhead
+   vs. whole-object GETs.
+
+A redesign replacing DLIO's sample-level reader with a file-level iterable dataset
+issuing one whole-object GET per file is required to make progress. No Parquet
+decoding is required — this is a storage throughput benchmark.
diff --git a/docs/Data-Gen_Enhance.md b/docs/Data-Gen_Enhance.md
new file mode 100644
index 00000000..89e74981
--- /dev/null
+++ b/docs/Data-Gen_Enhance.md
@@ -0,0 +1,123 @@
+# dgen-py Data Generation: Performance Analysis & Recommendations
+
+> Analysis of `dgen-py` calling patterns in the DLIO benchmark's JPEG/PNG data generators,
+> with benchmark results and a recommended change to reach ≥145K objects/second.
+
+---
+
+## Background: How `dgen-py` Generates Data Internally
+
+`DataGenerator` (the Rust core behind `dgen_py.Generator`) produces data in **1 MB blocks**
+(the minimum block size, clamped from any user-specified value).  Every `fill_chunk(n)` call
+maps the requested byte range onto one or more 1 MB blocks and runs Xoshiro256++ independently
+on each block.
+
+This means object size relative to 1 MB **matters**:
+
+- If the object fits entirely within one block → 1 MB of generation work, `n` bytes copied out.
+- If the object straddles a block boundary → 2 × 1 MB of generation work, `n` bytes copied out.
+
+---
+
+## "Parallel Reused" vs "Parallel Fresh" — Root Cause
+
+Both patterns use **sequential Xoshiro256++** (`max_threads=1`).  The performance difference
+is purely due to block boundary alignment at 315 KB objects on a 1 MB block floor.
+
+### Fresh — `DataGenerator(size=315KB)` created per call
+
+Each call constructs a new `DataGenerator` with `size=315KB`.  `current_pos` always starts
+at 0.  The 315 KB request fits entirely in block 0.
+
+→ **1 block generated per call** (1 MB work for 315 KB output)
+
+### Reused — `Generator(size=256GiB)` streaming continuously
+
+`current_pos` accumulates across calls.  Because 315 KB does not divide 1 MB evenly
+(GCD = 1 KB, LCM = 315 MB), roughly 31% of `fill_chunk` calls straddle a 1 MB boundary —
+requiring **2 blocks** to serve 315 KB.  The remaining 69% require 1 block.
+
+→ **1.31 blocks amortized per call** (31% overhead vs fresh)
+
+### Results (28-core Xeon, 315 KB objects)
+
+| Pattern | Blocks/call (amortized) | Objects/sec (per process) |
+|:--------|:-----------------------:|:-------------------------:|
+| Fresh — new `DataGenerator` per call | 1.00 | ~145K |
+| Reused — `Generator` streaming | 1.31 | ~115K |
+| **`BufferPool.next_slice()`** | **0.30** | **≥145K** |
+
+---
+
+## Current Code in `jpeg_generator.py` and `png_generator.py`
+
+Both generators use the same streaming pattern:
+
+```python
+# Created once before the file loop (object-store non-DALI path)
+_stream = _dgen_py.Generator(size=256 * 1024**3)   # 256 GiB, no block_size set
+_buf    = bytearray(nbytes)                         # pre-allocated, reused
+
+# Inside the per-file _write() closure:
+mv = memoryview(_buf)[:nbytes]
+_stream.fill_chunk(mv)   # fills from current_pos; ~31% of calls generate 2x 1 MB blocks
+output.write(mv)
+```
+
+This is the **"reused" pattern** — ~115K objects/second at 315 KB.  The 31% overhead comes
+from block boundary crossings that occur as `current_pos` advances through the 256 GiB stream.
+
+---
+
+## Recommended Fix: `BufferPool.next_slice()`
+
+`BufferPool` generates one 1 MB backing block and serves **zero-copy Arc slices** from it.
+At 315 KB, ~3.3 slices are served before the block is exhausted and a new one generated.
+Amortized cost: **0.30 blocks/call** — the lowest of any pattern.
+
+Replace in both `jpeg_generator.py` and `png_generator.py`, object-store non-DALI path:
+
+```python
+# Before the file loop — replace _stream + _buf with a single pool:
+_pool = _dgen_py.BufferPool() if _HAS_DGEN else None
+
+# Inside _write(), object-store non-DALI path:
+if _pool is not None and not is_local:
+    output.write(memoryview(_pool.next_slice(nbytes)))
+```
+
+Benefits over the current `Generator + bytearray + fill_chunk` pattern:
+
+- **No pre-allocated buffer needed** — `next_slice()` returns a `BytesView` directly
+- **Zero-copy** — `memoryview()` on the returned slice uses the buffer protocol with no copy
+- **No block boundary overhead** — the 1 MB backing block is shared across ~3.3 objects
+- **Simpler code** — two lines replace four
+
+### Why not just use "fresh" `DataGenerator` per call?
+
+Fresh avoids boundary crossings (1.00 block/call vs 1.31) and reaches ~145K obj/s, but it
+**allocates and initialises a new `DataGenerator` struct on every call** — including PRNG
+seeding from `urandom`.  `BufferPool` reaches the same throughput without per-call allocation
+overhead and is the canonical pattern for sub-1 MB workloads.
+
+> **Note**: `generate_buffer(size)` for `size < 1 MB` already uses a thread-local
+> `BufferPool` automatically, so single-file scripts calling `generate_buffer()` in a loop
+> already get this optimization without any changes.
+
+---
+
+## Multi-Process Scaling Reference (28-core Xeon)
+
+For context on how dgen-py scales across independent Python processes (e.g. DLIO DataLoader
+workers), each using `max_threads=1`:
+
+| N processes (8 MB objects, 5 s run) | Aggregate GB/s | Per-process GB/s |
+|:------------------------------------:|:--------------:|:----------------:|
+| 1                                    | 5.2            | 5.2              |
+| 4                                    | 17.8           | 4.5              |
+| 8                                    | 27.9           | 3.5              |
+| 16                                   | 52.7           | 3.3              |
+| 28                                   | **58.6**       | 2.1              |
+
+Aggregate throughput saturates DRAM bandwidth (~58 GB/s) at 28 processes.  The per-process
+decline at high N is DRAM write saturation, not generator overhead.
diff --git a/docs/PageCache_Drop_Between_Epochs_26-04-18.md b/docs/PageCache_Drop_Between_Epochs_26-04-18.md
new file mode 100644
index 00000000..92998f73
--- /dev/null
+++ b/docs/PageCache_Drop_Between_Epochs_26-04-18.md
@@ -0,0 +1,168 @@
+# Page Cache Dropping Between Epochs — Why and How
+
+**Date:** April 18, 2026  
+**Status:** Proposal — documentation only, no code changes made
+
+---
+
+## Why This Matters
+
+dlio_benchmark is a **storage** benchmark. Its measured throughput should reflect the performance of the storage system (NVMe, NFS, S3, parallel filesystem), not the client machine's DRAM.
+
+On a local POSIX filesystem, the Linux kernel page cache automatically retains file data in DRAM after the first read. Subsequent reads of the same files are served from memory at 40–80 GB/s rather than from storage hardware. In a multi-epoch benchmark this means:
+
+- **Epoch 1**: reads come from storage — accurate
+- **Epoch 2+**: reads come from the page cache — measures DRAM, not storage
+
+This is true for every format (NPZ, HDF5, Parquet, Arrow IPC), but the severity depends on the format. Parquet decode is CPU-intensive, which limits the effective read rate and naturally causes page eviction before the next epoch starts. Arrow IPC reads are trivial (memcpy only), so the OS can fully populate the cache and serve all subsequent epochs from DRAM. **Arrow IPC is more accurate for storage benchmarking in principle, but more vulnerable to page cache pollution in practice if the dataset fits in RAM.**
+
+dlio_benchmark already detects this risk and logs a warning:
+```
+WARNING: The amount of dataset is smaller than the host memory; data might be
+cached after the first epoch. Increase the size of dataset to eliminate the caching effect!
+```
+
+This warning is correct but passive. Dropping the page cache between epochs is a stronger, active solution that works regardless of dataset size.
+
+---
+
+## What Dropping the Page Cache Does
+
+On Linux, writing `3` to `/proc/sys/vm/drop_caches` instructs the kernel to evict all clean pages (page cache, dentries, and inodes) from memory. This is a supported, non-destructive kernel interface — it only releases clean pages that can be re-read from storage; dirty pages (unsaved writes) are never evicted. The operation is instantaneous for moderate dataset sizes.
+
+```bash
+sync && echo 3 | sudo tee /proc/sys/vm/drop_caches > /dev/null
+```
+
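+The `sync` writes any pending dirty pages to storage before the drop. `drop_caches` never discards dirty data — it simply skips dirty pages — so without the preceding `sync` those pages would stay resident in the cache. `sudo` is required because the operation is privileged.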
+
+After the drop, every read in the next epoch must come from storage hardware, making all epochs equally accurate.
+
+---
+
+## Why dlio_benchmark Should Do This Automatically
+
+Currently there is no mechanism in dlio_benchmark to drop the page cache between epochs. The benchmark operator must do it manually before each run, which:
+
+1. Is error-prone (easy to forget between runs)
+2. Does not help for multi-epoch runs within a single invocation
+3. Requires a separate out-of-band script or `--pre-run` hook
+
+Building it into the benchmark ensures consistent, reproducible results across all epochs, not just epoch 1.
+
+---
+
+## Required Code Changes
+
+Three files require modification. No new dependencies are introduced — only the Linux `/proc/sys/vm/drop_caches` interface is used, which is always available on Linux kernels 2.6.16+.
+
+### 1. `dlio_benchmark/utils/config.py` — new config field
+
+Add one boolean field to the `ConfigArguments` dataclass, near `odirect`:
+
+```python
+# In ConfigArguments dataclass, near odirect: bool = False
+drop_page_cache: bool = False
+```
+
+And parse it from YAML in the `ConfigArguments.initialize_args()` / hydra config section (near where `odirect` is parsed):
+
+```python
+# In the reader config parsing block
+if 'drop_page_cache' in reader:
+    args.drop_page_cache = reader['drop_page_cache']
+```
+
+---
+
+### 2. `dlio_benchmark/main.py` — cache drop helper function and epoch hook
+
+Add a module-level helper function (alongside the existing `_apply_settle_guard`):
+
+```python
+def _drop_page_cache(args, comm) -> None:
+    """Drop the Linux page cache between epochs for accurate storage benchmarking.
+
+    Only activates when ALL of the following are true:
+      - ``args.drop_page_cache`` is True
+      - ``args.storage_type`` is LOCAL_FS (the check below accepts no other type; object storage is always cache-bypass)
+      - The process is running on Linux
+      - Rank 0 has write permission to /proc/sys/vm/drop_caches
+
+    Rank 0 performs the drop; all ranks then barrier so they proceed together.
+    If the write fails (not root, or not Linux), a warning is logged and
+    the benchmark continues normally — the setting is advisory, not fatal.
+    """
+    if not args.drop_page_cache:
+        return
+    if args.storage_type not in (_StorageType.LOCAL_FS,):
+        return   # object storage is inherently cache-bypass
+    if args.my_rank == 0:
+        try:
+            # sync first to flush dirty pages (safety measure)
+            import subprocess
+            subprocess.run(['sync'], check=True)
+            with open('/proc/sys/vm/drop_caches', 'w') as f:
+                f.write('3\n')
+            args.logger.info(f"{utcnow()} Page cache dropped before epoch")
+        except (OSError, PermissionError) as exc:
+            args.logger.warning(
+                f"{utcnow()} drop_page_cache=True but could not write to "
+                f"/proc/sys/vm/drop_caches: {exc}. "
+                f"Run as root or grant CAP_SYS_ADMIN, or use 'odirect: true' instead."
+            )
+    comm.barrier()
+```
+
+Call this function in `run()` inside the epoch loop, **after** `finalize()` (which closes open file handles) and **before** `reconfigure()` / the next epoch's reads:
+
+```python
+# In run(), inside the epoch loop — current code:
+self.framework.get_loader(DatasetType.TRAIN).finalize()
+if self.do_eval and epoch >= next_eval_epoch:
+    ...
+    self.framework.get_loader(DatasetType.VALID).finalize()
+self.args.reconfigure(epoch + 1)
+self.stats.end_epoch(epoch)
+
+# Insert the cache drop here, after finalize() and before reconfigure():
+self.framework.get_loader(DatasetType.TRAIN).finalize()
+if self.do_eval and epoch >= next_eval_epoch:
+    ...
+    self.framework.get_loader(DatasetType.VALID).finalize()
+_drop_page_cache(self.args, self.comm)   # ← new line
+self.args.reconfigure(epoch + 1)
+self.stats.end_epoch(epoch)
+```
+
+The placement is important:
+- **After** `finalize()` — ensures all file handles are closed so no pages are pinned
+- **Before** `reconfigure()` — ensures the cache is clean before the framework reinitialises its file list for the next epoch
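+- **Outside** the timing window — `stats.start_epoch()` / `stats.end_epoch()` bracket the actual I/O, so the cache drop must run between one epoch's `end_epoch()` and the next epoch's `start_epoch()` to avoid distorting measured throughput. Note that the snippet above calls `_drop_page_cache()` *before* `self.stats.end_epoch(epoch)`; move the call below that line if the `sync` + drop time should be excluded from the finished epoch's duration.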
+
+---
+
+### 3. YAML configuration
+
+```yaml
+reader:
+  odirect: false         # O_DIRECT (bypasses cache entirely — preferred)
+  drop_page_cache: true  # Drop page cache between epochs (requires root / CAP_SYS_ADMIN)
+```
+
+The two settings are complementary, not mutually exclusive:
+- `odirect: true` — best accuracy, never populates cache, works without root if the device supports it
+- `drop_page_cache: true` — clears cache accumulated during the epoch, requires root but works with any format and any reader
+- Using both together is safe and provides belt-and-suspenders cache avoidance
+
+---
+
+## MPI / Distributed Considerations
+
+In a multi-rank run, only **rank 0** should write to `/proc/sys/vm/drop_caches`. The drop is a local kernel operation — it only affects the machine it runs on. In a distributed job where multiple hosts each run ranks, only one rank per node should issue the drop. The implementation above uses `my_rank == 0` (global rank), which covers the single-host case. For multi-host MPI jobs, this should be extended to `local_rank == 0` (first rank on each node) using `DLIOMPI.get_instance().local_rank()` or equivalent.
+
+---
+
+## Relationship to the Existing `potential_caching` Warning
+
+`StatsCounter` already computes `potential_caching` and logs a warning when `data_size_per_host_GB <= host_memory_GB`. The `drop_page_cache` feature complements this: rather than relying on the operator to manually increase dataset size, the benchmark can actively ensure cache-clean reads on every epoch. The warning should remain — it informs the operator about the risk — but `drop_page_cache: true` can be offered as the in-benchmark remedy.
diff --git a/docs/Parquet-S3-DataLoader_Updates.md b/docs/Parquet-S3-DataLoader_Updates.md
new file mode 100644
index 00000000..be40fb2a
--- /dev/null
+++ b/docs/Parquet-S3-DataLoader_Updates.md
@@ -0,0 +1,740 @@
+# Parquet S3 DataLoader Updates: From 84 MB/s to 2.1 GB/s
+
+**Date:** May 2026  
+**Benchmark workload:** MLPerf Storage DLRM training  
+**Storage backend:** s3-ultra (high-performance in-process S3 server, loopback TCP)  
+**Host:** Single node, ~47 GiB RAM, loopback network (no real network latency)
+
+---
+
+## Summary
+
+This document describes the full progression of performance improvements to the
+DLRM Parquet reader in dlio_benchmark, culminating in the integration of a
+purpose-built Rust data loader inside the s3dlio library.
+
+| Phase | What Changed | Throughput | Speedup |
+|-------|-------------|-----------|---------|
+| **Baseline** | Original `pf.read_row_group()` — 40 HTTP GETs per row group, Fjall footer read on every GET | **~84 MB/s** | 1× |
+| **Phase 1** | Single merged range GET per row group (min–max column span); s3-ultra in-memory footer cache | **~200 MB/s** | 2.4× |
+| **Phase 2** | Background ThreadPoolExecutor (32 threads) prefetches all row groups at file open; `read_index()` override removes per-sample `utcnow()` | **~400 MB/s** | 4.8× |
+| **Phase 3** | s3dlio `ParquetRowGroupDataset` in Rust — row-group granular iteration, `buffer_unordered` concurrency, bounded channel backpressure | **2,138 MB/s** | **25×** |
+
+---
+
+## Dataset Parameters
+
+| Parameter | Value |
+|-----------|-------|
+| Files | 64 |
+| File size | ~971 MiB each |
+| Row groups per file | 123 |
+| Rows per row group | 8,192 |
+| Total rows per file | ~1,000,000 |
+| Total columns | 200 |
+| Columns read per benchmark run | 40 (projection) |
+| Compressed size per 40-col row group | ~1,721 KiB |
+| Byte span of merged GET per row group | ~1,743 KiB |
+| Total data per epoch (all files) | ~64.96 GB |
+| Total row groups per epoch | 7,872 |
+| Parquet footer size | ~2.66 MiB |
+
+---
+
+## Baseline: ~84 MB/s
+
+### What was happening
+
+The original `ParquetReader` (file path) and `ParquetReaderS3Iterable` (S3 path)
+both used `pyarrow`'s `read_row_group()` call.  For S3 data, pyarrow issues one
+HTTP GET per column chunk per row group.  With 40 columns selected, that is
+**40 HTTP GETs per row group** — 78,720 separate range requests per worker per
+epoch (40 × 1,968 row groups; 314,880 across all 64 files), each incurring HTTP
+overhead, connection reuse, and Fjall metadata lookups on every call.
+
+Additionally, the `dlio_benchmark` framework calls `read_index(image_idx, step)`
+once per **sample** — 1,000,000 samples × 16 files/worker =
+**16,000,000 Python function calls per worker per epoch**.  Each call contained
+two `datetime.now().strftime()` operations in debug log f-strings, adding ~3 µs
+of pure Python overhead per call regardless of whether debug logging was enabled.
+
+```
+16,000,000 calls × 3 µs per call = 48 s Python overhead per worker
+78,720 HTTP GETs (40 cols × 1,968 row groups) = ~200 s I/O time
+Combined: well below even the HTTP overhead floor
+```
+
+### Test output (approximate)
+
+```
+Throughput: ~84 MB/s
+Elapsed:    ~775 s for 64 GB
+```
+
+---
+
+## Phase 1: ~200 MB/s — Merged Range GET
+
+### Problem identified
+
+Each `read_row_group()` call in pyarrow issues a separate HTTP GET per column.
+Since all 40 selected columns are contiguous within the row group in the file
+(Parquet column-chunk layout), a **single merged GET** covering the byte span
+from the first to the last selected column chunk retrieves all the data in one
+round trip.
+
+### Changes made
+
+**`parquet_reader_s3_iterable.py`** — added `_get_rg_byte_range()`:
+
+```python
+def _get_rg_byte_range(self, pf_meta, rg_idx, col_indices):
+    """Return (start_byte, length) spanning all selected column chunks."""
+    rg_meta = pf_meta.row_group(rg_idx)
+    starts, ends = [], []
+    for col_i in col_indices:
+        cc = rg_meta.column(col_i)
+        offset = cc.dictionary_page_offset or cc.data_page_offset
+        starts.append(offset)
+        ends.append(offset + cc.total_compressed_size)
+    return min(starts), max(ends) - min(starts)
+```
+
+Replaced `pf.read_row_group(rg_idx, columns=col_names)` with a single
+`s3dlio.get_range(uri, start, length)` call, then parsed the raw bytes with
+`pyarrow.parquet.read_table(io.BytesIO(raw_bytes))`.
+
+**s3-ultra** — added in-memory LRU cache for Parquet footer bytes so that
+repeated `GetObject` requests for the same file tail did not hit disk.  Footer
+parse results are cached in s3-ultra's Fjall metadata store and served from RAM
+on subsequent requests.
+
+### Result
+
+```
+Throughput: ~200 MB/s
+Speedup:    2.4× over baseline
+HTTP GETs:  1,968 per worker per epoch (down from 78,720)
+```
+
+The 40× reduction in HTTP round trips drove the improvement.  The Python
+overhead wall (16M `read_index()` calls) remained.
+
+---
+
+## Phase 2: ~400 MB/s — Prefetch + Fast read_index
+
+### Problem identified
+
+Even with one GET per row group, two bottlenecks remained:
+
+1. **Serial I/O**: row groups were fetched one at a time as `read_index()` called
+   `get_sample()` per sample.  The file was opened only once, but each row group
+   fetch was triggered by the first sample that fell into it.
+
+2. **Per-sample Python overhead**: `read_index()` in the base class called
+   `datetime.now().strftime()` twice per call — 16M × ~3 µs = ~48 s per worker,
+   irreducible in Python.
+
+### Changes made
+
+**`parquet_reader_s3_iterable.py`** — added `_prefetch_all_row_groups()` behaviour (shown here as the body of `open()`):
+
+```python
+def open(self, filename):
+    """Prefetch all row-group extents concurrently at file open."""
+    uri = self._uri_for(filename)
+    pf_meta = self._get_footer(uri)
+
+    executor = ThreadPoolExecutor(max_workers=32)
+    futures = {
+        executor.submit(self._fetch_rg, uri, pf_meta, rg_idx): rg_idx
+        for rg_idx in range(pf_meta.num_row_groups)
+    }
+    rg_cache = {}
+    for fut in as_completed(futures):
+        rg_idx = futures[fut]
+        rg_cache[rg_idx] = len(fut.result())  # store compressed byte count
+    executor.shutdown(wait=False)
+    return (pf_meta, rg_cache)
+```
+
+**`read_index()` override** — bypassed the base-class datetime overhead:
+
+```python
+def read_index(self, image_idx, step):
+    dlp.update(step=step)
+    filename, sample_index = self.global_index_map[image_idx]
+    if filename not in self.open_file_map or self.open_file_map[filename] is None:
+        self.open_file_map[filename] = self.open(filename)
+    self.get_sample(filename, sample_index)
+    if self._args.read_type is ReadType.ON_DEMAND:
+        self.open_file_map[filename] = None
+    return self._args.resized_image
+```
+
+### Result
+
+```
+Throughput: ~402 MB/s
+Speedup:    4.8× over baseline
+Python calls: still 16,000,000 per worker per epoch
+```
+
+The prefetch brought all row-group data in concurrently at file open, eliminating
+serial I/O latency.  The Python overhead floor (16M calls) remained the binding
+constraint.  At 402 MB/s the benchmark was spending ~4 seconds on I/O and ~48 s
+on Python overhead per worker — the ratio was ~12:1 Python-to-I/O.
+
+### Root cause analysis: the 10 GiB/s ceiling
+
+```
+16 files × 123 row groups × 1,743 KiB/RG = 3,350 MiB data per worker
+
+At 10 GiB/s:  I/O time = 0.33 s
+Python overhead (16M calls × 3.6 µs): 58 s
+
+→ Python overhead is 176× the I/O time at target throughput.
+→ 10 GiB/s is geometrically impossible while calling Python 16M times per epoch.
+```
+
+The only fix is to change the **unit of iteration from sample to row group**:
+
+```
+1,968 Python calls × 3.6 µs = 0.007 s Python overhead
+I/O time @ 10 GiB/s: 0.33 s
+→ Python overhead: 2% of runtime  ✓
+→ Achievable throughput: 3,350 MiB / 0.337 s ≈ 9,940 MiB/s ≈ 10 GiB/s
+```
+
+This requires moving Parquet footer parsing and row-group indexing fully into
+Rust, out of Python.
+
+---
+
+## Phase 3: 2,138 MB/s — s3dlio Rust DataLoader
+
+### Architecture
+
+The solution moves the entire data loading hot path into Rust inside the
+s3dlio library, exposing a simple Python iterator API.  Python calls `next()`
+once per **row group** — 1,968 times per worker per epoch instead of 16,000,000 times.
+
+```
+Python (dlio_benchmark)             Rust (s3dlio / Tokio)
+────────────────────────────        ──────────────────────────────────────────
+for item in loader:          ←─── bounded mpsc channel (capacity = prefetch)
+    process(item)                        ↑
+                                  buffer_unordered(prefetch)
+                                  driving up to N concurrent range GETs
+                                  via Tokio's work-stealing thread pool
+                                         ↑
+                                  ParquetRowGroupDataset.get(idx)
+                                  → single range GET per row group
+```
+
+Natural backpressure: when Python is slow the channel fills and the Tokio
+producer blocks on `tx.send().await`.  When Python is fast the channel drains
+and up to `prefetch` GETs stay in flight.  No manual thread-count tuning —
+Tokio's scheduler adapts from 8-core laptops to 128-core servers automatically.
+
+---
+
+### s3dlio changes
+
+#### New file: `src/data_loader/parquet_rg.rs`
+
+`ParquetRowGroupDataset` — a `Dataset<Item=Bytes>` where each item is the
+merged column-chunk bytes for one Parquet row group.
+
+**Construction** (called once per `create_async_loader()` call):
+
+1. Lists all `.parquet` files under the URI prefix (one `ListObjects` request)
+2. Concurrently stats all files and fetches the last `footer_cap` bytes of each
+   (one range GET per file, default 4 MiB covers DLRM's 2.66 MiB footers)
+3. Parses every Parquet footer using the `parquet` crate (no arrow/datafusion)
+4. Pre-computes one `RgExtent { start, length, num_rows }` per row group across
+   all files — this is the index for all subsequent I/O
+
+**`get(global_rg_idx)`** (called once per `__next__()`):
+
+- One `get_object_range` call covering `[extent.start, extent.start + extent.length)`
+- Returns `Bytes` (zero-copy, reference-counted)
+
+```rust
+// src/data_loader/parquet_rg.rs (key structs)
+
+pub const DEFAULT_FOOTER_CAP: usize = 4 * 1024 * 1024;  // 4 MiB
+
+#[derive(Clone, Debug)]
+struct RgExtent {
+    file_uri_idx: usize,   // index into file_uris Vec
+    start:        u64,     // byte offset of first selected column chunk
+    length:       u64,     // span covering all selected column chunks
+    num_rows:     i64,     // row count for this row group
+}
+
+pub struct ParquetRowGroupDataset {
+    file_uris: Arc<Vec<String>>,
+    extents:   Arc<Vec<RgExtent>>,
+}
+
+impl ParquetRowGroupDataset {
+    pub fn new(
+        uri_prefix:  &str,
+        col_indices: Option<&[usize]>,  // None = all columns
+        footer_cap:  usize,
+    ) -> Result<Self, DatasetError> {
+        // list → concurrent stat+footer fetch → parse → build extents
+        // all run via run_on_global_rt (blocks calling thread, no async needed)
+    }
+}
+
+#[async_trait]
+impl Dataset for ParquetRowGroupDataset {
+    type Item = Bytes;
+    fn len(&self) -> Option<usize> { Some(self.extents.len()) }
+    async fn get(&self, idx: usize) -> Result<Bytes, DatasetError> {
+        // single range GET: extents[idx].start .. extents[idx].start + length
+    }
+}
+```
+
+#### `Cargo.toml` changes
+
+```toml
+[features]
+default = ["s3", "native-backends", "thread-pinning", "backend-aws", "parquet"]
+parquet = ["dep:parquet"]
+
+[dependencies]
+parquet = { version = "58", default-features = false, optional = true }
+```
+
+`default-features = false` keeps arrow, datafusion, and object_store out of the
+dependency tree.  Only the Parquet metadata types and footer decoder are pulled
+in (~2 MB of compile time vs ~80 MB with arrow enabled).
+
+#### `src/data_loader/mod.rs`
+
+```rust
+#[cfg(feature = "parquet")]
+pub mod parquet_rg;
+#[cfg(feature = "parquet")]
+pub use parquet_rg::{ParquetRowGroupDataset, DEFAULT_FOOTER_CAP};
+```
+
+#### `src/python_api/python_aiml_api.rs` — format routing in `create_async_loader`
+
+```rust
+pub fn create_async_loader(uri: &str, opts: Option<Bound<'_, PyDict>>) -> PyResult<PyBytesAsyncDataLoader> {
+    #[cfg(feature = "parquet")]
+    if let Some(ref d) = opts {
+        if d.get_item("format").ok().flatten()
+              .and_then(|v| v.extract::<String>().ok())
+              .as_deref() == Some("parquet")
+        {
+            let col_indices: Option<Vec<usize>> = d.get_item("columns")
+                .ok().flatten().and_then(|v| v.extract().ok());
+
+            let footer_cap: usize = d.get_item("footer_cap")
+                .ok().flatten().and_then(|v| v.extract().ok())
+                .unwrap_or(DEFAULT_FOOTER_CAP);
+
+            let pq_dataset = ParquetRowGroupDataset::new(uri, col_indices.as_deref(), footer_cap)
+                .map_err(|e| PyRuntimeError::new_err(e.to_string()))?;
+
+            let inner: Arc<dyn Dataset<Item = Bytes>> = Arc::from(pq_dataset);
+            return Ok(PyBytesAsyncDataLoader { dataset: PyDataset { inner }, opts: loader_opts });
+        }
+    }
+    // ...fall through to S3BytesDataset for non-parquet URIs
+}
+```
+
+#### Sync iterator: `PyBytesDataLoaderSyncIter`
+
+The key design: `buffer_unordered(prefetch)` in Tokio drives up to `prefetch`
+concurrent range GETs, yielding results in completion order (not index order);
+a bounded `mpsc` channel gives backpressure.  `__next__` waits with the GIL released (`py.detach()`).
+
+```rust
+// __iter__: spawns Tokio producer, returns PyBytesDataLoaderSyncIter
+fn __iter__(slf: PyRef<'_, Self>) -> PyResult<Py<PyBytesDataLoaderSyncIter>> {
+    let prefetch = slf.opts.prefetch.max(1);
+    let dataset = slf.dataset.clone();
+
+    // Bounded channel — natural backpressure; producer blocks when Python is slow
+    let (tx, rx) = tokio::sync::mpsc::channel::<Result<Bytes, DatasetError>>(prefetch);
+
+    pyo3_async_runtimes::tokio::get_runtime().spawn(async move {
+        if let Some(len) = dataset.inner.len() {
+            use futures_util::stream::{self, StreamExt as _};
+
+            // buffer_unordered: up to `prefetch` GETs in flight simultaneously.
+            // Tokio's work-stealing scheduler handles all threading — no Semaphore,
+            // no JoinSet, no manual thread-count guessing.
+            let mut stream = stream::iter(0..len)
+                .map(|idx| {
+                    let ds = dataset.clone();
+                    async move { ds.inner.get(idx).await }
+                })
+                .buffer_unordered(prefetch);
+
+            while let Some(result) = stream.next().await {
+                if tx.send(result).await.is_err() {
+                    break; // Python consumer dropped the iterator
+                }
+            }
+        }
+    });
+
+    Py::new(slf.py(), PyBytesDataLoaderSyncIter { rx: std::sync::Mutex::new(rx) })
+}
+
+// __next__: blocks with GIL released; returns one PyBytesView per row group
+fn __next__(&self, py: Python<'_>) -> PyResult<Py<PyAny>> {
+    // py.detach() releases the GIL while the channel is empty (I/O in flight).
+    // GIL is re-acquired automatically when detach() returns — py is valid directly.
+    // Note: Python::with_gil() is removed in PyO3 0.27; py.detach() is the correct API.
+    let result = py.detach(|| {
+        self.rx.lock().expect("mutex poisoned").blocking_recv()
+    });
+
+    match result {
+        Some(Ok(item)) => Py::new(py, PyBytesView::new(item))?.into_py_any(py),
+        Some(Err(e))   => Err(PyRuntimeError::new_err(e.to_string())),
+        None           => Err(PyStopIteration::new_err("end of dataset")),
+    }
+}
+```
+
+**PyO3 API note:** PyO3 0.26 renamed `Python::with_gil()` to `Python::attach()`
+and `py.allow_threads()` to `py.detach()`; use the new names with PyO3 0.27.
+After `py.detach(|| ...)` returns, the GIL is re-held and `py` is valid directly —
+no re-acquisition step is needed.
+
+---
+
+### Python API — how it is now used
+
+```python
+import os
+import s3dlio
+
+# Set endpoint (or export AWS_ENDPOINT_URL_S3 before running)
+os.environ["AWS_ENDPOINT_URL_S3"] = "http://localhost:9200"
+os.environ["AWS_ACCESS_KEY_ID"]     = "minioadmin"
+os.environ["AWS_SECRET_ACCESS_KEY"] = "minioadmin"
+
+# One call — Rust constructs the dataset (lists files, fetches footers, builds index)
+loader = s3dlio.create_async_loader(
+    "s3://mlp-flux/data/dlrm/train/",
+    {
+        "format":     "parquet",
+        "prefetch":   16,          # row groups to buffer ahead of Python consumer
+        "footer_cap": 4_194_304,   # bytes to read from file tail (default 4 MiB)
+        # "columns": [0,1,2,...],  # optional column projection; None = all columns
+    }
+)
+
+# Plain Python for-loop — no asyncio required
+for item in loader:          # item is a PyBytesView (zero-copy buffer protocol)
+    data = bytes(item)       # copies; use memoryview(item) for zero-copy access
+    process(data)
+```
+
+The loader is **synchronous from Python's perspective** but asynchronous
+internally.  `for item in loader:` is equivalent to repeated `__next__()` calls
+which each release the GIL and block on the Tokio channel until the next row
+group is ready.
+
+---
+
+### dlio_benchmark changes
+
+#### New file: `dlio_benchmark/reader/parquet_reader_s3dlio.py`
+
+`ParquetReaderS3dlio` is a `FormatReader` subclass that:
+
+- Creates one `s3dlio.create_async_loader(uri, {"format": "parquet", ...})` per
+  file in `open()`
+- Calls `next(loader_iter)` once per row group in `get_sample()` — **not** once
+  per sample
+- Builds row-group-to-sample offset maps via pyarrow's footer-only read (one
+  range GET per file per epoch)
+- Overrides `read_index()` to skip the base-class `utcnow()` overhead
+- Works for both `s3://` and `file://` URIs through the same code path
+
+```python
+class ParquetReaderS3dlio(FormatReader):
+
+    def open(self, filename):
+        uri = self._prefix_for_file(filename)   # → s3://... or file://...
+        loader = s3dlio.create_async_loader(uri, {
+            "format":     "parquet",
+            "prefetch":   self._prefetch,
+            "footer_cap": self._footer_cap,
+        })
+        loader_iter = iter(loader)   # starts background Tokio task immediately
+        rg_offsets  = self._build_rg_offsets(uri, filename)   # pyarrow footer read
+        return (loader_iter, rg_offsets)
+
+    def get_sample(self, filename, sample_index):
+        loader_iter, rg_offsets = self.open_file_map[filename]
+        rg_idx = max(0, bisect.bisect_right(rg_offsets, sample_index) - 1)
+        cache_key = (filename, rg_idx)
+        if cache_key not in self._rg_bytes:
+            item = next(loader_iter)            # ← actual I/O happens here
+            self._rg_bytes[cache_key] = len(item)
+        dlp.update(image_size=self._rg_bytes[cache_key])
+
+    def read_index(self, image_idx, step):
+        """Fast path: skip base-class utcnow() overhead (saves ~48 s per worker)."""
+        dlp.update(step=step)
+        filename, sample_index = self.global_index_map[image_idx]
+        if filename not in self.open_file_map or self.open_file_map[filename] is None:
+            self.open_file_map[filename] = self.open(filename)
+        self.get_sample(filename, sample_index)
+        if self._args.read_type is _ReadType.ON_DEMAND:
+            self.open_file_map[filename] = None
+        return self._args.resized_image
+```
+
+`_prefix_for_file()` constructs the correct URI for both storage types:
+
+```python
+def _prefix_for_file(self, filename):
+    if "://" in filename:
+        return filename          # already a full URI
+    if self._args.storage_type in (StorageType.S3, StorageType.AISTORE):
+        bucket = self._args.storage_root.rstrip("/")
+        return f"s3://{bucket}/{filename.lstrip('/')}"
+    else:
+        return f"file://{os.path.abspath(filename)}"
+```
+
+#### `dlio_benchmark/reader/reader_factory.py`
+
+Added opt-in routing before the existing S3/local branches.  Existing configs
+that do not set `storage_library: s3dlio` are completely unaffected.
+
+```python
+elif type == FormatType.PARQUET:
+    if _args.odirect == True:
+        raise Exception("O_DIRECT for %s format is not yet supported." % type)
+
+    # ── NEW: s3dlio-backed reader (opt-in) ──────────────────────────────
+    elif getattr(_args, "storage_options", {}) and \
+            _args.storage_options.get("storage_library") == "s3dlio":
+        from dlio_benchmark.reader.parquet_reader_s3dlio import ParquetReaderS3dlio
+        return ParquetReaderS3dlio(dataset_type, thread_index, epoch_number)
+
+    # ── Existing paths (unchanged) ──────────────────────────────────────
+    elif _args.storage_type in (StorageType.S3, StorageType.AISTORE):
+        from dlio_benchmark.reader.parquet_reader_s3_iterable import ParquetReaderS3Iterable
+        return ParquetReaderS3Iterable(dataset_type, thread_index, epoch_number)
+    else:
+        from dlio_benchmark.reader.parquet_reader import ParquetReader
+        return ParquetReader(dataset_type, thread_index, epoch_number)
+```
+
+#### YAML config to activate the new reader
+
+```yaml
+dataset:
+  format: parquet
+  storage_type: s3
+  storage_root: mlp-flux              # bucket name
+  data_folder: data/dlrm/train        # key prefix within bucket
+  num_files_train: 64
+  num_samples_per_file: 262144
+  storage_options:
+    storage_library: s3dlio           # ← selects ParquetReaderS3dlio
+    endpoint_url: http://127.0.0.1:9200
+    prefetch: 16
+    footer_cap: 4194304               # 4 MiB (covers 2.66 MiB DLRM footer)
+```
+
+For local file paths the only change is `storage_type: local` and
+`data_folder: /path/to/parquet/dir` — no other YAML changes are needed.
+
+---
+
+### Phase 3 test run output
+
+Test script: `dlio_benchmark/tests/test_s3dlio_parquet_loader.py`  
+Storage backend: s3-ultra on `localhost:9200`  
+Prefetch depth: 16 row groups
+
+```
+=== s3dlio ParquetRowGroupDataset sync iterator test ===
+URI    : s3://mlp-flux/data/dlrm/train/
+Prefetch: 16
+  100 row groups |  0.831 GB | 1.341 GB/s
+  200 row groups |  1.655 GB | 1.516 GB/s
+  300 row groups |  2.479 GB | 1.608 GB/s
+  400 row groups |  3.303 GB | 1.668 GB/s
+  500 row groups |  4.126 GB | 1.644 GB/s
+  600 row groups |  4.958 GB | 1.627 GB/s
+  700 row groups |  5.782 GB | 1.661 GB/s
+  800 row groups |  6.605 GB | 1.682 GB/s
+  900 row groups |  7.429 GB | 1.704 GB/s
+ 1000 row groups |  8.253 GB | 1.738 GB/s
+ 1100 row groups |  9.076 GB | 1.755 GB/s
+ 1200 row groups |  9.908 GB | 1.778 GB/s
+ 1300 row groups | 10.732 GB | 1.796 GB/s
+ 1400 row groups | 11.555 GB | 1.815 GB/s
+ 1500 row groups | 12.379 GB | 1.840 GB/s
+ 1600 row groups | 13.203 GB | 1.862 GB/s
+ 1700 row groups | 14.034 GB | 1.874 GB/s
+ 1800 row groups | 14.858 GB | 1.873 GB/s
+ 1900 row groups | 15.682 GB | 1.880 GB/s
+ 2000 row groups | 16.506 GB | 1.889 GB/s
+ 2100 row groups | 17.329 GB | 1.901 GB/s
+ 2200 row groups | 18.161 GB | 1.908 GB/s
+ 2300 row groups | 18.984 GB | 1.916 GB/s
+ 2400 row groups | 19.808 GB | 1.922 GB/s
+ 2500 row groups | 20.632 GB | 1.934 GB/s
+ 2600 row groups | 21.456 GB | 1.944 GB/s
+ 2700 row groups | 22.279 GB | 1.954 GB/s
+ 2800 row groups | 23.111 GB | 1.954 GB/s
+ 2900 row groups | 23.935 GB | 1.950 GB/s
+ 3000 row groups | 24.758 GB | 1.948 GB/s
+ 3100 row groups | 25.582 GB | 1.952 GB/s
+ 3200 row groups | 26.406 GB | 1.957 GB/s
+ 3300 row groups | 27.237 GB | 1.954 GB/s
+ 3400 row groups | 28.061 GB | 1.955 GB/s
+ 3500 row groups | 28.885 GB | 1.960 GB/s
+ 3600 row groups | 29.708 GB | 1.967 GB/s
+ 3700 row groups | 30.532 GB | 1.966 GB/s
+ 3800 row groups | 31.364 GB | 1.974 GB/s
+ 3900 row groups | 32.187 GB | 1.975 GB/s
+ 4000 row groups | 33.011 GB | 1.973 GB/s
+ 4100 row groups | 33.835 GB | 1.975 GB/s
+ 4200 row groups | 34.659 GB | 1.975 GB/s
+ 4300 row groups | 35.482 GB | 1.974 GB/s
+ 4400 row groups | 36.314 GB | 1.982 GB/s
+ 4500 row groups | 37.137 GB | 1.990 GB/s
+ 4600 row groups | 37.961 GB | 1.996 GB/s
+ 4700 row groups | 38.785 GB | 2.002 GB/s
+ 4800 row groups | 39.609 GB | 2.005 GB/s
+ 4900 row groups | 40.440 GB | 2.010 GB/s
+ 5000 row groups | 41.264 GB | 2.015 GB/s
+ 5100 row groups | 42.088 GB | 2.021 GB/s
+ 5200 row groups | 42.911 GB | 2.026 GB/s
+ 5300 row groups | 43.735 GB | 2.029 GB/s
+ 5400 row groups | 44.566 GB | 2.033 GB/s
+ 5500 row groups | 45.390 GB | 2.039 GB/s
+ 5600 row groups | 46.214 GB | 2.046 GB/s
+ 5700 row groups | 47.038 GB | 2.052 GB/s
+ 5800 row groups | 47.861 GB | 2.056 GB/s
+ 5900 row groups | 48.685 GB | 2.063 GB/s
+ 6000 row groups | 49.517 GB | 2.068 GB/s
+ 6100 row groups | 50.340 GB | 2.068 GB/s
+ 6200 row groups | 51.164 GB | 2.069 GB/s
+ 6300 row groups | 51.988 GB | 2.069 GB/s
+ 6400 row groups | 52.812 GB | 2.077 GB/s
+ 6500 row groups | 53.643 GB | 2.082 GB/s
+ 6600 row groups | 54.467 GB | 2.086 GB/s
+ 6700 row groups | 55.290 GB | 2.087 GB/s
+ 6800 row groups | 56.114 GB | 2.091 GB/s
+ 6900 row groups | 56.938 GB | 2.098 GB/s
+ 7000 row groups | 57.769 GB | 2.102 GB/s
+ 7100 row groups | 58.593 GB | 2.106 GB/s
+ 7200 row groups | 59.417 GB | 2.108 GB/s
+ 7300 row groups | 60.241 GB | 2.113 GB/s
+ 7400 row groups | 61.064 GB | 2.117 GB/s
+ 7500 row groups | 61.888 GB | 2.125 GB/s
+ 7600 row groups | 62.719 GB | 2.127 GB/s
+ 7700 row groups | 63.543 GB | 2.132 GB/s
+ 7800 row groups | 64.367 GB | 2.136 GB/s
+
+--- Results ---
+Row groups : 7,872
+Total bytes: 64.958 GB
+Elapsed    : 30.39 s
+Throughput : 2.138 GB/s
+```
+
+### Key observations
+
+- **Ramp-up in the first ~500 row groups**: throughput climbs from 1.34 → 1.65 GB/s
+  as Tokio's connection pool warms up and the OS's TCP send/receive buffers fill.
+  This is normal for any loopback benchmark — a real-world warm object store
+  would show similar ramp behaviour.
+
+- **~1.9–2.1 GB/s from row group 2,000 onward** (cumulative average, 1.89 →
+  2.14 GB/s): the Tokio work-stealing thread pool reaches full utilisation,
+  with up to 16 concurrent range GETs in flight.
+
+- **Total 64.96 GB in 30.4 seconds**: the raw bytes read represent the entire
+  dataset (all columns, all row groups across all 64 files — no column projection
+  was applied in this run).
+
+---
+
+## Comparative Summary
+
+| Phase | Read unit | I/O ops / epoch | Python calls / epoch | Throughput |
+|-------|-----------|----------------|---------------------|-----------|
+| Baseline | sample (pyarrow col-by-col) | 78,720 | 16,000,000 | ~84 MB/s |
+| Phase 1 | row group (merged GET) | 1,968 | 16,000,000 | ~200 MB/s |
+| Phase 2 | row group (prefetched) | 1,968 | 16,000,000 | ~400 MB/s |
+| **Phase 3** | **row group (Rust iterator)** | **1,968** | **1,968** | **2,138 MB/s** |
+
+The breakthrough between Phase 2 and Phase 3 is not additional I/O parallelism —
+it is the **8,130× reduction in Python call overhead**.  Moving the iteration
+unit from sample to row group, with all coordination in Rust, eliminates the
+Python CPU bottleneck that capped throughput at ~400 MB/s regardless of network
+or storage speed.
+
+---
+
+## Remaining gap to 10 GiB/s target
+
+At 2.1 GB/s the system is still ~5× below the 10 GiB/s target.  The remaining
+constraints are:
+
+1. **Single-process / single-reader**: this test was single-threaded Python.
+   With multiple DLIO workers (e.g. 4–8 processes) each owning a disjoint file
+   shard, aggregate throughput should scale ~linearly with worker count (not yet measured).
+
+2. **s3-ultra throughput ceiling**: s3-ultra on loopback can sustain higher
+   rates; the per-run limit here is ~2 GB/s per reader process with prefetch=16.
+   Increasing `prefetch` (e.g. to 32 or 64) or adding multiple reader processes
+   will drive the aggregate higher.
+
+3. **Arrow decoding in Rust (Phase 4)**: the current design returns raw
+   compressed Parquet bytes.  Adding Parquet → Arrow decoding inside the Rust
+   `get()` call would allow zero-copy transfer of decoded `RecordBatch` objects
+   to Python via the Arrow C Data Interface, eliminating the Python-side pyarrow
+   decode step entirely.
+
+4. **Column projection in the range GET**: passing `col_indices` to
+   `ParquetRowGroupDataset::new()` narrows the byte span fetched per row group
+   to only the selected columns, reducing data transfer proportionally.
+
+---
+
+## Files Changed
+
+### s3dlio (branch: `feat/parquet-dataloader`)
+
+| File | Change |
+|------|--------|
+| `Cargo.toml` | Added `parquet = { version = "58", default-features = false, optional = true }`; added `"parquet"` to `default` feature list |
+| `src/data_loader/parquet_rg.rs` | **New** — `ParquetRowGroupDataset` struct + `Dataset` impl + 3 unit tests |
+| `src/data_loader/mod.rs` | `pub mod parquet_rg` + re-exports under `#[cfg(feature = "parquet")]` |
+| `src/python_api/python_aiml_api.rs` | `create_async_loader`: parquet format routing; `PyBytesAsyncDataLoader::__iter__`: replaced JoinSet+Semaphore with `buffer_unordered` + bounded `mpsc` channel; `PyBytesDataLoaderSyncIter`: new `__next__` returning one `PyBytesView` per row group; `py.detach()` throughout (PyO3 0.27) |
+| `src/python_api/python_datagen_api.rs` | Removed deprecated `method: GenerationMethod::Parallel` fields |
+
+### dlio_benchmark
+
+| File | Change |
+|------|--------|
+| `dlio_benchmark/reader/parquet_reader_s3dlio.py` | **New** — `ParquetReaderS3dlio` unified s3+file reader |
+| `dlio_benchmark/reader/reader_factory.py` | Added `storage_library: s3dlio` routing before existing parquet branches |
+| `tests/test_s3dlio_parquet_loader.py` | **New** — standalone benchmark script measuring row-group throughput |
+| `tests/dlrm-s3dlio-s3.yaml` | **New** — DLIO config for S3 path (port 9200) |
+| `tests/dlrm-s3dlio-file.yaml` | **New** — DLIO config for local file path |
diff --git a/docs/Parquet_Status_Email.md b/docs/Parquet_Status_Email.md
new file mode 100644
index 00000000..f2cfcd3a
--- /dev/null
+++ b/docs/Parquet_Status_Email.md
@@ -0,0 +1,185 @@
+# Parquet S3 Dataloader — Status & Path to 10 GB/s
+
+**Date:** May 5, 2026  
+**To:** [distribution]  
+**Re:** MLPerf Storage DLRM read throughput — progress update and outstanding decisions
+
+---
+
+## Where We Started
+
+The baseline DLRM parquet reader achieved roughly **84 MB/s** against an in-process
+S3 server on loopback (no real network latency, no disk I/O — the floor is software
+overhead).  The target for MLPerf Storage is **10 GiB/s per host**.  The gap was
+roughly 122× (10,240 MB/s ÷ 84 MB/s).
+
+---
+
+## What Was Fixed and Why It Worked
+
+Three successive changes brought throughput from 84 MB/s to 2.1 GB/s.
+
+### Fix 1 — Merge the HTTP GETs: 84 → 200 MB/s
+
+**Problem:** pyarrow's `read_row_group()` issues one HTTP request per column chunk.
+With 40 columns selected that is 40 GETs per row group, 78,720 GETs per epoch.
+Each GET incurred full HTTP overhead plus a metadata lookup in s3-ultra.
+
+**Fix:** Replace 40 column GETs with a single byte-range GET spanning the min–max
+column extent within the row group.  Parquet stores column chunks contiguously, so
+the merged span contains exactly the same bytes.  Also added an in-memory footer
+cache to s3-ultra so repeated footer reads were served from RAM.
+
+**Result:** 78,720 GETs → 1,968 GETs per epoch.  **2.4× speedup.**
+
+---
+
+### Fix 2 — Prefetch at file open: 200 → 400 MB/s
+
+**Problem:** Row-group GETs were still issued serially, triggered one at a time
+by the training loop as it called into the reader.  Each GET blocked the reader
+thread until the response arrived.
+
+**Fix:** At `file.open()` time, submit all 123 row-group GETs for that file
+concurrently using a 32-thread `ThreadPoolExecutor`.  Also overrode
+`read_index()` to remove two `datetime.now().strftime()` calls that fired
+unconditionally on every sample (16 M × ~3 µs = 48 s wasted per worker per epoch).
+
+**Result:** Serial I/O latency eliminated; most row groups already in memory before
+the training loop asks for them.  **4.8× speedup over baseline.**
+
+---
+
+### Fix 3 — Move iteration to Rust: 400 → 2,100 MB/s
+
+**Problem:** Even with all I/O pre-fetched, `read_index()` was still called once per
+*sample* — 16,000,000 Python function calls per worker per epoch.  At a minimum
+cost of ~3.6 µs each (unavoidable in CPython regardless of what the function does),
+that is a hard floor of ~58 s of pure Python overhead per worker.  At 10 GiB/s the
+actual I/O takes only 0.33 s.  Python overhead was 176× the I/O time — no amount
+of parallelism or caching can overcome it while the call count remains at 16 M.
+
+**The insight:** The dataset has 1,968 unique I/O operations (one per row group).
+If Python is called 1,968 times instead of 16,000,000 times, Python overhead drops
+to 0.007 s — negligible against 0.33 s of I/O.
+
+**Fix:** Implemented `ParquetRowGroupDataset` in Rust inside the s3dlio library.
+Rust builds the full row-group index (file listing → concurrent footer GETs →
+Parquet metadata parse → one `{offset, length}` extent per row group) once at
+construction.  A `buffer_unordered(prefetch)` stream in Tokio drives up to N
+concurrent range GETs simultaneously; a bounded channel feeds results to Python.
+`for item in loader:` in Python calls `__next__()` 1,968 times — once per row
+group, not once per sample.
+
+**Result:** 16,000,000 Python calls → 1,968 Python calls.  GIL overhead negligible.
+**25× speedup over baseline.  5.3× over the previous best.**
+
+---
+
+## Current Numbers
+
+| Iteration | Change | Throughput | vs. Baseline |
+|-----------|--------|-----------|-------------|
+| Baseline | pyarrow per-column GETs | 84 MB/s | 1× |
+| Fix 1 | Single merged GET per row group | 200 MB/s | 2.4× |
+| Fix 2 | 32-thread prefetch + fast read_index | 400 MB/s | 4.8× |
+| **Fix 3** | **Rust row-group iterator in s3dlio** | **2,100 MB/s** | **25×** |
+| **Target** | | **10,240 MB/s** | **122×** |
+
+The remaining gap is **5×**.  It is not a fundamental algorithmic problem — we
+have already proved the iteration overhead is gone.  The gap is parallelism and
+one architectural decision.
+
+---
+
+## What Is Needed to Reach 10 GB/s
+
+### 1. Multiple reader workers (expected: linear scaling, ~4–5×)
+
+The test that produced 2.1 GB/s used a **single Python process with a single
+loader**.  The MLPerf Storage spec runs multiple data-loader workers in parallel,
+each owning a disjoint file shard.
+
+With 5 workers each sustaining 2.1 GB/s the aggregate is already 10.5 GB/s.
+With 4 workers at 2.5 GB/s (likely once the connection pool is fully warm from
+the start) it hits target.  This is the lowest-risk path — it requires no further
+code changes, just the correct DLIO YAML worker count and a check that s3-ultra
+can sustain the aggregate rate (it runs entirely in memory; it should).
+
+**Decision needed:** Run the multi-worker benchmark to confirm linear scaling.
+What worker count does the MLPerf Storage spec require for this workload?
+
+### 2. Column projection in the range GET (expected: 3–5× data reduction)
+
+The Phase 3 test fetched **all columns** (full row-group byte span).  The DLRM
+workload reads only 40 of 200 columns.  The column-selection logic already exists
+in `ParquetRowGroupDataset::new(col_indices)` but was not exercised in the test.
+
+When enabled, each range GET shrinks from ~1,743 KiB (full span) to ~487 KiB (40
+selected columns), a **3.6× reduction** in bytes transferred.  At the same network
+rate, throughput in terms of samples/s goes up proportionally.  The downstream
+question is whether pyarrow can decode the sub-selected bytes or whether the raw
+bytes are handed to the training framework directly.
+
+**Decision needed:** Does the DLRM training loop require decoded Arrow columns or
+raw Parquet bytes?  If raw bytes, column projection is free today.  If decoded
+columns, see item 3 below.
+
+### 3. Arrow decoding in Rust — Phase 4 (expected: removes remaining Python decode cost)
+
+Currently `ParquetRowGroupDataset` returns compressed Parquet bytes.  The Python
+consumer (or the training framework) must decode them.  Adding Parquet → Arrow
+decoding inside the Rust `get()` call would:
+
+- Decode Parquet bytes → Arrow `RecordBatch` entirely in Rust (Tokio thread, no GIL)
+- Transfer the decoded batch to Python via the Arrow C Data Interface (zero-copy,
+  reference-counted; pyarrow can wrap it with no memcpy)
+- Eliminate all remaining Python-side decode overhead
+
+The `parquet` crate already has this capability when compiled with
+`features = ["arrow"]`; the current build uses `default-features = false` to keep
+the dependency tree small.  Enabling Arrow output is a ~1-day Rust change.
+
+**Decision needed:** Is Phase 4 required to hit 10 GB/s, or does multi-worker
+alone get there?  Multi-worker is the right experiment to run first — it is free.
+
+### 4. Prefetch depth tuning (minor, ~10–20%)
+
+The test used `prefetch = 16`.  With a higher prefetch depth (32–64) Tokio keeps
+more GETs in flight simultaneously, better hiding individual request latency.
+At 2.1 GB/s steady state the channel was not the bottleneck, but on a real network
+(vs. loopback) higher prefetch may be important.
+
+---
+
+## Recommended Next Steps (in order)
+
+1. **Run multi-worker benchmark** — 4 workers, each 16 files, same YAML config with
+   `storage_library: s3dlio`.  This is the single most important data point.  If
+   aggregate throughput scales linearly, 10 GB/s is already achieved with existing
+   code.
+
+2. **Enable column projection** — pass `"columns": [0, 1, ..., 39]` in the loader
+   options and re-run.  Confirms the 3.6× byte reduction works end-to-end.
+
+3. **Decide on Arrow decoding** — required only if multi-worker + projection still
+   falls short of 10 GB/s, or if the training framework requires decoded tensors
+   rather than raw bytes.
+
+---
+
+## Decisions Requested
+
+| # | Decision | Options | Impact |
+|---|----------|---------|--------|
+| 1 | How many DLIO data-loader workers does the MLPerf Storage spec require for DLRM? | 1 / 4 / 8 / other | Determines if 10 GB/s is already achieved |
+| 2 | Does the training framework require decoded columns (Arrow/numpy) or raw Parquet bytes? | Raw bytes OK / Decoded required | Determines whether Phase 4 (Arrow-in-Rust) is needed |
+| 3 | Should Phase 4 (Arrow decoding in Rust) be built now or deferred until multi-worker results are in? | Build now / Wait for data | ~1 day Rust work |
+| 4 | Is 10 GB/s aggregate (all workers combined) or 10 GB/s per worker the MLPerf target? | Aggregate / Per-worker | Changes required worker count by ~5× |
+
+---
+
+*All code changes are on the `feat/parquet-dataloader` branch of s3dlio and in
+`dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio.py`.  The existing
+parquet reader paths are completely unchanged — the new reader is opt-in via
+`storage_library: s3dlio` in the YAML config.*
\ No newline at end of file
diff --git a/docs/Parquet_update_0-9-100.md b/docs/Parquet_update_0-9-100.md
new file mode 100644
index 00000000..51a3e93d
--- /dev/null
+++ b/docs/Parquet_update_0-9-100.md
@@ -0,0 +1,235 @@
+Compacted conversation
+
+Searched for text `configure_tokio_threads` (`Code` · `s3dlio/src/python_api/python_aiml_api.rs`), 4 results
+
+Read [](file:///home/eval/Documents/Code/s3dlio/src/python_api/python_aiml_api.rs#2540-2540), lines 2540 to 2585
+
+Read [](file:///home/eval/Documents/Code/s3dlio/src/python_api/python_aiml_api.rs#2604-2604), lines 2604 to 2620
+
+Now I have everything. Two fixes needed:
+1. Missing semicolon on the `configure_tokio_threads` registration line  
+2. The function should auto-detect MPI env vars internally (Rust reads them, not Python), default `n=0` = auto-detect 
+
+Replacing 36 lines with 59 lines in [](file:///home/eval/Documents/Code/s3dlio/src/python_api/python_aiml_api.rs)
+
+
+```
+
+```
+
+
+Ran terminal command:  cd /home/eval/Documents/Code/s3dlio && cargo check 2>&1
+
+Zero errors, zero warnings. Now build the wheel: 
+
+Ran terminal command:  cd /home/eval/Documents/Code/s3dlio && source .venv/bin/activate && ./build_pyo3.sh 2>&1
+
+Both wheels built successfully. Now addressing your other questions:
+
+---
+
+## Answers to your questions
+
+### 1. The `world_size` confusion
+
+You were right — `configure_tokio_threads` is now rewritten so **Rust reads the MPI env vars internally**. The function now has a default `n=0` that triggers auto-detection. Python usage is simply:
+
+```python
+import s3dlio
+s3dlio.configure_tokio_threads()   # reads OMPI_COMM_WORLD_SIZE / PMI_SIZE / WORLD_SIZE itself
+```
+
+No Python-side calculation of `world_size` needed at all. If you want to override explicitly, `s3dlio.configure_tokio_threads(4)` still works. The `max_threads: 4` example I gave in text earlier was wrong — that was a mistake on my part.
+
+---
+
+### 2. Does this work for real-world Parquet training?
+
+**Partially — it depends on the access pattern:**
+
+| Access pattern | Status | Path |
+|---|---|---|
+| Sequential scan (DLRM-style, whole files) | ✅ Ready | `create_async_loader(..., {"format":"parquet"})` |
+| Arrow IPC decode in Rust → PyArrow in Python | ✅ Ready | `decode="arrow"` → `pyarrow.ipc.read_message(bytes(item))` |
+| Random per-sample access with shuffling | ✅ Ready | `PyParquetIndex.rg_lookup()` + `s3dlio.get_range()` |
+| Per-sample fetch in PyTorch DataLoader workers | ⚠️ Not wired | Needs `PyParquetIndex` glue in the dataloader class |
+
+For **DLRM-style training** (the train dataset): sequential scan per-epoch IS the correct access pattern. Files are shuffled at the epoch level by the training framework; within each file, rows are read sequentially. The streaming `ParquetReaderS3dlioArrow` matches this exactly.
+
+The `ParquetReaderS3dlioArrow` we built queues `(byte_count, None)` — that's intentional for **benchmarking** (measuring I/O throughput without data copy into Python). For a real training integration that actually feeds tensors, you'd queue `(byte_count, bytes(item))` and decode with PyArrow. We can do that if needed, but for dlio_benchmark it's a throughput measurement tool — byte_count is sufficient.
+
+---
+
+### 3. Does this support storage benchmarking?
+
+**Yes, this is the strongest use case.** The three modes we have:
+
+| Mode | Reader | What it measures |
+|---|---|---|
+| 1 — Default | PyTorch/PyArrow full stack | End-to-end including Python/C++ Parquet decode overhead |
+| 2 — s3dlio raw | `ParquetReaderS3dlio` | Raw I/O throughput (bytes moved, no decode) |
+| 3 — s3dlio arrow | `ParquetReaderS3dlioArrow` | I/O + Rust Arrow IPC decode (no Python Parquet overhead) |
+
+All three support `file://`, `direct://`, and `s3://`. The concurrency/prefetch depth is configurable. The `configure_tokio_threads()` auto-scaling prevents thread explosion under MPI. This gives you a clean apples-to-apples comparison of where the bottleneck actually is.
+
+
+Let me read the Arrow decode path in parquet_rg.rs carefully before answering — I need to know exactly what the Rust Arrow decode does to give you an accurate answer. 
+
+Read [](file:///home/eval/Documents/Code/s3dlio/src/data_loader/parquet_rg.rs#1-1), lines 1 to 250
+
+Read [](file:///home/eval/Documents/Code/s3dlio/src/data_loader/parquet_rg.rs#250-250), lines 250 to 530
+
+I have everything I need. Now I understand the full picture: the `get_arrow_ipc()` method needs `Arc<ParquetMetaData>` (the parsed footer). That's the only meaningful constraint. The implementation is straightforward. 
+
+Created 3 todos
+
+Yes, it's absolutely possible — and the implementation is clean because all the pieces already exist. Here's what we need and why it matters, then I'll implement it.
+
+**Why Rust decode is worth it for real training:**  
+The `get_arrow_ipc()` path already does the full conversion inside the Rust async task — `ParquetRecordBatchStream` → `RecordBatch` → `Arrow IPC bytes` — completely off the Python GIL, in parallel with the next fetch on the Tokio thread pool. For fast NVMe or high-bandwidth S3 where Parquet decode becomes the bottleneck (30-40% of total time), this is a real win. The Python consumer just does `pa.ipc.open_stream(pa.py_buffer(bytes(item))).read_next_batch()` — one call, no per-column decode work.
+
+**What's needed:** One new Rust function `parquet_get_rg(uri, rg_idx)` that uses the existing `parquet_file_cache` + `get_arrow_ipc()` machinery. Then update `ParquetReaderS3dlioArrow` to use the same `PyParquetIndex` sliding-window prefetch as `ParquetReaderS3dlio`, but calling this function instead of `get_range`.
+
+Starting: *Add parquet_get_rg() Rust function* (1/3)
+
+First, find the right spot in python_aiml_api.rs to insert the new function — right before `create_async_loader`:  
+
+Searched for text `pub fn create_async_loader` (`Code` · `s3dlio/src/python_api/python_aiml_api.rs`), 1 result
+
+Read [](file:///home/eval/Documents/Code/s3dlio/src/python_api/python_aiml_api.rs#1950-1950), lines 1950 to 1975
+
+Now add the new function just before `create_async_loader`: 
+
+Replacing 12 lines with 159 lines in [](file:///home/eval/Documents/Code/s3dlio/src/python_api/python_aiml_api.rs)
+
+
+```
+
+```
+Now I need to make `S3AsyncFileReader::new()` pub so we can use it from python_aiml_api.rs:  
+
+Searched for text `fn new(uri: String, metadata: Arc<ParquetMetaData>)` (`Code` · `s3dlio/src/data_loader/parquet_rg.rs`), 1 result
+
+Read [](file:///home/eval/Documents/Code/s3dlio/src/data_loader/parquet_rg.rs#425-425), lines 425 to 450
+
+Replacing 5 lines with 5 lines in [](file:///home/eval/Documents/Code/s3dlio/src/data_loader/parquet_rg.rs)
+
+
+```
+
+```
+Now check what `rg_byte_extent` visibility is:  
+
+Searched for text `fn rg_byte_extent` (`Code` · `s3dlio/src/data_loader/parquet_rg.rs`), 1 result
+
+Good — `pub(crate)` is enough since python_aiml_api.rs is in the same crate. Now register the new function in `register_aiml_functions`:   
+
+Replacing 4 lines with 7 lines in [](file:///home/eval/Documents/Code/s3dlio/src/python_api/python_aiml_api.rs)
+
+
+```
+
+```
+Now `cargo check` to catch any issues before building:   
+
+Ran terminal command:  cd /home/eval/Documents/Code/s3dlio && cargo check 2>&1
+
+Zero errors, zero warnings. Now update `ParquetReaderS3dlioArrow` to use the index-based approach with the new `parquet_get_rg` function. The new design mirrors `ParquetReaderS3dlio` exactly — same `PyParquetIndex`, same sliding-window prefetch, same access plan — but calls `parquet_get_rg` instead of `get_range`:  
+
+Starting: *Update ParquetReaderS3dlioArrow to use index + new function* (2/3)
+
+Read [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py#80-80), lines 80 to 202
+
+Now I'll rewrite the entire file. The new version uses `PyParquetIndex` + sliding-window prefetch with `parquet_get_rg`, just like `ParquetReaderS3dlio`, with an optional `decode_output` config key to control what Python does with the result: 
+
+Created [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py)
+
+Read [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py#1-1), lines 1 to 10
+
+Replacing 10 lines with 35 lines in [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py)
+
+
+```
+
+```
+Now replace the entire body of the class. Let me do this in logical sections:  
+
+Replacing 16 lines with 34 lines in [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py)
+
+Read [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py#15-15), lines 15 to 55
+
+Replacing 19 lines with 37 lines in [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py)
+
+
+```
+
+```
+Now replace the class body — from the class definition through the end of the file:   
+
+Read [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py#56-56), lines 56 to 90
+
+Replacing 15 lines with 14 lines in [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py)
+
+
+```
+
+```
+Now replace everything from `__init__` onwards in the class:   
+
+Read [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py#90-90), lines 90 to 130
+
+Replacing 41 lines with 96 lines in [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py)
+
+
+```
+
+```
+Now replace the old `_epoch_init` through the end of file:    
+
+Read [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py#175-175), lines 175 to 230
+
+Read [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py#226-226), lines 226 to 299
+
+Replacing 114 lines with 201 lines in [](file:///home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py)
+
+
+```
+
+```
+Compacted conversation. Let me verify the final file looks correct:
+
+Ran terminal command:  wc -l /home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py && echo "---" && head -60 /home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py
+
+Ran terminal command:  tail -30 /home/eval/Documents/Code/dlio_benchmark/dlio_benchmark/reader/parquet_reader_s3dlio_arrow.py
+
+Good. The file is complete. Now rebuild the s3dlio wheels to include `parquet_get_rg`: 
+
+Ran terminal command:  cd /home/eval/Documents/Code/s3dlio && source .venv/bin/activate && ./build_pyo3.sh 2>&1
+
+Both wheels built successfully. Now install the cp312 wheel into the dlio_benchmark venv: 
+
+Ran terminal command:  cd /home/eval/Documents/Code/s3dlio && uv pip install target/wheels/s3dlio-0.9.100-cp312-cp312-manylinux_2_39_x86_64.whl --python /home/eval/Documents/Code/dlio_benchmark/.venv312/bin/python --force-reinstall 2>&1
+
+Now verify `parquet_get_rg` is available in the installed wheel: 
+
+Ran terminal command:  /home/eval/Documents/Code/dlio_benchmark/.venv312/bin/python -c "import s3dlio; print(dir(s3dlio))" | tr ',' '\n' | grep -i parquet
+
+`parquet_get_rg` is available. Let me also verify `configure_tokio_threads` is exported:  
+
+Ran terminal command:  /home/eval/Documents/Code/dlio_benchmark/.venv312/bin/python -c "import s3dlio; print(dir(s3dlio))" | tr ',' '\n' | grep -i tokio
+
+Everything is in place. Here's a summary of what was accomplished this session:
+
+**parquet_reader_s3dlio_arrow.py — fully rewritten** from a streaming-only queue-based design to the index-based sliding-window approach matching `ParquetReaderS3dlio`:
+- Uses `PyParquetIndex.ensure_indexed()` + bisect tables to map sample indices to row-group indices
+- `ThreadPoolExecutor` prefetch window — at most `min(prefetch_workers, max_outstanding)` Arrow-decode futures in-flight simultaneously  
+- `_pipeline_submit_next()` drives the consumer-driven pipeline (pop + submit)
+- `_decode_ipc()` dispatches on `decode_output`: `"none"` (benchmark), `"pyarrow"`, or `"torch"`
+- Memory bound: `~35% free RAM / decoded_rg_size` (decoded ≈ 1.6× raw)
+- Supports `file://`, `direct://`, and `s3://` URIs
+
+**s3dlio v0.9.100 wheels rebuilt** (cp312 + cp313) now including `parquet_get_rg` and `configure_tokio_threads`, and **installed** into .venv312. 
+
+
+
+Made changes.
diff --git a/pyproject.toml b/pyproject.toml
index 116b6f22..a7730998 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,13 +4,12 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "dlio_benchmark"
-version = "3.0.1"
+version = "3.0.2"
 description = "An I/O benchmark for deep learning applications"
 readme = "README.md"
 requires-python = ">=3.12,<3.13"
 dependencies = [
-    "dgen-py>=0.2.3",
-    "s3dlio>=0.9.95",
+    "dgen-py>=0.2.4",
     "h5py>=3.11.0",
     "mpi4py>=3.1.4",
     "numpy>=1.23.5",
@@ -25,6 +24,8 @@ dependencies = [
     "tensorflow>=2.20.0",
     "pyarrow>=21.0.0",
     "pydftracer>=2.0.2",
+    "s3torchconnector>=1.5.0",
+    "s3dlio>=0.9.100",
 ]
 
 [project.optional-dependencies]
@@ -56,6 +57,15 @@ log_cli_level = "INFO"
 log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
 log_cli_date_format = "%Y-%m-%d %H:%M:%S"
 
+# To use a local s3dlio wheel during development, uncomment [tool.uv.sources]:
+# [tool.uv.sources]
+# s3dlio = { path = "../s3dlio/target/wheels/s3dlio-0.9.100-cp312-cp312-manylinux_2_39_x86_64.whl" }
+
+# s3dlio only ships Linux wheels — restrict resolution to Linux to avoid
+# uv failing on the Windows split when no win32 wheel is available.
+[tool.uv]
+environments = ["sys_platform == 'linux'"]
+
 [dependency-groups]
 dev = [
     "pytest>=8.4.2",
diff --git a/requirements-test.txt b/requirements-test.txt
index 245232d2..d5ad2762 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -7,7 +7,6 @@ PyYAML~=6.0.0
 hydra-core==1.3.2
 mpi4py>=3.1.4
 numpy>=1.23.5
-nvidia-dali-cuda110>=1.34.0
 omegaconf~=2.2.0
 pandas>=1.5.1
 psutil>=5.9.8
diff --git a/tests/dlrm-s3dlio-file.yaml b/tests/dlrm-s3dlio-file.yaml
new file mode 100644
index 00000000..435627f0
--- /dev/null
+++ b/tests/dlrm-s3dlio-file.yaml
@@ -0,0 +1,62 @@
+# dlrm-s3dlio-file.yaml
+#
+# DLIO benchmark config for DLRM parquet dataset from a local file path.
+# Uses s3dlio's ParquetRowGroupDataset for row-group-granular I/O.
+# s3dlio handles file:// URIs with the same code path as s3://.
+#
+# Usage (single-process):
+#   python -m dlio_benchmark.main \
+#     --config-dir=. --config-name=dlrm-s3dlio-file
+
+model: dlrm_parquet_s3dlio_file
+
+framework: pytorch
+
+dataset:
+  format: parquet
+  storage_type: local
+  data_folder: /mnt/nvme/dlrm/train/train  # absolute path to parquet directory
+  num_files_train: 64
+  num_samples_per_file: 1000000        # 1M rows, 123 row groups @ 8192 rows/RG
+  record_length: 1024
+  record_length_stdev: 0
+  file_list_train: ""
+  storage_options:
+    storage_library: s3dlio           # selects ParquetReaderS3dlio for local paths too
+    prefetch: 8                       # smaller prefetch for NVMe (less latency)
+    footer_cap: 4194304
+
+data_loader:
+  read_threads: 1
+  prefetch_size: 0
+  read_type: on_demand
+
+train:
+  # DLRMv2 / B200 parameters from trace logs:
+  #   batch_size:        2048 real samples per GPU compute step
+  #   computation_time:  0.000770031 s per batch (B200 measured)
+  #
+  # I/O vs compute granularity (from the dataset settings above):
+  #   1 Row Group  = 8192 rows (samples)  = ~8.3 MiB byte-range GET
+  #   1 DLIO batch = 2048 samples         ≈ 1/4 of a Row Group
+  #
+  # With batch_size=2048, num_files_train=64 and num_samples_per_file=1000000
+  # the training loop runs ~31,250 steps/epoch and compute() is called ~31,250
+  # times (vs 64M times with batch_size=1).  The read_index fast-path
+  # (~50 ns/call) handles the 64M sample→RG bisect lookups.
+  computation_time: 0.000770031
+  epochs: 1
+  total_training_steps: -1
+  batch_size: 2048
+  seed_change_epoch: false
+
+evaluation:
+  eval_time: 0.0
+  epochs_between_evals: 0
+
+profiling:
+  profiler: none
+
+logging:
+  verbosity: INFO
+  log_file: dlio-s3dlio-file.log
diff --git a/tests/dlrm-s3dlio-s3.yaml b/tests/dlrm-s3dlio-s3.yaml
new file mode 100644
index 00000000..7686ca5f
--- /dev/null
+++ b/tests/dlrm-s3dlio-s3.yaml
@@ -0,0 +1,66 @@
+# dlrm-s3dlio-s3.yaml
+#
+# DLIO benchmark config for DLRM parquet dataset on S3 (via s3-ultra on port 9200).
+# Uses s3dlio's ParquetRowGroupDataset for row-group-granular I/O.
+# ~7,872 row-group fetches per epoch (64 files × 123 row groups) instead of 64M per-sample reads.
+#
+# Usage (single-process):
+#   export AWS_ACCESS_KEY_ID=minioadmin
+#   export AWS_SECRET_ACCESS_KEY=minioadmin
+#   python -m dlio_benchmark.main \
+#     --config-dir=. --config-name=dlrm-s3dlio-s3
+
+model: dlrm_parquet_s3dlio_s3
+
+framework: pytorch
+
+dataset:
+  format: parquet
+  storage_type: s3
+  storage_root: mlp-flux            # bucket name (no s3:// prefix)
+  data_folder: data/dlrm/train/train  # key prefix within bucket
+  num_files_train: 64
+  num_samples_per_file: 1000000      # 1M rows, 123 row groups @ 8192 rows/RG
+  record_length: 1024               # approximate sample size in bytes (informational)
+  record_length_stdev: 0
+  file_list_train: ""               # auto-discovered from data_folder
+  storage_options:
+    storage_library: s3dlio         # selects ParquetReaderS3dlio
+    endpoint_url: http://127.0.0.1:9200
+    prefetch: 16                    # row groups to buffer ahead of Python consumer
+    footer_cap: 4194304             # 4 MiB footer window for Parquet metadata
+
+data_loader:
+  read_threads: 1                   # s3dlio handles concurrency internally
+  prefetch_size: 0                  # disable DLIO-level prefetch (s3dlio does it)
+  read_type: on_demand
+
+train:
+  # DLRMv2 / B200 parameters from trace logs:
+  #   batch_size:        2048 real samples per GPU compute step
+  #   computation_time:  0.000770031 s per batch (B200 measured)
+  #
+  # I/O vs compute granularity (from the dataset settings above):
+  #   1 Row Group  = 8192 rows (samples)  = ~8.3 MiB byte-range GET
+  #   1 DLIO batch = 2048 samples         ≈ 1/4 of a Row Group
+  #
+  # With batch_size=2048, num_files_train=64 and num_samples_per_file=1000000
+  # the training loop runs ~31,250 steps/epoch and compute() is called ~31,250
+  # times (vs 64M times with batch_size=1).  The read_index fast-path
+  # (~50 ns/call) handles the 64M sample→RG bisect lookups.
+  computation_time: 0.000770031
+  epochs: 1
+  total_training_steps: -1
+  batch_size: 2048
+  seed_change_epoch: false
+
+evaluation:
+  eval_time: 0.0
+  epochs_between_evals: 0           # disable evaluation pass
+
+profiling:
+  profiler: none
+
+logging:
+  verbosity: INFO
+  log_file: dlio-s3dlio-s3.log
diff --git a/tests/test_s3dlio_parquet_loader.py b/tests/test_s3dlio_parquet_loader.py
new file mode 100644
index 00000000..7d847159
--- /dev/null
+++ b/tests/test_s3dlio_parquet_loader.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+"""
+Quick end-to-end test for the s3dlio ParquetRowGroupDataset sync iterator.
+
+Usage:
+    cd <path-to>/dlio_benchmark        # root of your dlio_benchmark checkout
+    source .venv/bin/activate
+    python tests/test_s3dlio_parquet_loader.py
+
+Expects:
+  - s3-ultra running on port 9200 with minioadmin credentials
+  - s3://mlp-flux/data/dlrm/train/ populated with 64 parquet files
+"""
+
+import os
+import sys
+import time
+
+# ── S3 endpoint config ────────────────────────────────────────────────────────
+os.environ.setdefault("AWS_ENDPOINT_URL_S3", "http://localhost:9200")
+os.environ.setdefault("AWS_ACCESS_KEY_ID", "minioadmin")
+os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "minioadmin")
+
+import s3dlio  # noqa: E402
+
+PREFIX = "s3://mlp-flux/data/dlrm/train/"
+PREFETCH = 16
+
+
+def run_s3_test():
+    print(f"\n=== s3dlio ParquetRowGroupDataset sync iterator test ===")
+    print(f"URI    : {PREFIX}")
+    print(f"Prefetch: {PREFETCH}")
+
+    loader = s3dlio.create_async_loader(
+        PREFIX,
+        {"format": "parquet", "prefetch": PREFETCH},
+    )
+
+    count = 0
+    total_bytes = 0
+    errors = 0
+
+    t0 = time.perf_counter()
+    for item in loader:
+        try:
+            n = len(item)  # PyBytesView implements __len__ via buffer protocol
+            total_bytes += n
+            count += 1
+            if count % 100 == 0:
+                elapsed = time.perf_counter() - t0
+                rate = total_bytes / elapsed / 1e9
+                print(f"  {count} row groups | {total_bytes/1e9:.3f} GB | {rate:.3f} GB/s", flush=True)
+        except Exception as e:
+            errors += 1
+            print(f"  ERROR at item {count}: {e}", file=sys.stderr)
+    elapsed = time.perf_counter() - t0
+
+    print(f"\n--- Results ---")
+    print(f"Row groups : {count}")
+    print(f"Total bytes: {total_bytes / 1e9:.3f} GB")
+    print(f"Elapsed    : {elapsed:.2f} s")
+    if elapsed > 0:
+        print(f"Throughput : {total_bytes / elapsed / 1e9:.3f} GB/s")
+    if errors:
+        print(f"Errors     : {errors}", file=sys.stderr)
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    run_s3_test()
diff --git a/uv.lock b/uv.lock
index e28e09c3..98fc2e45 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,9 +1,10 @@
 version = 1
 requires-python = "==3.12.*"
 resolution-markers = [
-    "sys_platform == 'win32'",
-    "sys_platform == 'emscripten'",
-    "sys_platform != 'emscripten' and sys_platform != 'win32'",
+    "sys_platform == 'linux'",
+]
+supported-markers = [
+    "sys_platform == 'linux'",
 ]
 
 [[package]]
@@ -20,19 +21,19 @@ name = "aistore"
 version = "1.23.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "braceexpand" },
-    { name = "cloudpickle" },
-    { name = "humanize" },
-    { name = "msgspec" },
-    { name = "overrides" },
-    { name = "packaging" },
-    { name = "pydantic" },
-    { name = "python-dateutil" },
-    { name = "pyyaml" },
-    { name = "requests" },
-    { name = "tenacity" },
-    { name = "urllib3" },
-    { name = "xxhash" },
+    { name = "braceexpand", marker = "sys_platform == 'linux'" },
+    { name = "cloudpickle", marker = "sys_platform == 'linux'" },
+    { name = "humanize", marker = "sys_platform == 'linux'" },
+    { name = "msgspec", marker = "sys_platform == 'linux'" },
+    { name = "overrides", marker = "sys_platform == 'linux'" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
+    { name = "pydantic", marker = "sys_platform == 'linux'" },
+    { name = "python-dateutil", marker = "sys_platform == 'linux'" },
+    { name = "pyyaml", marker = "sys_platform == 'linux'" },
+    { name = "requests", marker = "sys_platform == 'linux'" },
+    { name = "tenacity", marker = "sys_platform == 'linux'" },
+    { name = "urllib3", marker = "sys_platform == 'linux'" },
+    { name = "xxhash", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/dd/bf/bf1d9e7cf6dd2101bff19405f69b8afa01022a4ecb08e1602367bfa43daf/aistore-1.23.0.tar.gz", hash = "sha256:dbd06151b102d50b515fd8d4b6f29e143989aad933a1c1b1274c0560eac7596f", size = 150597 }
 wheels = [
@@ -59,8 +60,8 @@ name = "astunparse"
 version = "1.6.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "six" },
-    { name = "wheel" },
+    { name = "six", marker = "sys_platform == 'linux'" },
+    { name = "wheel", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/f3/af/4182184d3c338792894f34a62672919db7ca008c89abee9b564dd34d8029/astunparse-1.6.3.tar.gz", hash = "sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872", size = 18290 }
 wheels = [
@@ -100,7 +101,6 @@ version = "3.4.7"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/e7/a1/67fe25fac3c7642725500a3f6cfe5821ad557c3abb11c9d20d12c7008d3e/charset_normalizer-3.4.7.tar.gz", hash = "sha256:ae89db9e5f98a11a4bf50407d4363e7b09b31e55bc117b4f7d80aab97ba009e5", size = 144271 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/0c/eb/4fc8d0a7110eb5fc9cc161723a34a8a6c200ce3b4fbf681bc86feee22308/charset_normalizer-3.4.7-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:eca9705049ad3c7345d574e3510665cb2cf844c2f2dcfe675332677f081cbd46", size = 311328 },
     { url = "https://files.pythonhosted.org/packages/f8/e3/0fadc706008ac9d7b9b5be6dc767c05f9d3e5df51744ce4cc9605de7b9f4/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6178f72c5508bfc5fd446a5905e698c6212932f25bcdd4b47a757a50605a90e2", size = 208061 },
     { url = "https://files.pythonhosted.org/packages/42/f0/3dd1045c47f4a4604df85ec18ad093912ae1344ac706993aff91d38773a2/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e1421b502d83040e6d7fb2fb18dff63957f720da3d77b2fbd3187ceb63755d7b", size = 229031 },
     { url = "https://files.pythonhosted.org/packages/dc/67/675a46eb016118a2fbde5a277a5d15f4f69d5f3f5f338e5ee2f8948fcf43/charset_normalizer-3.4.7-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:edac0f1ab77644605be2cbba52e6b7f630731fc42b34cb0f634be1a6eface56a", size = 225239 },
@@ -113,9 +113,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/59/7a/071feed8124111a32b316b33ae4de83d36923039ef8cf48120266844285b/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:aed52fea0513bac0ccde438c188c8a471c4e0f457c2dd20cdbf6ea7a450046c7", size = 212316 },
     { url = "https://files.pythonhosted.org/packages/fd/35/f7dba3994312d7ba508e041eaac39a36b120f32d4c8662b8814dab876431/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:fea24543955a6a729c45a73fe90e08c743f0b3334bbf3201e6c4bc1b0c7fa464", size = 227274 },
     { url = "https://files.pythonhosted.org/packages/8a/2d/a572df5c9204ab7688ec1edc895a73ebded3b023bb07364710b05dd1c9be/charset_normalizer-3.4.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:bb6d88045545b26da47aa879dd4a89a71d1dce0f0e549b1abcb31dfe4a8eac49", size = 218468 },
-    { url = "https://files.pythonhosted.org/packages/86/eb/890922a8b03a568ca2f336c36585a4713c55d4d67bf0f0c78924be6315ca/charset_normalizer-3.4.7-cp312-cp312-win32.whl", hash = "sha256:2257141f39fe65a3fdf38aeccae4b953e5f3b3324f4ff0daf9f15b8518666a2c", size = 148460 },
-    { url = "https://files.pythonhosted.org/packages/35/d9/0e7dffa06c5ab081f75b1b786f0aefc88365825dfcd0ac544bdb7b2b6853/charset_normalizer-3.4.7-cp312-cp312-win_amd64.whl", hash = "sha256:5ed6ab538499c8644b8a3e18debabcd7ce684f3fa91cf867521a7a0279cab2d6", size = 159330 },
-    { url = "https://files.pythonhosted.org/packages/9e/5d/481bcc2a7c88ea6b0878c299547843b2521ccbc40980cb406267088bc701/charset_normalizer-3.4.7-cp312-cp312-win_arm64.whl", hash = "sha256:56be790f86bfb2c98fb742ce566dfb4816e5a83384616ab59c49e0604d49c51d", size = 147828 },
     { url = "https://files.pythonhosted.org/packages/db/8f/61959034484a4a7c527811f4721e75d02d653a35afb0b6054474d8185d4c/charset_normalizer-3.4.7-py3-none-any.whl", hash = "sha256:3dce51d0f5e7951f8bb4900c257dad282f49190fdbebecd4ba99bcc41fef404d", size = 61958 },
 ]
 
@@ -128,21 +125,12 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/88/39/799be3f2f0f38cc727ee3b4f1445fe6d5e4133064ec2e4115069418a5bb6/cloudpickle-3.1.2-py3-none-any.whl", hash = "sha256:9acb47f6afd73f60dc1df93bb801b472f05ff42fa6c84167d25cb206be1fbf4a", size = 22228 },
 ]
 
-[[package]]
-name = "colorama"
-version = "0.4.6"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/d8/53/6f443c9a4a8358a93a6792e2acffb9d9d5cb0a5cfd8802644b7b1c9a02e4/colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44", size = 27697 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 },
-]
-
 [[package]]
 name = "cuda-bindings"
 version = "13.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "cuda-pathfinder", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
+    { name = "cuda-pathfinder", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/52/c8/b2589d68acf7e3d63e2be330b84bc25712e97ed799affbca7edd7eae25d6/cuda_bindings-13.2.0-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e865447abfb83d6a98ad5130ed3c70b1fc295ae3eeee39fd07b4ddb0671b6788", size = 5722404 },
@@ -205,11 +193,11 @@ name = "dftracer"
 version = "2.0.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "dftracer-utils" },
-    { name = "pybind11" },
-    { name = "pydftracer" },
-    { name = "setuptools" },
-    { name = "setuptools-scm" },
+    { name = "dftracer-utils", marker = "sys_platform == 'linux'" },
+    { name = "pybind11", marker = "sys_platform == 'linux'" },
+    { name = "pydftracer", marker = "sys_platform == 'linux'" },
+    { name = "setuptools", marker = "sys_platform == 'linux'" },
+    { name = "setuptools-scm", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/3c/bf/269f8c437c885bf29ebccaa6ec6d246e5a71af4a8d617b0904ce3773172d/dftracer-2.0.2.tar.gz", hash = "sha256:25f3b36af4179fe4c87d19d64b49e163cf4875f54e3480700cad86bb5ad5d99d", size = 13523152 }
 wheels = [
@@ -225,81 +213,83 @@ sdist = { url = "https://files.pythonhosted.org/packages/c5/0c/76c95c78ba1ab795c
 
 [[package]]
 name = "dgen-py"
-version = "0.2.3"
+version = "0.2.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "zstandard" },
+    { name = "zstandard", marker = "sys_platform == 'linux'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/ad/9f/e04c2c79bd91937593d79bb480c83c67141922da26ba39cff6d5f38e1673/dgen_py-0.2.3.tar.gz", hash = "sha256:fbebb1fc6b24f77abc78baaec82218c6377c1a84d8caf2f055899c1cee050ecd", size = 208444 }
+sdist = { url = "https://files.pythonhosted.org/packages/2c/ee/f839357750c2229643abf2627b43d0f12d6984e79ba6891522a3aabc52b6/dgen_py-0.2.4.tar.gz", hash = "sha256:a1820092a1ac4a793ceda1db30de66339b7a75fd8e609f6cb6be84c31ecdb625", size = 217909 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/55/42/b24dd7f7794b3a999290fa461d745caf9e1bad07643caf912f575b833b10/dgen_py-0.2.3-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:44eb5b802cf5cb721c76e30d1e94cbf86cc9d64dab44caef127f82fe6f253d6d", size = 392290 },
+    { url = "https://files.pythonhosted.org/packages/2b/91/2dae75d696c0f9e380acc7bcda09ccddb70d27455dab59e0c90424fe5881/dgen_py-0.2.4-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e49af6efdbd11860f24ce804bd1a1b3b6b71a1f5f5de55b33977f14ad9bc41ab", size = 394488 },
+    { url = "https://files.pythonhosted.org/packages/a9/54/2f7d900bee5be6177a3c7b25fe50699217c722efa0fc2f05a4366bb3cfec/dgen_py-0.2.4-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:8acba9dfc8512e9dcfa1b4496d11b8511a35c7a4611290f769792a250e61a4f7", size = 404759 },
 ]
 
 [[package]]
 name = "dlio-benchmark"
-version = "3.0.0"
+version = "3.0.1"
 source = { editable = "." }
 dependencies = [
-    { name = "dgen-py" },
-    { name = "h5py" },
-    { name = "hydra-core" },
-    { name = "mpi4py" },
-    { name = "numpy" },
-    { name = "omegaconf" },
-    { name = "pandas" },
-    { name = "pillow" },
-    { name = "psutil" },
-    { name = "pyarrow" },
-    { name = "pydftracer" },
-    { name = "pyyaml" },
-    { name = "s3dlio" },
-    { name = "tensorflow" },
-    { name = "torch" },
-    { name = "typing-extensions" },
+    { name = "dgen-py", marker = "sys_platform == 'linux'" },
+    { name = "h5py", marker = "sys_platform == 'linux'" },
+    { name = "hydra-core", marker = "sys_platform == 'linux'" },
+    { name = "mpi4py", marker = "sys_platform == 'linux'" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "omegaconf", marker = "sys_platform == 'linux'" },
+    { name = "pandas", marker = "sys_platform == 'linux'" },
+    { name = "pillow", marker = "sys_platform == 'linux'" },
+    { name = "psutil", marker = "sys_platform == 'linux'" },
+    { name = "pyarrow", marker = "sys_platform == 'linux'" },
+    { name = "pydftracer", marker = "sys_platform == 'linux'" },
+    { name = "pyyaml", marker = "sys_platform == 'linux'" },
+    { name = "s3dlio", marker = "sys_platform == 'linux'" },
+    { name = "s3torchconnector", marker = "sys_platform == 'linux'" },
+    { name = "tensorflow", marker = "sys_platform == 'linux'" },
+    { name = "torch", marker = "sys_platform == 'linux'" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
 ]
 
 [package.optional-dependencies]
 aistore = [
-    { name = "aistore" },
+    { name = "aistore", marker = "sys_platform == 'linux'" },
 ]
 dali = [
-    { name = "nvidia-dali-cuda120" },
+    { name = "nvidia-dali-cuda120", marker = "sys_platform == 'linux'" },
 ]
 dftracer = [
-    { name = "pydftracer" },
+    { name = "pydftracer", marker = "sys_platform == 'linux'" },
 ]
 parquet = [
-    { name = "pyarrow" },
+    { name = "pyarrow", marker = "sys_platform == 'linux'" },
 ]
 s3 = [
-    { name = "s3torchconnector" },
+    { name = "s3torchconnector", marker = "sys_platform == 'linux'" },
 ]
 tensorflow = [
-    { name = "tensorflow" },
+    { name = "tensorflow", marker = "sys_platform == 'linux'" },
 ]
 test = [
-    { name = "dftracer" },
-    { name = "pytest" },
-    { name = "pytest-timeout" },
-    { name = "pytest-xdist" },
+    { name = "dftracer", marker = "sys_platform == 'linux'" },
+    { name = "pytest", marker = "sys_platform == 'linux'" },
+    { name = "pytest-timeout", marker = "sys_platform == 'linux'" },
+    { name = "pytest-xdist", marker = "sys_platform == 'linux'" },
 ]
 torch = [
-    { name = "torch" },
-    { name = "torchaudio" },
-    { name = "torchvision" },
+    { name = "torch", marker = "sys_platform == 'linux'" },
+    { name = "torchaudio", marker = "sys_platform == 'linux'" },
+    { name = "torchvision", marker = "sys_platform == 'linux'" },
 ]
 
 [package.dev-dependencies]
 dev = [
-    { name = "pytest" },
-    { name = "pytest-timeout" },
+    { name = "pytest", marker = "sys_platform == 'linux'" },
+    { name = "pytest-timeout", marker = "sys_platform == 'linux'" },
 ]
 
 [package.metadata]
 requires-dist = [
     { name = "aistore", marker = "extra == 'aistore'" },
     { name = "dftracer", marker = "extra == 'test'", specifier = ">=2.0.1" },
-    { name = "dgen-py", specifier = ">=0.2.3" },
+    { name = "dgen-py", specifier = ">=0.2.4" },
     { name = "h5py", specifier = ">=3.11.0" },
     { name = "hydra-core", specifier = ">=1.3.2" },
     { name = "mpi4py", specifier = ">=3.1.4" },
@@ -317,7 +307,8 @@ requires-dist = [
     { name = "pytest-timeout", marker = "extra == 'test'" },
     { name = "pytest-xdist", marker = "extra == 'test'" },
     { name = "pyyaml", specifier = ">=6.0.0" },
-    { name = "s3dlio", specifier = ">=0.9.95" },
+    { name = "s3dlio", specifier = ">=0.9.100" },
+    { name = "s3torchconnector", specifier = ">=1.5.0" },
     { name = "s3torchconnector", marker = "extra == 's3'" },
     { name = "tensorflow", specifier = ">=2.20.0" },
     { name = "tensorflow", marker = "extra == 'tensorflow'", specifier = ">=2.13.1" },
@@ -339,17 +330,15 @@ name = "dm-tree"
 version = "0.1.9"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "absl-py" },
-    { name = "attrs" },
-    { name = "numpy" },
-    { name = "wrapt" },
+    { name = "absl-py", marker = "sys_platform == 'linux'" },
+    { name = "attrs", marker = "sys_platform == 'linux'" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "wrapt", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/a6/83/ce29720ccf934c6cfa9b9c95ebbe96558386e66886626066632b5e44afed/dm_tree-0.1.9.tar.gz", hash = "sha256:a4c7db3d3935a5a2d5e4b383fc26c6b0cd6f78c6d4605d3e7b518800ecd5342b", size = 35623 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ee/02/61aa90ab695918b4389d75c99bf0ec3cd0abacf1cadbef4053626f23ce34/dm_tree-0.1.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a8d20eeab7fde77a3ed71f07716021eb0edfb4812a128eb381d108af3a310257", size = 175012 },
     { url = "https://files.pythonhosted.org/packages/81/10/120cd40556407879c1069941bd8b0d1a75754128c1a5bf0e27dbcf2a49fc/dm_tree-0.1.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:80c43417814b1181d3367b335460bfdd30b79ee187a64220e11f6ddd093a4b15", size = 147204 },
     { url = "https://files.pythonhosted.org/packages/86/52/27607a275c12858b979b8e943d2bd3bd0f9028503bb7079d5830a8b3cac0/dm_tree-0.1.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2334cfe9d2ed4293f9f1c7aefba0657deaab9ea74b5fadd966f6d01d9b6b42d9", size = 153013 },
-    { url = "https://files.pythonhosted.org/packages/ea/97/4f78412f73a9350bc8f934441bae5b68b102c8f4240a7f06b4114b51d6de/dm_tree-0.1.9-cp312-cp312-win_amd64.whl", hash = "sha256:9020a5ce256fcc83aa4bc190cc96dd66e87685db0a6e501b0c06aa492c2e38fc", size = 102022 },
 ]
 
 [[package]]
@@ -401,7 +390,7 @@ name = "google-pasta"
 version = "0.2.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "six" },
+    { name = "six", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/35/4a/0bd53b36ff0323d10d5f24ebd67af2de10a1117f5cf4d7add90df92756f1/google-pasta-0.2.0.tar.gz", hash = "sha256:c9f2c8dfc8f96d0d5808299920721be30c9eec37f2389f28904f454565c8a16e", size = 40430 }
 wheels = [
@@ -413,20 +402,17 @@ name = "grpcio"
 version = "1.80.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "typing-extensions" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/b7/48/af6173dbca4454f4637a4678b67f52ca7e0c1ed7d5894d89d434fecede05/grpcio-1.80.0.tar.gz", hash = "sha256:29aca15edd0688c22ba01d7cc01cb000d72b2033f4a3c72a81a19b56fd143257", size = 12978905 }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/5c/e8/a2b749265eb3415abc94f2e619bbd9e9707bebdda787e61c593004ec927a/grpcio-1.80.0-cp312-cp312-linux_armv7l.whl", hash = "sha256:c624cc9f1008361014378c9d776de7182b11fe8b2e5a81bc69f23a295f2a1ad0", size = 6015616 },
-    { url = "https://files.pythonhosted.org/packages/3e/97/b1282161a15d699d1e90c360df18d19165a045ce1c343c7f313f5e8a0b77/grpcio-1.80.0-cp312-cp312-macosx_11_0_universal2.whl", hash = "sha256:f49eddcac43c3bf350c0385366a58f36bed8cc2c0ec35ef7b74b49e56552c0c2", size = 12014204 },
     { url = "https://files.pythonhosted.org/packages/6e/5e/d319c6e997b50c155ac5a8cb12f5173d5b42677510e886d250d50264949d/grpcio-1.80.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d334591df610ab94714048e0d5b4f3dd5ad1bee74dfec11eee344220077a79de", size = 6563866 },
     { url = "https://files.pythonhosted.org/packages/ae/f6/fdd975a2cb4d78eb67769a7b3b3830970bfa2e919f1decf724ae4445f42c/grpcio-1.80.0-cp312-cp312-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:0cb517eb1d0d0aaf1d87af7cc5b801d686557c1d88b2619f5e31fab3c2315921", size = 7273060 },
     { url = "https://files.pythonhosted.org/packages/db/f0/a3deb5feba60d9538a962913e37bd2e69a195f1c3376a3dd44fe0427e996/grpcio-1.80.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4e78c4ac0d97dc2e569b2f4bcbbb447491167cb358d1a389fc4af71ab6f70411", size = 6782121 },
     { url = "https://files.pythonhosted.org/packages/ca/84/36c6dcfddc093e108141f757c407902a05085e0c328007cb090d56646cdf/grpcio-1.80.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:2ed770b4c06984f3b47eb0517b1c69ad0b84ef3f40128f51448433be904634cd", size = 7383811 },
     { url = "https://files.pythonhosted.org/packages/7c/ef/f3a77e3dc5b471a0ec86c564c98d6adfa3510d38f8ee99010410858d591e/grpcio-1.80.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:256507e2f524092f1473071a05e65a5b10d84b82e3ff24c5b571513cfaa61e2f", size = 8393860 },
     { url = "https://files.pythonhosted.org/packages/9b/8d/9d4d27ed7f33d109c50d6b5ce578a9914aa68edab75d65869a17e630a8d1/grpcio-1.80.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:9a6284a5d907c37db53350645567c522be314bac859a64a7a5ca63b77bb7958f", size = 7830132 },
-    { url = "https://files.pythonhosted.org/packages/14/e4/9990b41c6d7a44e1e9dee8ac11d7a9802ba1378b40d77468a7761d1ad288/grpcio-1.80.0-cp312-cp312-win32.whl", hash = "sha256:c71309cfce2f22be26aa4a847357c502db6c621f1a49825ae98aa0907595b193", size = 4140904 },
-    { url = "https://files.pythonhosted.org/packages/2f/2c/296f6138caca1f4b92a31ace4ae1b87dab692fc16a7a3417af3bb3c805bf/grpcio-1.80.0-cp312-cp312-win_amd64.whl", hash = "sha256:9fe648599c0e37594c4809d81a9e77bd138cc82eb8baa71b6a86af65426723ff", size = 4880944 },
 ]
 
 [[package]]
@@ -434,18 +420,14 @@ name = "h5py"
 version = "3.16.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/db/33/acd0ce6863b6c0d7735007df01815403f5589a21ff8c2e1ee2587a38f548/h5py-3.16.0.tar.gz", hash = "sha256:a0dbaad796840ccaa67a4c144a0d0c8080073c34c76d5a6941d6818678ef2738", size = 446526 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/c8/c0/5d4119dba94093bbafede500d3defd2f5eab7897732998c04b54021e530b/h5py-3.16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c5313566f4643121a78503a473f0fb1e6dcc541d5115c44f05e037609c565c4d", size = 3685604 },
-    { url = "https://files.pythonhosted.org/packages/b0/42/c84efcc1d4caebafb1ecd8be4643f39c85c47a80fe254d92b8b43b1eadaf/h5py-3.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:42b012933a83e1a558c673176676a10ce2fd3759976a0fedee1e672d1e04fc9d", size = 3061940 },
     { url = "https://files.pythonhosted.org/packages/89/84/06281c82d4d1686fde1ac6b0f307c50918f1c0151062445ab3b6fa5a921d/h5py-3.16.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:ff24039e2573297787c3063df64b60aab0591980ac898329a08b0320e0cf2527", size = 5198852 },
     { url = "https://files.pythonhosted.org/packages/9e/e9/1a19e42cd43cc1365e127db6aae85e1c671da1d9a5d746f4d34a50edb577/h5py-3.16.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:dfc21898ff025f1e8e67e194965a95a8d4754f452f83454538f98f8a3fcb207e", size = 5405250 },
     { url = "https://files.pythonhosted.org/packages/b7/8e/9790c1655eabeb85b92b1ecab7d7e62a2069e53baefd58c98f0909c7a948/h5py-3.16.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:698dd69291272642ffda44a0ecd6cd3bda5faf9621452d255f57ce91487b9794", size = 5190108 },
     { url = "https://files.pythonhosted.org/packages/51/d7/ab693274f1bd7e8c5f9fdd6c7003a88d59bedeaf8752716a55f532924fbb/h5py-3.16.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2b2c02b0a160faed5fb33f1ba8a264a37ee240b22e049ecc827345d0d9043074", size = 5419216 },
-    { url = "https://files.pythonhosted.org/packages/03/c1/0976b235cf29ead553e22f2fb6385a8252b533715e00d0ae52ed7b900582/h5py-3.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:96b422019a1c8975c2d5dadcf61d4ba6f01c31f92bbde6e4649607885fe502d6", size = 3182868 },
-    { url = "https://files.pythonhosted.org/packages/14/d9/866b7e570b39070f92d47b0ff1800f0f8239b6f9e45f02363d7112336c1f/h5py-3.16.0-cp312-cp312-win_arm64.whl", hash = "sha256:39c2838fb1e8d97bcf1755e60ad1f3dd76a7b2a475928dc321672752678b96db", size = 2653286 },
 ]
 
 [[package]]
@@ -462,9 +444,9 @@ name = "hydra-core"
 version = "1.3.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "antlr4-python3-runtime" },
-    { name = "omegaconf" },
-    { name = "packaging" },
+    { name = "antlr4-python3-runtime", marker = "sys_platform == 'linux'" },
+    { name = "omegaconf", marker = "sys_platform == 'linux'" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/6d/8e/07e42bc434a847154083b315779b0a81d567154504624e181caf2c71cd98/hydra-core-1.3.2.tar.gz", hash = "sha256:8a878ed67216997c3e9d88a8e72e7b4767e81af37afb4ea3334b269a4390a824", size = 3263494 }
 wheels = [
@@ -494,7 +476,7 @@ name = "jinja2"
 version = "3.1.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "markupsafe" },
+    { name = "markupsafe", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/df/bf/f7da0350254c0ed7c72f3e33cef02e048281fec7ecec5f032d4aac52226b/jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d", size = 245115 }
 wheels = [
@@ -506,14 +488,14 @@ name = "keras"
 version = "3.14.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "absl-py" },
-    { name = "h5py" },
-    { name = "ml-dtypes" },
-    { name = "namex" },
-    { name = "numpy" },
-    { name = "optree" },
-    { name = "packaging" },
-    { name = "rich" },
+    { name = "absl-py", marker = "sys_platform == 'linux'" },
+    { name = "h5py", marker = "sys_platform == 'linux'" },
+    { name = "ml-dtypes", marker = "sys_platform == 'linux'" },
+    { name = "namex", marker = "sys_platform == 'linux'" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "optree", marker = "sys_platform == 'linux'" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
+    { name = "rich", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/88/ce/47874047a49eedc2a5d3b41bc4f1f572bb637f51e4351ef3538e49a63800/keras-3.14.0.tar.gz", hash = "sha256:86fcf8249a25264a566ac393c287c7ad657000e5e62615dcaad4b3472a17aeda", size = 1263098 }
 wheels = [
@@ -526,15 +508,10 @@ version = "18.1.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/6e/5c/ca35e19a4f142adffa27e3d652196b7362fa612243e2b916845d801454fc/libclang-18.1.1.tar.gz", hash = "sha256:a1214966d08d73d971287fc3ead8dfaf82eb07fb197680d8b3859dbbbbf78250", size = 39612 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/4b/49/f5e3e7e1419872b69f6f5e82ba56e33955a74bd537d8a1f5f1eff2f3668a/libclang-18.1.1-1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:0b2e143f0fac830156feb56f9231ff8338c20aecfe72b4ffe96f19e5a1dbb69a", size = 25836045 },
-    { url = "https://files.pythonhosted.org/packages/e2/e5/fc61bbded91a8830ccce94c5294ecd6e88e496cc85f6704bf350c0634b70/libclang-18.1.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:6f14c3f194704e5d09769108f03185fce7acaf1d1ae4bbb2f30a72c2400cb7c5", size = 26502641 },
-    { url = "https://files.pythonhosted.org/packages/db/ed/1df62b44db2583375f6a8a5e2ca5432bbdc3edb477942b9b7c848c720055/libclang-18.1.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:83ce5045d101b669ac38e6da8e58765f12da2d3aafb3b9b98d88b286a60964d8", size = 26420207 },
     { url = "https://files.pythonhosted.org/packages/1d/fc/716c1e62e512ef1c160e7984a73a5fc7df45166f2ff3f254e71c58076f7c/libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl", hash = "sha256:c533091d8a3bbf7460a00cb6c1a71da93bffe148f172c7d03b1c31fbf8aa2a0b", size = 24515943 },
     { url = "https://files.pythonhosted.org/packages/3c/3d/f0ac1150280d8d20d059608cf2d5ff61b7c3b7f7bcf9c0f425ab92df769a/libclang-18.1.1-py2.py3-none-manylinux2014_aarch64.whl", hash = "sha256:54dda940a4a0491a9d1532bf071ea3ef26e6dbaf03b5000ed94dd7174e8f9592", size = 23784972 },
     { url = "https://files.pythonhosted.org/packages/fe/2f/d920822c2b1ce9326a4c78c0c2b4aa3fde610c7ee9f631b600acb5376c26/libclang-18.1.1-py2.py3-none-manylinux2014_armv7l.whl", hash = "sha256:cf4a99b05376513717ab5d82a0db832c56ccea4fd61a69dbb7bccf2dfb207dbe", size = 20259606 },
     { url = "https://files.pythonhosted.org/packages/2d/c2/de1db8c6d413597076a4259cea409b83459b2db997c003578affdd32bf66/libclang-18.1.1-py2.py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:69f8eb8f65c279e765ffd28aaa7e9e364c776c17618af8bff22a8df58677ff4f", size = 24921494 },
-    { url = "https://files.pythonhosted.org/packages/0b/2d/3f480b1e1d31eb3d6de5e3ef641954e5c67430d5ac93b7fa7e07589576c7/libclang-18.1.1-py2.py3-none-win_amd64.whl", hash = "sha256:4dd2d3b82fab35e2bf9ca717d7b63ac990a3519c7e312f19fa8e86dcc712f7fb", size = 26415083 },
-    { url = "https://files.pythonhosted.org/packages/71/cf/e01dc4cc79779cd82d77888a88ae2fa424d93b445ad4f6c02bfc18335b70/libclang-18.1.1-py2.py3-none-win_arm64.whl", hash = "sha256:3f0e1f49f04d3cd198985fea0511576b0aee16f9ff0e0f0cad7f9c57ec3c20e8", size = 22361112 },
 ]
 
 [[package]]
@@ -560,7 +537,7 @@ name = "markdown-it-py"
 version = "4.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "mdurl" },
+    { name = "mdurl", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/5b/f5/4ec618ed16cc4f8fb3b701563655a69816155e79e24a17b651541804721d/markdown_it_py-4.0.0.tar.gz", hash = "sha256:cb0a2b4aa34f932c007117b194e945bd74e0ec24133ceb5bac59009cda1cb9f3", size = 73070 }
 wheels = [
@@ -573,17 +550,12 @@ version = "3.0.3"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/7e/99/7690b6d4034fffd95959cbe0c02de8deb3098cc577c67bb6a24fe5d7caa7/markupsafe-3.0.3.tar.gz", hash = "sha256:722695808f4b6457b320fdc131280796bdceb04ab50fe1795cd540799ebe1698", size = 80313 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/5a/72/147da192e38635ada20e0a2e1a51cf8823d2119ce8883f7053879c2199b5/markupsafe-3.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d53197da72cc091b024dd97249dfc7794d6a56530370992a5e1a08983ad9230e", size = 11615 },
-    { url = "https://files.pythonhosted.org/packages/9a/81/7e4e08678a1f98521201c3079f77db69fb552acd56067661f8c2f534a718/markupsafe-3.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1872df69a4de6aead3491198eaf13810b565bdbeec3ae2dc8780f14458ec73ce", size = 12020 },
     { url = "https://files.pythonhosted.org/packages/1e/2c/799f4742efc39633a1b54a92eec4082e4f815314869865d876824c257c1e/markupsafe-3.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3a7e8ae81ae39e62a41ec302f972ba6ae23a5c5396c8e60113e9066ef893da0d", size = 24332 },
     { url = "https://files.pythonhosted.org/packages/3c/2e/8d0c2ab90a8c1d9a24f0399058ab8519a3279d1bd4289511d74e909f060e/markupsafe-3.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d6dd0be5b5b189d31db7cda48b91d7e0a9795f31430b7f271219ab30f1d3ac9d", size = 22947 },
     { url = "https://files.pythonhosted.org/packages/2c/54/887f3092a85238093a0b2154bd629c89444f395618842e8b0c41783898ea/markupsafe-3.0.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:94c6f0bb423f739146aec64595853541634bde58b2135f27f61c1ffd1cd4d16a", size = 21962 },
     { url = "https://files.pythonhosted.org/packages/c9/2f/336b8c7b6f4a4d95e91119dc8521402461b74a485558d8f238a68312f11c/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:be8813b57049a7dc738189df53d69395eba14fb99345e0a5994914a3864c8a4b", size = 23760 },
     { url = "https://files.pythonhosted.org/packages/32/43/67935f2b7e4982ffb50a4d169b724d74b62a3964bc1a9a527f5ac4f1ee2b/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:83891d0e9fb81a825d9a6d61e3f07550ca70a076484292a70fde82c4b807286f", size = 21529 },
     { url = "https://files.pythonhosted.org/packages/89/e0/4486f11e51bbba8b0c041098859e869e304d1c261e59244baa3d295d47b7/markupsafe-3.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:77f0643abe7495da77fb436f50f8dab76dbc6e5fd25d39589a0f1fe6548bfa2b", size = 23015 },
-    { url = "https://files.pythonhosted.org/packages/2f/e1/78ee7a023dac597a5825441ebd17170785a9dab23de95d2c7508ade94e0e/markupsafe-3.0.3-cp312-cp312-win32.whl", hash = "sha256:d88b440e37a16e651bda4c7c2b930eb586fd15ca7406cb39e211fcff3bf3017d", size = 14540 },
-    { url = "https://files.pythonhosted.org/packages/aa/5b/bec5aa9bbbb2c946ca2733ef9c4ca91c91b6a24580193e891b5f7dbe8e1e/markupsafe-3.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:26a5784ded40c9e318cfc2bdb30fe164bdb8665ded9cd64d500a34fb42067b1c", size = 15105 },
-    { url = "https://files.pythonhosted.org/packages/e5/f1/216fc1bbfd74011693a4fd837e7026152e89c4bcf3e77b6692fba9923123/markupsafe-3.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:35add3b638a5d900e807944a078b51922212fb3dedb01633a8defc4b01a3c85f", size = 13906 },
 ]
 
 [[package]]
@@ -600,15 +572,12 @@ name = "ml-dtypes"
 version = "0.5.4"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/0e/4a/c27b42ed9b1c7d13d9ba8b6905dece787d6259152f2309338aed29b2447b/ml_dtypes-0.5.4.tar.gz", hash = "sha256:8ab06a50fb9bf9666dd0fe5dfb4676fa2b0ac0f31ecff72a6c3af8e22c063453", size = 692314 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/a8/b8/3c70881695e056f8a32f8b941126cf78775d9a4d7feba8abcb52cb7b04f2/ml_dtypes-0.5.4-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a174837a64f5b16cab6f368171a1a03a27936b31699d167684073ff1c4237dac", size = 676927 },
     { url = "https://files.pythonhosted.org/packages/54/0f/428ef6881782e5ebb7eca459689448c0394fa0a80bea3aa9262cba5445ea/ml_dtypes-0.5.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a7f7c643e8b1320fd958bf098aa7ecf70623a42ec5154e3be3be673f4c34d900", size = 5028464 },
     { url = "https://files.pythonhosted.org/packages/3a/cb/28ce52eb94390dda42599c98ea0204d74799e4d8047a0eb559b6fd648056/ml_dtypes-0.5.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ad459e99793fa6e13bd5b7e6792c8f9190b4e5a1b45c63aba14a4d0a7f1d5ff", size = 5009002 },
-    { url = "https://files.pythonhosted.org/packages/f5/f0/0cfadd537c5470378b1b32bd859cf2824972174b51b873c9d95cfd7475a5/ml_dtypes-0.5.4-cp312-cp312-win_amd64.whl", hash = "sha256:c1a953995cccb9e25a4ae19e34316671e4e2edaebe4cf538229b1fc7109087b7", size = 212222 },
-    { url = "https://files.pythonhosted.org/packages/16/2e/9acc86985bfad8f2c2d30291b27cd2bb4c74cea08695bd540906ed744249/ml_dtypes-0.5.4-cp312-cp312-win_arm64.whl", hash = "sha256:9bad06436568442575beb2d03389aa7456c690a5b05892c471215bfd8cf39460", size = 160793 },
 ]
 
 [[package]]
@@ -617,16 +586,10 @@ version = "4.1.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/62/74/28ea85b0b949cad827ea50720e00e814e88c8fd536c27c3c491e4f025724/mpi4py-4.1.1.tar.gz", hash = "sha256:eb2c8489bdbc47fdc6b26ca7576e927a11b070b6de196a443132766b3d0a2a22", size = 500518 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/36/b3/2e7df40608f2188dca16e38f8030add1071f06b1cd94dd8a4e16b9acbd84/mpi4py-4.1.1-cp310-abi3-macosx_10_9_x86_64.whl", hash = "sha256:1586f5d1557abed9cba7e984d18f32e787b353be0986e599974db177ae36329a", size = 1422849 },
-    { url = "https://files.pythonhosted.org/packages/6d/ed/970bd3edc0e614eccc726fa406255b88f728a8bc059e81f96f28d6ede0af/mpi4py-4.1.1-cp310-abi3-macosx_11_0_arm64.whl", hash = "sha256:ba85e4778d63c750226de95115c92b709f38d7e661be660a275da4f0992ee197", size = 1326982 },
     { url = "https://files.pythonhosted.org/packages/5d/c3/f9a5d1f9ba52ac6386bf3d3550027f42a6b102b0432113cc43294420feb2/mpi4py-4.1.1-cp310-abi3-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0a8332884626994d9ef48da233dc7a0355f4868dd7ff59f078d5813a2935b930", size = 1373127 },
     { url = "https://files.pythonhosted.org/packages/84/d1/1fe75025df801d817ed49371c719559f742f3f263323442d34dbe3366af3/mpi4py-4.1.1-cp310-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6e0352860f0b3e18bc0dcb47e42e583ccb9472f89752d711a6fca46a38670554", size = 1225134 },
-    { url = "https://files.pythonhosted.org/packages/40/44/d653fec0e4ca8181645da4bfb2763017625e5b3f151b208fadd932cb1766/mpi4py-4.1.1-cp310-abi3-win_amd64.whl", hash = "sha256:0f46dfe666a599e4bd2641116b2b4852a3ed9d37915edf98fae471d666663128", size = 1478863 },
-    { url = "https://files.pythonhosted.org/packages/ff/2c/e201cd4828555f10306a5439875cbd0ecfba766ace01ff5c6df43f795650/mpi4py-4.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d4403a7cec985be9963efc626193e6df3f63f5ada0c26373c28e640e623e56c3", size = 1669517 },
-    { url = "https://files.pythonhosted.org/packages/7b/53/18d978c3a19deecf38217ce54319e6c9162fec3569c4256c039b66eac2f4/mpi4py-4.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8a2ffccc9f3a8c7c957403faad594d650c60234ac08cbedf45beaa96602debe9", size = 1454721 },
     { url = "https://files.pythonhosted.org/packages/ee/15/b908d1d23a4bd2bd7b2e98de5df23b26e43145119fe294728bf89211b935/mpi4py-4.1.1-cp312-cp312-manylinux1_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:ed3d9b619bf197a290f7fd67eb61b1c2a5c204afd9621651a50dc0b1c1280d45", size = 1448977 },
     { url = "https://files.pythonhosted.org/packages/5d/19/088a2d37e80e0feb7851853b2a71cbe6f9b18bdf0eab680977864ea83aab/mpi4py-4.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0699c194db5d95fc2085711e4e0013083bd7ae9a88438e1fd64ddb67e9b0cf9e", size = 1318737 },
-    { url = "https://files.pythonhosted.org/packages/97/3a/526261f39bf096e5ff396d18b76740a58d872425612ff84113dd85c2c08e/mpi4py-4.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:0abf5490c3d49c30542b461bfc5ad88dd7d147a4bdb456b7163640577fdfef88", size = 1725676 },
 ]
 
 [[package]]
@@ -644,14 +607,10 @@ version = "0.21.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/e3/60/f79b9b013a16fa3a58350c9295ddc6789f2e335f36ea61ed10a21b215364/msgspec-0.21.1.tar.gz", hash = "sha256:2313508e394b0d208f8f56892ca9b2799e2561329de9763b19619595a6c0f72c", size = 319193 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/6e/cf/317224852c00248c620a9bcf4b26e2e4ab8afd752f18d2a6ef73ebd423b6/msgspec-0.21.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:d4248cf0b6129b7d230eacd493c17cc2d4f3989f3bb7f633a928a85b7dcfa251", size = 196188 },
-    { url = "https://files.pythonhosted.org/packages/6d/81/074612945c0666078f7366f40000013de9f6ba687491d450df699bceebc9/msgspec-0.21.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:5102c7e9b3acff82178449b85006d96310e690291bb1ea0142f1b24bcb8aabcb", size = 188473 },
     { url = "https://files.pythonhosted.org/packages/8a/37/655101799590bcc5fddb2bd3fe0e6194e816c2d1da7c361725f5eb89a910/msgspec-0.21.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:846758412e9518252b2ac9bffd6f0e54d9ff614f5f9488df7749f81ff5c80920", size = 218871 },
     { url = "https://files.pythonhosted.org/packages/b5/d1/d4cd9fe89c7d400d7a18f86ccc94daa3f0927f53558846fcb60791dce5d6/msgspec-0.21.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21995e74b5c598c2e004110ad66ec7f1b8c20bf2bcf3b2de8fd9a3094422d3ff", size = 225025 },
     { url = "https://files.pythonhosted.org/packages/24/bf/e20549e602b9edccadeeff98760345a416f9cce846a657e8b18e3396b212/msgspec-0.21.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6129f0cca52992e898fd5344187f7c8127b63d810b2fd73e36fca73b4c6475ee", size = 222672 },
     { url = "https://files.pythonhosted.org/packages/b4/68/04d7a8f0f786545cf9b8c280c57aa6befb5977af6e884b8b54191cbe44b3/msgspec-0.21.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ef3ec2296248d1f8b9231acb051b6d471dfde8f21819e86c9adaaa9f42918521", size = 227303 },
-    { url = "https://files.pythonhosted.org/packages/cc/4d/619866af2840875be408047bf9e70ceafbae6ab50660de7134ed1b25eb86/msgspec-0.21.1-cp312-cp312-win_amd64.whl", hash = "sha256:d4ab834a054c6f0cbeef6df9e7e1b33d5f1bc7b86dea1d2fd7cad003873e783d", size = 190017 },
-    { url = "https://files.pythonhosted.org/packages/5e/2e/a8f9eca8fd00e097d7a9e99ba8a4685db994494448e3d4f0b7f6e9a3c0f7/msgspec-0.21.1-cp312-cp312-win_arm64.whl", hash = "sha256:628aaa35c74950a8c59da330d7e98917e1c7188f983745782027748ee4ca573e", size = 175345 },
 ]
 
 [[package]]
@@ -678,17 +637,10 @@ version = "2.4.4"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/d7/9f/b8cef5bffa569759033adda9481211426f12f53299629b410340795c2514/numpy-2.4.4.tar.gz", hash = "sha256:2d390634c5182175533585cc89f3608a4682ccb173cc9bb940b2881c8d6f8fa0", size = 20731587 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/28/05/32396bec30fb2263770ee910142f49c1476d08e8ad41abf8403806b520ce/numpy-2.4.4-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15716cfef24d3a9762e3acdf87e27f58dc823d1348f765bbea6bef8c639bfa1b", size = 16689272 },
-    { url = "https://files.pythonhosted.org/packages/c5/f3/a983d28637bfcd763a9c7aafdb6d5c0ebf3d487d1e1459ffdb57e2f01117/numpy-2.4.4-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:23cbfd4c17357c81021f21540da84ee282b9c8fba38a03b7b9d09ba6b951421e", size = 14699573 },
-    { url = "https://files.pythonhosted.org/packages/9b/fd/e5ecca1e78c05106d98028114f5c00d3eddb41207686b2b7de3e477b0e22/numpy-2.4.4-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:8b3b60bb7cba2c8c81837661c488637eee696f59a877788a396d33150c35d842", size = 5204782 },
-    { url = "https://files.pythonhosted.org/packages/de/2f/702a4594413c1a8632092beae8aba00f1d67947389369b3777aed783fdca/numpy-2.4.4-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:e4a010c27ff6f210ff4c6ef34394cd61470d01014439b192ec22552ee867f2a8", size = 6552038 },
     { url = "https://files.pythonhosted.org/packages/7f/37/eed308a8f56cba4d1fdf467a4fc67ef4ff4bf1c888f5fc980481890104b1/numpy-2.4.4-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f9e75681b59ddaa5e659898085ae0eaea229d054f2ac0c7e563a62205a700121", size = 15670666 },
     { url = "https://files.pythonhosted.org/packages/0a/0d/0e3ecece05b7a7e87ab9fb587855548da437a061326fff64a223b6dcb78a/numpy-2.4.4-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:81f4a14bee47aec54f883e0cad2d73986640c1590eb9bfaaba7ad17394481e6e", size = 16645480 },
     { url = "https://files.pythonhosted.org/packages/34/49/f2312c154b82a286758ee2f1743336d50651f8b5195db18cdb63675ff649/numpy-2.4.4-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:62d6b0f03b694173f9fcb1fb317f7222fd0b0b103e784c6549f5e53a27718c44", size = 17020036 },
     { url = "https://files.pythonhosted.org/packages/7b/e9/736d17bd77f1b0ec4f9901aaec129c00d59f5d84d5e79bba540ef12c2330/numpy-2.4.4-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fbc356aae7adf9e6336d336b9c8111d390a05df88f1805573ebb0807bd06fd1d", size = 18368643 },
-    { url = "https://files.pythonhosted.org/packages/63/f6/d417977c5f519b17c8a5c3bc9e8304b0908b0e21136fe43bf628a1343914/numpy-2.4.4-cp312-cp312-win32.whl", hash = "sha256:0d35aea54ad1d420c812bfa0385c71cd7cc5bcf7c65fed95fc2cd02fe8c79827", size = 5961117 },
-    { url = "https://files.pythonhosted.org/packages/2d/5b/e1deebf88ff431b01b7406ca3583ab2bbb90972bbe1c568732e49c844f7e/numpy-2.4.4-cp312-cp312-win_amd64.whl", hash = "sha256:b5f0362dc928a6ecd9db58868fca5e48485205e3855957bdedea308f8672ea4a", size = 12320584 },
-    { url = "https://files.pythonhosted.org/packages/58/89/e4e856ac82a68c3ed64486a544977d0e7bdd18b8da75b78a577ca31c4395/numpy-2.4.4-cp312-cp312-win_arm64.whl", hash = "sha256:846300f379b5b12cc769334464656bc882e0735d27d9726568bc932fdc49d5ec", size = 10221450 },
 ]
 
 [[package]]
@@ -732,7 +684,7 @@ name = "nvidia-cudnn-cu13"
 version = "9.19.0.56"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
+    { name = "nvidia-cublas", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f1/84/26025437c1e6b61a707442184fa0c03d083b661adf3a3eecfd6d21677740/nvidia_cudnn_cu13-9.19.0.56-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:6ed29ffaee1176c612daf442e4dd6cfeb6a0caa43ddcbeb59da94953030b1be4", size = 433781201 },
@@ -744,7 +696,7 @@ name = "nvidia-cufft"
 version = "12.0.0.61"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
+    { name = "nvidia-nvjitlink", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/8b/ae/f417a75c0259e85c1d2f83ca4e960289a5f814ed0cea74d18c353d3e989d/nvidia_cufft-12.0.0.61-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:2708c852ef8cd89d1d2068bdbece0aa188813a0c934db3779b9b1faa8442e5f5", size = 214053554 },
@@ -774,9 +726,9 @@ name = "nvidia-cusolver"
 version = "12.0.4.66"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
-    { name = "nvidia-cusparse", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
-    { name = "nvidia-nvjitlink", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
+    { name = "nvidia-cublas", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-cusparse", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nvjitlink", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/c8/c3/b30c9e935fc01e3da443ec0116ed1b2a009bb867f5324d3f2d7e533e776b/nvidia_cusolver-12.0.4.66-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:02c2457eaa9e39de20f880f4bd8820e6a1cfb9f9a34f820eb12a155aa5bc92d2", size = 223467760 },
@@ -788,7 +740,7 @@ name = "nvidia-cusparse"
 version = "12.6.3.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink", marker = "sys_platform != 'emscripten' and sys_platform != 'win32'" },
+    { name = "nvidia-nvjitlink", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f8/94/5c26f33738ae35276672f12615a64bd008ed5be6d1ebcb23579285d960a9/nvidia_cusparse-12.6.3.3-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:80bcc4662f23f1054ee334a15c72b8940402975e0eab63178fc7e670aa59472c", size = 162155568 },
@@ -809,16 +761,16 @@ name = "nvidia-dali-cuda120"
 version = "2.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "astunparse" },
-    { name = "dm-tree" },
-    { name = "gast" },
-    { name = "makefun" },
-    { name = "numpy" },
-    { name = "nvidia-libnvcomp-cu12" },
-    { name = "nvidia-nvimgcodec-cu12", extra = ["all"] },
-    { name = "nvtx" },
-    { name = "packaging" },
-    { name = "six" },
+    { name = "astunparse", marker = "sys_platform == 'linux'" },
+    { name = "dm-tree", marker = "sys_platform == 'linux'" },
+    { name = "gast", marker = "sys_platform == 'linux'" },
+    { name = "makefun", marker = "sys_platform == 'linux'" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-libnvcomp-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nvimgcodec-cu12", extra = ["all"], marker = "sys_platform == 'linux'" },
+    { name = "nvtx", marker = "sys_platform == 'linux'" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
+    { name = "six", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/c0/f9/af5c0888c53cea8d869c54d454c3c97b9698ebe24add01abcee4febb1abd/nvidia_dali_cuda120-2.0.0-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:afbde358aeccc508ad718789d83481cc0b6e54d6fa876326955103027cb6a948", size = 293086967 },
@@ -832,7 +784,6 @@ source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/f8/23/b20f2381c7e92c704386428fe79736a13c50f452376453fdc60fcc0ec1b0/nvidia_libnvcomp_cu12-5.1.0.21-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:77dfb3cb8c8995dfa0279ba99b0501e03cbe77e876aab44f4693abdcfac549ce", size = 28802614 },
     { url = "https://files.pythonhosted.org/packages/08/ab/844fcbaa46cc1242632b4b94b4ffc210ec3d8d8f30ad8f7f1c27767389a9/nvidia_libnvcomp_cu12-5.1.0.21-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:68de61183edb9a870c9a608273a2b5da97dea18e3552096c61fafd9bb2689db0", size = 28958714 },
-    { url = "https://files.pythonhosted.org/packages/c4/cc/c6e92d9587b9ad63c08b1b94c5ae2216319491d0bd4f40f2a9a431d4841f/nvidia_libnvcomp_cu12-5.1.0.21-py3-none-win_amd64.whl", hash = "sha256:1352c7c4264ee5357f8f20e4a8da7f2f91debe21d8968f44576a7f4b51f91533", size = 28490640 },
 ]
 
 [[package]]
@@ -851,15 +802,14 @@ source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/63/48/74d33dd126f84a4212480e2cf07504f457b5bae5acd33c0f6bf839ea17d4/nvidia_nvimgcodec_cu12-0.7.0.11-py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:52d834be8122bb5b8fc3151cc3bedb95368b3e7ac76af0c4561772ab2a847b2b", size = 27409358 },
     { url = "https://files.pythonhosted.org/packages/73/b4/f06528ebcb82da84f4a96efe7a210c277767cb86ad2f61f8b1a17d17f251/nvidia_nvimgcodec_cu12-0.7.0.11-py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:32d3457859c5784e4c0f6a2f56b6a9afec8fe646cec1cbe4bb5c320948d92dfe", size = 33735220 },
-    { url = "https://files.pythonhosted.org/packages/be/79/95b36049a9504d59d79929e9f3bec001b270f29aec8486e5fb9783a9502c/nvidia_nvimgcodec_cu12-0.7.0.11-py3-none-win_amd64.whl", hash = "sha256:495e07e071fcb2115f7f1948a04f6c51f96d61b83c614af753f7cc1bf369a46c", size = 18448810 },
 ]
 
 [package.optional-dependencies]
 all = [
-    { name = "nvidia-libnvcomp-cu12" },
-    { name = "nvidia-nvjpeg-cu12" },
-    { name = "nvidia-nvjpeg2k-cu12" },
-    { name = "nvidia-nvtiff-cu12" },
+    { name = "nvidia-libnvcomp-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nvjpeg-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nvjpeg2k-cu12", marker = "sys_platform == 'linux'" },
+    { name = "nvidia-nvtiff-cu12", marker = "sys_platform == 'linux'" },
 ]
 
 [[package]]
@@ -878,7 +828,6 @@ source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/1d/48/5c12a3e6afe070ff563375cc72b42e9c7400bd0b44c734591049410be7fd/nvidia_nvjpeg_cu12-12.4.0.76-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f52c5ef7cf56e8bffac8903a59f14494017a52e4fe89d5a1d16c1e88d7bbf194", size = 5273693 },
     { url = "https://files.pythonhosted.org/packages/57/68/d3526394584134a23f2500833c62d3352e1feda7547041f4612b1a183aa3/nvidia_nvjpeg_cu12-12.4.0.76-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3888f10b32fbd58e80166c48e01073732d752fa5f167b7cb5b9615f1c6375a20", size = 5313609 },
-    { url = "https://files.pythonhosted.org/packages/bc/28/e05bb8e6cdb98e79c6822f8bbd7154a26d8102412b3a0bfd5e4c7c52db8c/nvidia_nvjpeg_cu12-12.4.0.76-py3-none-win_amd64.whl", hash = "sha256:21923726db667bd53050d0de88320983ff423322b7f376057dd943e487c40abc", size = 4741398 },
 ]
 
 [[package]]
@@ -888,7 +837,6 @@ source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/01/96/9849f4dfb543375ada67038e059a702a7e0c07d5af5311bbf7a21f221881/nvidia_nvjpeg2k_cu12-0.10.0.49-py3-none-manylinux2014_aarch64.whl", hash = "sha256:0da30962c81bed210743f2128ba9d05bd1c3749064a948e0b2edb0d19d29c539", size = 7569931 },
     { url = "https://files.pythonhosted.org/packages/65/f6/fe957c090edda0168c39e7bbf57cfeb3178f4cf58519538ffbf249a50511/nvidia_nvjpeg2k_cu12-0.10.0.49-py3-none-manylinux2014_x86_64.whl", hash = "sha256:72017675eafa928b19e50dd9ab82bfa96e884c573ff68e19c42a4a8cef6f8cf1", size = 7628020 },
-    { url = "https://files.pythonhosted.org/packages/ee/c1/4a690ca70fea762c6b3f3f76434000fab3802690f6fe635034d85ed48ecc/nvidia_nvjpeg2k_cu12-0.10.0.49-py3-none-win_amd64.whl", hash = "sha256:fc752a1d0c4fbc42e6a640e89495e746ec5254fc5fdbdd33fea34fed736caa6b", size = 7200999 },
 ]
 
 [[package]]
@@ -907,7 +855,6 @@ source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/4e/ee/b23cb613daadd7e3c78dea05eb25cd658a12287577e65ceaf8a7b59034c7/nvidia_nvtiff_cu12-0.7.0.79-py3-none-manylinux2014_aarch64.whl", hash = "sha256:461e82965c3be5ea6ca81fe71efb49fe191939760566c7621f133f64d4936035", size = 3811854 },
     { url = "https://files.pythonhosted.org/packages/75/21/5f5adc5026beb699f5e1bf45a352f899e8114661907671d00b499fa6a1da/nvidia_nvtiff_cu12-0.7.0.79-py3-none-manylinux2014_x86_64.whl", hash = "sha256:a406083e99027e43dd5f860926ef0a3d3acfa617aaafd5e19a3ccfd58e89508b", size = 3860492 },
-    { url = "https://files.pythonhosted.org/packages/00/9b/ec9b3c7bfe5aef7880a9c95426472fc1649d73c3004db4cd503294864f43/nvidia_nvtiff_cu12-0.7.0.79-py3-none-win_amd64.whl", hash = "sha256:d755aa8227721760792a9737b27087d71fb9177582a9df5fc908092a2068c3c0", size = 3390759 },
 ]
 
 [[package]]
@@ -927,7 +874,6 @@ sdist = { url = "https://files.pythonhosted.org/packages/92/dd/692765e87de30bae1
 wheels = [
     { url = "https://files.pythonhosted.org/packages/c2/07/698355285a03a366ef63ea9762fc1feef3f9f25483e1655408f72d827090/nvtx-0.2.15-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2cc530cd0f1a2c14a3a7e683833db509888ac5ed4ead94e5c9e2c7317c6937a7", size = 807159 },
     { url = "https://files.pythonhosted.org/packages/c0/d1/08f22448d83481408d663065764ba583df091a7de629ed38fc97e522f1af/nvtx-0.2.15-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3ca8030a6d197952318013dd1c12c22da1d4b9feb76ba72e0fcd449961183c2c", size = 806187 },
-    { url = "https://files.pythonhosted.org/packages/54/23/c97c39e3b7ba256aa343cb828ca0d1c8421f705ca84795658ecd14ca95ed/nvtx-0.2.15-cp312-cp312-win_amd64.whl", hash = "sha256:70a1e768964e0520b68ccabc4df391cc227537c45936a7eba6507bc65e617e00", size = 129178 },
 ]
 
 [[package]]
@@ -935,8 +881,8 @@ name = "omegaconf"
 version = "2.3.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "antlr4-python3-runtime" },
-    { name = "pyyaml" },
+    { name = "antlr4-python3-runtime", marker = "sys_platform == 'linux'" },
+    { name = "pyyaml", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/09/48/6388f1bb9da707110532cb70ec4d2822858ddfb44f1cdf1233c20a80ea4b/omegaconf-2.3.0.tar.gz", hash = "sha256:d5d4b6d29955cc50ad50c46dc269bcd92c6e00f5f90d23ab5fee7bfca4ba4cc7", size = 3298120 }
 wheels = [
@@ -957,21 +903,16 @@ name = "optree"
 version = "0.19.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "typing-extensions" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/3d/63/7b078bc36d5a206c21b03565a818ede38ff0fbf014e92085ec467ef10adb/optree-0.19.0.tar.gz", hash = "sha256:bc1991a948590756409e76be4e29efd4a487a185056d35db6c67619c19ea27a1", size = 175199 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/2d/bf/5cbbf61a27f94797c3d9786f6230223023a943b60f5e893d52368f10b8b1/optree-0.19.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7ec4b2ce49622c6be2c8634712b6c63cc274835bac89a56e3ab2ca863a32ff4b", size = 418100 },
-    { url = "https://files.pythonhosted.org/packages/00/9e/65899e6470f5df289ccdbe9e228fb0cd0ae45ccda8e32c92d6efae1530ef/optree-0.19.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f0978603623b4b1f794f05f6bbed0645cb7e219f4a5a349b2a2bd4514d84ac82", size = 388582 },
     { url = "https://files.pythonhosted.org/packages/d1/dc/f4826835be660181f1b4444ac92b51dda96d4634d3c2271e14598da7bf2a/optree-0.19.0-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8c9e52c50ed3f3f8b1cf4e47a20a7c5e77175b4f84b2ecf390a76f0d1dd91da6", size = 407457 },
     { url = "https://files.pythonhosted.org/packages/ce/b0/89283ac1dd1ead3aa3d7a6b45a26846f457bded79a83b6828fc1ed9a6db3/optree-0.19.0-cp312-cp312-manylinux_2_26_i686.manylinux_2_28_i686.whl", hash = "sha256:3fe3e5f7a30a7d08ddba0a34e48f5483f6c4d7bb710375434ad3633170c73c48", size = 471230 },
     { url = "https://files.pythonhosted.org/packages/2a/a2/47f620f87b0544b2e0eb0b3c661682bd0ea1c79f6e38f9147bc0f835c973/optree-0.19.0-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:8315527e1f14a91173fe6871847da7b949048ec61ff8b3e507fc286e75b0aa3c", size = 469442 },
     { url = "https://files.pythonhosted.org/packages/84/e9/b9ae18404135de53809fb994b754ac0eac838d8c4dfa8a10a811d8dec91d/optree-0.19.0-cp312-cp312-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:938fb15d140ab65148f4e6975048facbef83a9210353fbedd471ac39e7544339", size = 468840 },
     { url = "https://files.pythonhosted.org/packages/0a/e5/a77df15a62b37bb14c81b5757e2a0573f57e7c06d125a410ad2cd7cefb72/optree-0.19.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2b8209570340135a7e586c90f393f3c6359e8a49c40d783196721cc487e51d9c", size = 451408 },
     { url = "https://files.pythonhosted.org/packages/8c/43/1aa431cee19cd98c4229e468767021f9a92195d9431857e28198a3a3ce2f/optree-0.19.0-cp312-cp312-manylinux_2_39_riscv64.whl", hash = "sha256:1397dc925026917531a43fda32054ae1e77e5ed9bf8284bcae6354c19c26e14a", size = 412544 },
-    { url = "https://files.pythonhosted.org/packages/5b/b9/b94fd3a116b80951d692a82f4135ae84b3d78bd1b092250aff76a3366138/optree-0.19.0-cp312-cp312-win32.whl", hash = "sha256:68f58e8f8b75c76c51e61e3dc2d9e94609bafb0e1a6459e6d525ced905cd9a74", size = 312033 },
-    { url = "https://files.pythonhosted.org/packages/9e/7f/31fa1b2311038bfc355ad6e4e4e63d028719cb67fb3ebe6fb76ff2124105/optree-0.19.0-cp312-cp312-win_amd64.whl", hash = "sha256:5c44ca0f579ed3e0ca777a5711d4a6c1b374feacf1bb4fe9cfe85297b0c8d237", size = 335374 },
-    { url = "https://files.pythonhosted.org/packages/09/86/863bc3f42f83113f5c6a5beaf4fec3c3481a76872f3244d0e64fb9ebd3b0/optree-0.19.0-cp312-cp312-win_arm64.whl", hash = "sha256:0461f796b4ade3fab519d821b0fa521f07e2af70206b76aac75fcfdc2e051fca", size = 345868 },
 ]
 
 [[package]]
@@ -997,20 +938,15 @@ name = "pandas"
 version = "3.0.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy" },
-    { name = "python-dateutil" },
-    { name = "tzdata", marker = "sys_platform == 'emscripten' or sys_platform == 'win32'" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "python-dateutil", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/da/99/b342345300f13440fe9fe385c3c481e2d9a595ee3bab4d3219247ac94e9a/pandas-3.0.2.tar.gz", hash = "sha256:f4753e73e34c8d83221ba58f232433fca2748be8b18dbca02d242ed153945043", size = 4645855 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/f3/b0/c20bd4d6d3f736e6bd6b55794e9cd0a617b858eaad27c8f410ea05d953b7/pandas-3.0.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:232a70ebb568c0c4d2db4584f338c1577d81e3af63292208d615907b698a0f18", size = 10347921 },
-    { url = "https://files.pythonhosted.org/packages/35/d0/4831af68ce30cc2d03c697bea8450e3225a835ef497d0d70f31b8cdde965/pandas-3.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:970762605cff1ca0d3f71ed4f3a769ea8f85fc8e6348f6e110b8fea7e6eb5a14", size = 9888127 },
     { url = "https://files.pythonhosted.org/packages/61/a9/16ea9346e1fc4a96e2896242d9bc674764fb9049b0044c0132502f7a771e/pandas-3.0.2-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aff4e6f4d722e0652707d7bcb190c445fe58428500c6d16005b02401764b1b3d", size = 10399577 },
     { url = "https://files.pythonhosted.org/packages/c4/a8/3a61a721472959ab0ce865ef05d10b0d6bfe27ce8801c99f33d4fa996e65/pandas-3.0.2-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ef8b27695c3d3dc78403c9a7d5e59a62d5464a7e1123b4e0042763f7104dc74f", size = 10880030 },
     { url = "https://files.pythonhosted.org/packages/da/65/7225c0ea4d6ce9cb2160a7fb7f39804871049f016e74782e5dade4d14109/pandas-3.0.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:f8d68083e49e16b84734eb1a4dcae4259a75c90fb6e2251ab9a00b61120c06ab", size = 11409468 },
     { url = "https://files.pythonhosted.org/packages/fa/5b/46e7c76032639f2132359b5cf4c785dd8cf9aea5ea64699eac752f02b9db/pandas-3.0.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:32cc41f310ebd4a296d93515fcac312216adfedb1894e879303987b8f1e2b97d", size = 11936381 },
-    { url = "https://files.pythonhosted.org/packages/7b/8b/721a9cff6fa6a91b162eb51019c6243b82b3226c71bb6c8ef4a9bd65cbc6/pandas-3.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:a4785e1d6547d8427c5208b748ae2efb64659a21bd82bf440d4262d02bfa02a4", size = 9744993 },
-    { url = "https://files.pythonhosted.org/packages/d5/18/7f0bd34ae27b28159aa80f2a6799f47fda34f7fb938a76e20c7b7fe3b200/pandas-3.0.2-cp312-cp312-win_arm64.whl", hash = "sha256:08504503f7101300107ecdc8df73658e4347586db5cfdadabc1592e9d7e7a0fd", size = 9056118 },
 ]
 
 [[package]]
@@ -1019,17 +955,12 @@ version = "12.2.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/8c/21/c2bcdd5906101a30244eaffc1b6e6ce71a31bd0742a01eb89e660ebfac2d/pillow-12.2.0.tar.gz", hash = "sha256:a830b1a40919539d07806aa58e1b114df53ddd43213d9c8b75847eee6c0182b5", size = 46987819 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/58/be/7482c8a5ebebbc6470b3eb791812fff7d5e0216c2be3827b30b8bb6603ed/pillow-12.2.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2d192a155bbcec180f8564f693e6fd9bccff5a7af9b32e2e4bf8c9c69dbad6b5", size = 5308279 },
-    { url = "https://files.pythonhosted.org/packages/d8/95/0a351b9289c2b5cbde0bacd4a83ebc44023e835490a727b2a3bd60ddc0f4/pillow-12.2.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:f3f40b3c5a968281fd507d519e444c35f0ff171237f4fdde090dd60699458421", size = 4695490 },
     { url = "https://files.pythonhosted.org/packages/de/af/4e8e6869cbed569d43c416fad3dc4ecb944cb5d9492defaed89ddd6fe871/pillow-12.2.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:03e7e372d5240cc23e9f07deca4d775c0817bffc641b01e9c3af208dbd300987", size = 6284462 },
     { url = "https://files.pythonhosted.org/packages/e9/9e/c05e19657fd57841e476be1ab46c4d501bffbadbafdc31a6d665f8b737b6/pillow-12.2.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b86024e52a1b269467a802258c25521e6d742349d760728092e1bc2d135b4d76", size = 8094744 },
     { url = "https://files.pythonhosted.org/packages/2b/54/1789c455ed10176066b6e7e6da1b01e50e36f94ba584dc68d9eebfe9156d/pillow-12.2.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7371b48c4fa448d20d2714c9a1f775a81155050d383333e0a6c15b1123dda005", size = 6398371 },
     { url = "https://files.pythonhosted.org/packages/43/e3/fdc657359e919462369869f1c9f0e973f353f9a9ee295a39b1fea8ee1a77/pillow-12.2.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:62f5409336adb0663b7caa0da5c7d9e7bdbaae9ce761d34669420c2a801b2780", size = 7087215 },
     { url = "https://files.pythonhosted.org/packages/8b/f8/2f6825e441d5b1959d2ca5adec984210f1ec086435b0ed5f52c19b3b8a6e/pillow-12.2.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:01afa7cf67f74f09523699b4e88c73fb55c13346d212a59a2db1f86b0a63e8c5", size = 6509783 },
     { url = "https://files.pythonhosted.org/packages/67/f9/029a27095ad20f854f9dba026b3ea6428548316e057e6fc3545409e86651/pillow-12.2.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:fc3d34d4a8fbec3e88a79b92e5465e0f9b842b628675850d860b8bd300b159f5", size = 7212112 },
-    { url = "https://files.pythonhosted.org/packages/be/42/025cfe05d1be22dbfdb4f264fe9de1ccda83f66e4fc3aac94748e784af04/pillow-12.2.0-cp312-cp312-win32.whl", hash = "sha256:58f62cc0f00fd29e64b29f4fd923ffdb3859c9f9e6105bfc37ba1d08994e8940", size = 6378489 },
-    { url = "https://files.pythonhosted.org/packages/5d/7b/25a221d2c761c6a8ae21bfa3874988ff2583e19cf8a27bf2fee358df7942/pillow-12.2.0-cp312-cp312-win_amd64.whl", hash = "sha256:7f84204dee22a783350679a0333981df803dac21a0190d706a50475e361c93f5", size = 7084129 },
-    { url = "https://files.pythonhosted.org/packages/10/e1/542a474affab20fd4a0f1836cb234e8493519da6b76899e30bcc5d990b8b/pillow-12.2.0-cp312-cp312-win_arm64.whl", hash = "sha256:af73337013e0b3b46f175e79492d96845b16126ddf79c438d7ea7ff27783a414", size = 2463612 },
 ]
 
 [[package]]
@@ -1047,12 +978,9 @@ version = "7.34.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/6b/6b/a0e95cad1ad7cc3f2c6821fcab91671bd5b78bd42afb357bb4765f29bc41/protobuf-7.34.1.tar.gz", hash = "sha256:9ce42245e704cc5027be797c1db1eb93184d44d1cdd71811fb2d9b25ad541280", size = 454708 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ec/11/3325d41e6ee15bf1125654301211247b042563bcc898784351252549a8ad/protobuf-7.34.1-cp310-abi3-macosx_10_9_universal2.whl", hash = "sha256:d8b2cc79c4d8f62b293ad9b11ec3aebce9af481fa73e64556969f7345ebf9fc7", size = 429247 },
     { url = "https://files.pythonhosted.org/packages/eb/9d/aa69df2724ff63efa6f72307b483ce0827f4347cc6d6df24b59e26659fef/protobuf-7.34.1-cp310-abi3-manylinux2014_aarch64.whl", hash = "sha256:5185e0e948d07abe94bb76ec9b8416b604cfe5da6f871d67aad30cbf24c3110b", size = 325753 },
     { url = "https://files.pythonhosted.org/packages/92/e8/d174c91fd48e50101943f042b09af9029064810b734e4160bbe282fa1caa/protobuf-7.34.1-cp310-abi3-manylinux2014_s390x.whl", hash = "sha256:403b093a6e28a960372b44e5eb081775c9b056e816a8029c61231743d63f881a", size = 340198 },
     { url = "https://files.pythonhosted.org/packages/53/1b/3b431694a4dc6d37b9f653f0c64b0a0d9ec074ee810710c0c3da21d67ba7/protobuf-7.34.1-cp310-abi3-manylinux2014_x86_64.whl", hash = "sha256:8ff40ce8cd688f7265326b38d5a1bed9bfdf5e6723d49961432f83e21d5713e4", size = 324267 },
-    { url = "https://files.pythonhosted.org/packages/85/29/64de04a0ac142fb685fd09999bc3d337943fb386f3a0ec57f92fd8203f97/protobuf-7.34.1-cp310-abi3-win32.whl", hash = "sha256:34b84ce27680df7cca9f231043ada0daa55d0c44a2ddfaa58ec1d0d89d8bf60a", size = 426628 },
-    { url = "https://files.pythonhosted.org/packages/4d/87/cb5e585192a22b8bd457df5a2c16a75ea0db9674c3a0a39fc9347d84e075/protobuf-7.34.1-cp310-abi3-win_amd64.whl", hash = "sha256:e97b55646e6ce5cbb0954a8c28cd39a5869b59090dfaa7df4598a7fba869468c", size = 437901 },
     { url = "https://files.pythonhosted.org/packages/88/95/608f665226bca68b736b79e457fded9a2a38c4f4379a4a7614303d9db3bc/protobuf-7.34.1-py3-none-any.whl", hash = "sha256:bb3812cd53aefea2b028ef42bd780f5b96407247f20c6ef7c679807e9d188f11", size = 170715 },
 ]
 
@@ -1062,14 +990,10 @@ version = "7.2.2"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/aa/c6/d1ddf4abb55e93cebc4f2ed8b5d6dbad109ecb8d63748dd2b20ab5e57ebe/psutil-7.2.2.tar.gz", hash = "sha256:0746f5f8d406af344fd547f1c8daa5f5c33dbc293bb8d6a16d80b4bb88f59372", size = 493740 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/e7/36/5ee6e05c9bd427237b11b3937ad82bb8ad2752d72c6969314590dd0c2f6e/psutil-7.2.2-cp36-abi3-macosx_10_9_x86_64.whl", hash = "sha256:ed0cace939114f62738d808fdcecd4c869222507e266e574799e9c0faa17d486", size = 129090 },
-    { url = "https://files.pythonhosted.org/packages/80/c4/f5af4c1ca8c1eeb2e92ccca14ce8effdeec651d5ab6053c589b074eda6e1/psutil-7.2.2-cp36-abi3-macosx_11_0_arm64.whl", hash = "sha256:1a7b04c10f32cc88ab39cbf606e117fd74721c831c98a27dc04578deb0c16979", size = 129859 },
     { url = "https://files.pythonhosted.org/packages/b5/70/5d8df3b09e25bce090399cf48e452d25c935ab72dad19406c77f4e828045/psutil-7.2.2-cp36-abi3-manylinux2010_x86_64.manylinux_2_12_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:076a2d2f923fd4821644f5ba89f059523da90dc9014e85f8e45a5774ca5bc6f9", size = 155560 },
     { url = "https://files.pythonhosted.org/packages/63/65/37648c0c158dc222aba51c089eb3bdfa238e621674dc42d48706e639204f/psutil-7.2.2-cp36-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b0726cecd84f9474419d67252add4ac0cd9811b04d61123054b9fb6f57df6e9e", size = 156997 },
     { url = "https://files.pythonhosted.org/packages/8e/13/125093eadae863ce03c6ffdbae9929430d116a246ef69866dad94da3bfbc/psutil-7.2.2-cp36-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:fd04ef36b4a6d599bbdb225dd1d3f51e00105f6d48a28f006da7f9822f2606d8", size = 148972 },
     { url = "https://files.pythonhosted.org/packages/04/78/0acd37ca84ce3ddffaa92ef0f571e073faa6d8ff1f0559ab1272188ea2be/psutil-7.2.2-cp36-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b58fabe35e80b264a4e3bb23e6b96f9e45a3df7fb7eed419ac0e5947c61e47cc", size = 148266 },
-    { url = "https://files.pythonhosted.org/packages/b4/90/e2159492b5426be0c1fef7acba807a03511f97c5f86b3caeda6ad92351a7/psutil-7.2.2-cp37-abi3-win_amd64.whl", hash = "sha256:eb7e81434c8d223ec4a219b5fc1c47d0417b12be7ea866e24fb5ad6e84b3d988", size = 137737 },
-    { url = "https://files.pythonhosted.org/packages/8c/c7/7bb2e321574b10df20cbde462a94e2b71d05f9bbda251ef27d104668306a/psutil-7.2.2-cp37-abi3-win_arm64.whl", hash = "sha256:8c233660f575a5a89e6d4cb65d9f938126312bca76d8fe087b947b3a1aaac9ee", size = 134617 },
 ]
 
 [[package]]
@@ -1078,13 +1002,10 @@ version = "23.0.1"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/88/22/134986a4cc224d593c1afde5494d18ff629393d74cc2eddb176669f234a4/pyarrow-23.0.1.tar.gz", hash = "sha256:b8c5873e33440b2bc2f4a79d2b47017a89c5a24116c055625e6f2ee50523f019", size = 1167336 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/9a/4b/4166bb5abbfe6f750fc60ad337c43ecf61340fa52ab386da6e8dbf9e63c4/pyarrow-23.0.1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:f4b0dbfa124c0bb161f8b5ebb40f1a680b70279aa0c9901d44a2b5a20806039f", size = 34214575 },
-    { url = "https://files.pythonhosted.org/packages/e1/da/3f941e3734ac8088ea588b53e860baeddac8323ea40ce22e3d0baa865cc9/pyarrow-23.0.1-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:7707d2b6673f7de054e2e83d59f9e805939038eebe1763fe811ee8fa5c0cd1a7", size = 35832540 },
     { url = "https://files.pythonhosted.org/packages/88/7c/3d841c366620e906d54430817531b877ba646310296df42ef697308c2705/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:86ff03fb9f1a320266e0de855dee4b17da6794c595d207f89bba40d16b5c78b9", size = 44470940 },
     { url = "https://files.pythonhosted.org/packages/2c/a5/da83046273d990f256cb79796a190bbf7ec999269705ddc609403f8c6b06/pyarrow-23.0.1-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:813d99f31275919c383aab17f0f455a04f5a429c261cc411b1e9a8f5e4aaaa05", size = 47586063 },
     { url = "https://files.pythonhosted.org/packages/5b/3c/b7d2ebcff47a514f47f9da1e74b7949138c58cfeb108cdd4ee62f43f0cf3/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bf5842f960cddd2ef757d486041d57c96483efc295a8c4a0e20e704cbbf39c67", size = 48173045 },
     { url = "https://files.pythonhosted.org/packages/43/b2/b40961262213beaba6acfc88698eb773dfce32ecdf34d19291db94c2bd73/pyarrow-23.0.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:564baf97c858ecc03ec01a41062e8f4698abc3e6e2acd79c01c2e97880a19730", size = 50621741 },
-    { url = "https://files.pythonhosted.org/packages/f6/70/1fdda42d65b28b078e93d75d371b2185a61da89dda4def8ba6ba41ebdeb4/pyarrow-23.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:07deae7783782ac7250989a7b2ecde9b3c343a643f82e8a4df03d93b633006f0", size = 27620678 },
 ]
 
 [[package]]
@@ -1101,10 +1022,10 @@ name = "pydantic"
 version = "2.13.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "annotated-types" },
-    { name = "pydantic-core" },
-    { name = "typing-extensions" },
-    { name = "typing-inspection" },
+    { name = "annotated-types", marker = "sys_platform == 'linux'" },
+    { name = "pydantic-core", marker = "sys_platform == 'linux'" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
+    { name = "typing-inspection", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/09/e5/06d23afac9973109d1e3c8ad38e1547a12e860610e327c05ee686827dc37/pydantic-2.13.2.tar.gz", hash = "sha256:b418196607e61081c3226dcd4f0672f2a194828abb9109e9cfb84026564df2d1", size = 843836 }
 wheels = [
@@ -1116,12 +1037,10 @@ name = "pydantic-core"
 version = "2.46.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "typing-extensions" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/43/bb/4742f05b739b2478459bb16fa8470549518c802e06ddcf3f106c5081315e/pydantic_core-2.46.2.tar.gz", hash = "sha256:37bb079f9ee3f1a519392b73fda2a96379b31f2013c6b467fe693e7f2987f596", size = 471269 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/97/ec/2fafa4c86f5d2a69372c7cddef30925fd0e370b1efaf556609c1a0196d8a/pydantic_core-2.46.2-cp312-cp312-macosx_10_12_x86_64.whl", hash = "sha256:ea1ad8c89da31512fe2d249cf0638fb666925bda341901541bc5f3311c6fcc9e", size = 2101729 },
-    { url = "https://files.pythonhosted.org/packages/cf/55/be5386c2c4b49af346e8a26b748194ff25757bbb6cf544130854e997af7a/pydantic_core-2.46.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b308da17b92481e0587244631c5529e5d91d04cb2b08194825627b1eca28e21e", size = 1951546 },
     { url = "https://files.pythonhosted.org/packages/29/92/89e273a055ce440e6636c756379af35ad86da9d336a560049c3ba5e41c80/pydantic_core-2.46.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d333a50bdd814a917d8d6a7ee35ba2395d53ddaa882613bc24e54a9d8b129095", size = 1976178 },
     { url = "https://files.pythonhosted.org/packages/91/b3/e4664469cf70c0cb0f7b2f5719d64e5968bb6f38217042c2afa3d3c4ba17/pydantic_core-2.46.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1d00b99590c5bd1fabbc5d28b170923e32c1b1071b1f1de1851a4d14d89eb192", size = 2051697 },
     { url = "https://files.pythonhosted.org/packages/98/58/dbf68213ee06ce51cdd6d8c95f97980e646858c45bd96bd2dfb40433be73/pydantic_core-2.46.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9f0e686960ffe9e65066395af856ac2d52c159043144433602c50c221d81c1ba", size = 2233160 },
@@ -1132,9 +1051,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/78/6b/685b98a834d5e3d1c34a1bde1627525559dd223b75075bc7490cdb24eb33/pydantic_core-2.46.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:b839d5c802e31348b949b6473f8190cddbf7d47475856d8ac995a373ee16ec59", size = 2186842 },
     { url = "https://files.pythonhosted.org/packages/22/64/caa2f5a2ac8b6113adaa410ccdf31ba7f54897a6e54cd0d726fc7e780c88/pydantic_core-2.46.2-cp312-cp312-musllinux_1_1_armv7l.whl", hash = "sha256:c6b1064f3f9cf9072e1d59dd2936f9f3b668bec1c37039708c9222db703c0d5b", size = 2336066 },
     { url = "https://files.pythonhosted.org/packages/ee/f9/7d2701bf82945b5b9e7df8347be97ef6a36da2846bfe5b4afec299ffe27b/pydantic_core-2.46.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:37a68e6f2ac95578ce3c0564802404b27b24988649616e556c07e77111ed3f1d", size = 2363691 },
-    { url = "https://files.pythonhosted.org/packages/3b/65/0dab11574101522941055109419db3cc09db871643dc3fc74e2413215e5b/pydantic_core-2.46.2-cp312-cp312-win32.whl", hash = "sha256:d9ffa75a7ef4b97d6e5e205fabd4304ef01fec09e6f1bdde04b9ad1b07d20289", size = 1958801 },
-    { url = "https://files.pythonhosted.org/packages/13/2b/df84baa609c676f6450b8ecad44ea59146c805e3371b7b52443c0899f989/pydantic_core-2.46.2-cp312-cp312-win_amd64.whl", hash = "sha256:0551f2d2ddb68af5a00e26497f8025c538f73ef3cb698f8e5a487042cd2792a8", size = 2072634 },
-    { url = "https://files.pythonhosted.org/packages/d1/4e/e1ce8029fc438086a946739bf9d596f70ff470aad4a8345555920618cabe/pydantic_core-2.46.2-cp312-cp312-win_arm64.whl", hash = "sha256:83aef30f106edcc21a6a4cc44b82d3169a1dbe255508db788e778f3c804d3583", size = 2026188 },
 ]
 
 [[package]]
@@ -1160,11 +1076,10 @@ name = "pytest"
 version = "9.0.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "colorama", marker = "sys_platform == 'win32'" },
-    { name = "iniconfig" },
-    { name = "packaging" },
-    { name = "pluggy" },
-    { name = "pygments" },
+    { name = "iniconfig", marker = "sys_platform == 'linux'" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
+    { name = "pluggy", marker = "sys_platform == 'linux'" },
+    { name = "pygments", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/7d/0d/549bd94f1a0a402dc8cf64563a117c0f3765662e2e668477624baeec44d5/pytest-9.0.3.tar.gz", hash = "sha256:b86ada508af81d19edeb213c681b1d48246c1a91d304c6c81a427674c17eb91c", size = 1572165 }
 wheels = [
@@ -1176,7 +1091,7 @@ name = "pytest-timeout"
 version = "2.4.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "pytest" },
+    { name = "pytest", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/ac/82/4c9ecabab13363e72d880f2fb504c5f750433b2b6f16e99f4ec21ada284c/pytest_timeout-2.4.0.tar.gz", hash = "sha256:7e68e90b01f9eff71332b25001f85c75495fc4e3a836701876183c4bcfd0540a", size = 17973 }
 wheels = [
@@ -1188,8 +1103,8 @@ name = "pytest-xdist"
 version = "3.8.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "execnet" },
-    { name = "pytest" },
+    { name = "execnet", marker = "sys_platform == 'linux'" },
+    { name = "pytest", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/78/b4/439b179d1ff526791eb921115fca8e44e596a13efeda518b9d845a619450/pytest_xdist-3.8.0.tar.gz", hash = "sha256:7e578125ec9bc6050861aa93f2d59f1d8d085595d6551c2c90b6f4fad8d3a9f1", size = 88069 }
 wheels = [
@@ -1201,7 +1116,7 @@ name = "python-dateutil"
 version = "2.9.0.post0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "six" },
+    { name = "six", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/66/c0/0c8b6ad9f17a802ee498c46e004a0eb49bc148f2fd230864601a86dcf6db/python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3", size = 342432 }
 wheels = [
@@ -1214,16 +1129,11 @@ version = "6.0.3"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/05/8e/961c0007c59b8dd7729d542c61a4d537767a59645b82a0b521206e1e25c2/pyyaml-6.0.3.tar.gz", hash = "sha256:d76623373421df22fb4cf8817020cbb7ef15c725b9d5e45f17e189bfc384190f", size = 130960 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/d1/33/422b98d2195232ca1826284a76852ad5a86fe23e31b009c9886b2d0fb8b2/pyyaml-6.0.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7f047e29dcae44602496db43be01ad42fc6f1cc0d8cd6c83d342306c32270196", size = 182063 },
-    { url = "https://files.pythonhosted.org/packages/89/a0/6cf41a19a1f2f3feab0e9c0b74134aa2ce6849093d5517a0c550fe37a648/pyyaml-6.0.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:fc09d0aa354569bc501d4e787133afc08552722d3ab34836a80547331bb5d4a0", size = 173973 },
     { url = "https://files.pythonhosted.org/packages/ed/23/7a778b6bd0b9a8039df8b1b1d80e2e2ad78aa04171592c8a5c43a56a6af4/pyyaml-6.0.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9149cad251584d5fb4981be1ecde53a1ca46c891a79788c0df828d2f166bda28", size = 775116 },
     { url = "https://files.pythonhosted.org/packages/65/30/d7353c338e12baef4ecc1b09e877c1970bd3382789c159b4f89d6a70dc09/pyyaml-6.0.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5fdec68f91a0c6739b380c83b951e2c72ac0197ace422360e6d5a959d8d97b2c", size = 844011 },
     { url = "https://files.pythonhosted.org/packages/8b/9d/b3589d3877982d4f2329302ef98a8026e7f4443c765c46cfecc8858c6b4b/pyyaml-6.0.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ba1cc08a7ccde2d2ec775841541641e4548226580ab850948cbfda66a1befcdc", size = 807870 },
     { url = "https://files.pythonhosted.org/packages/05/c0/b3be26a015601b822b97d9149ff8cb5ead58c66f981e04fedf4e762f4bd4/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:8dc52c23056b9ddd46818a57b78404882310fb473d63f17b07d5c40421e47f8e", size = 761089 },
     { url = "https://files.pythonhosted.org/packages/be/8e/98435a21d1d4b46590d5459a22d88128103f8da4c2d4cb8f14f2a96504e1/pyyaml-6.0.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:41715c910c881bc081f1e8872880d3c650acf13dfa8214bad49ed4cede7c34ea", size = 790181 },
-    { url = "https://files.pythonhosted.org/packages/74/93/7baea19427dcfbe1e5a372d81473250b379f04b1bd3c4c5ff825e2327202/pyyaml-6.0.3-cp312-cp312-win32.whl", hash = "sha256:96b533f0e99f6579b3d4d4995707cf36df9100d67e0c8303a0c55b27b5f99bc5", size = 137658 },
-    { url = "https://files.pythonhosted.org/packages/86/bf/899e81e4cce32febab4fb42bb97dcdf66bc135272882d1987881a4b519e9/pyyaml-6.0.3-cp312-cp312-win_amd64.whl", hash = "sha256:5fcd34e47f6e0b794d17de1b4ff496c00986e1c83f7ab2fb8fcfe9616ff7477b", size = 154003 },
-    { url = "https://files.pythonhosted.org/packages/1a/08/67bd04656199bbb51dbed1439b7f27601dfb576fb864099c7ef0c3e55531/pyyaml-6.0.3-cp312-cp312-win_arm64.whl", hash = "sha256:64386e5e707d03a7e172c0701abfb7e10f0fb753ee1d773128192742712a98fd", size = 140344 },
 ]
 
 [[package]]
@@ -1231,10 +1141,10 @@ name = "requests"
 version = "2.33.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "certifi" },
-    { name = "charset-normalizer" },
-    { name = "idna" },
-    { name = "urllib3" },
+    { name = "certifi", marker = "sys_platform == 'linux'" },
+    { name = "charset-normalizer", marker = "sys_platform == 'linux'" },
+    { name = "idna", marker = "sys_platform == 'linux'" },
+    { name = "urllib3", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/5f/a4/98b9c7c6428a668bf7e42ebb7c79d576a1c3c1e3ae2d47e674b468388871/requests-2.33.1.tar.gz", hash = "sha256:18817f8c57c6263968bc123d237e3b8b08ac046f5456bd1e307ee8f4250d3517", size = 134120 }
 wheels = [
@@ -1246,8 +1156,8 @@ name = "rich"
 version = "15.0.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "markdown-it-py" },
-    { name = "pygments" },
+    { name = "markdown-it-py", marker = "sys_platform == 'linux'" },
+    { name = "pygments", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/c0/8f/0722ca900cc807c13a6a0c696dacf35430f72e0ec571c4275d2371fca3e9/rich-15.0.0.tar.gz", hash = "sha256:edd07a4824c6b40189fb7ac9bc4c52536e9780fbbfbddf6f1e2502c31b068c36", size = 230680 }
 wheels = [
@@ -1256,15 +1166,15 @@ wheels = [
 
 [[package]]
 name = "s3dlio"
-version = "0.9.95"
+version = "0.9.100"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/13/bf/b17bf94e1fd7c58b2f93d53192b61271f14538b847d98fd40ef2cc652d61/s3dlio-0.9.95.tar.gz", hash = "sha256:55f79071d244cccf7a49714c33c024639a24723dd88c7cac629c63daa89d0d96", size = 1481201 }
+sdist = { url = "https://files.pythonhosted.org/packages/33/98/23ed0451a8668e352206dea740920d85dceefadf0a6d427d1571d17e845e/s3dlio-0.9.100.tar.gz", hash = "sha256:b2d3dc9f037bcef5e2e171ab1988c1be730849730bee6570f484eb0f02c9a862", size = 1564701 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/7c/c3/502a898baa514cf796f11572508f3a78a93574d45ce7d36bcd34e2e7fe40/s3dlio-0.9.95-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:93d4f6d929e743a74428d4a6e944fbb85bd6a9cfffbdc36d6635e89f0919a5ba", size = 10258346 },
-    { url = "https://files.pythonhosted.org/packages/91/4f/d394679708a4fb7c0f362076b7f92a0933201d258a90b6b28f0529dacf98/s3dlio-0.9.95-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9dd5f1d71c3655346a879a5c3e49142c3d916a6df3505a823f983b0b1abb5bd5", size = 10613865 },
+    { url = "https://files.pythonhosted.org/packages/3b/80/e7a16ae10aa9374b29ae7dc175eaba3910f604c2f2d2ae8955488a13c821/s3dlio-0.9.100-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:090f61effc0eec32a876a62a921287961e92aec57eb0f21449bf5a89d9e9ada2", size = 12416760 },
+    { url = "https://files.pythonhosted.org/packages/ce/38/44ad05689f5f66e503eb095b442f37271e74bde1948fadf1312284173ae3/s3dlio-0.9.100-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:eb48f3d295071b5226ad6062544003abaa2defadac695424a015db04126f5d57", size = 12842294 },
 ]
 
 [[package]]
@@ -1272,8 +1182,8 @@ name = "s3torchconnector"
 version = "1.5.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "s3torchconnectorclient" },
-    { name = "torch" },
+    { name = "s3torchconnectorclient", marker = "sys_platform == 'linux'" },
+    { name = "torch", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/0f/24/a3422bc7e3d8f2a55a64250a6d5a07416c49d6f5695879445ff72c695612/s3torchconnector-1.5.0.tar.gz", hash = "sha256:44167d8e7bc0fce6d97627fc10aa7e215f4b58e0bb7037e87858c41eefd5b5af", size = 103050 }
 
@@ -1283,8 +1193,6 @@ version = "1.5.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/a5/8d/e04febe3e7ff7c91bc4678a16bec1c87674fc9c160c75a8f8745e516e563/s3torchconnectorclient-1.5.0.tar.gz", hash = "sha256:09ffceca1fd025abd8a4a4cbd94b3f70a7c8ccfbf3e0f76337e180f95ce58e61", size = 85516 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ca/ca/65c66f2b4cc331f3d8fb92961f90edf8e9964fa6890ef7f335fbf9d7989f/s3torchconnectorclient-1.5.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:83ae3c096da011af6e57947d2530814a4f78935bf1336117547984da34e1cdec", size = 2124261 },
-    { url = "https://files.pythonhosted.org/packages/e6/20/629141bf19c24fedda41f9c710e55439d6303784cc1ca8e367367a51e08b/s3torchconnectorclient-1.5.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:1eba5cfc67d7e2bd3cd51400105288a979096cfb293c604d19cdd880f960c396", size = 2019312 },
     { url = "https://files.pythonhosted.org/packages/7d/51/288b8857991cffa36b833c7128897766fb84f3a4a60a5cc3dfe6e2546f8a/s3torchconnectorclient-1.5.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7c0d11b4da0271414ffa370718bbbfb5454dac2ad546d89c7c6c49831e2eb7e5", size = 3594664 },
     { url = "https://files.pythonhosted.org/packages/35/d3/9354e5620c3839393ff9afe2435f5e42bb63eb829edd93395cb0a3b1aa39/s3torchconnectorclient-1.5.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0f5277d76b4d1e12cd6f96823cf5911c51a7a614acbabb4ee4133d8caa332df1", size = 3747379 },
 ]
@@ -1303,9 +1211,9 @@ name = "setuptools-scm"
 version = "10.0.5"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "packaging" },
-    { name = "setuptools" },
-    { name = "vcs-versioning" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
+    { name = "setuptools", marker = "sys_platform == 'linux'" },
+    { name = "vcs-versioning", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/a5/b1/2a6a8ecd6f9e263754036a0b573360bdbd6873b595725e49e11139722041/setuptools_scm-10.0.5.tar.gz", hash = "sha256:bbba8fe754516cdefd017f4456721775e6ef9662bd7887fb52ae26813d4838c3", size = 56748 }
 wheels = [
@@ -1326,7 +1234,7 @@ name = "sympy"
 version = "1.14.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "mpmath" },
+    { name = "mpmath", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/83/d3/803453b36afefb7c2bb238361cd4ae6125a569b4db67cd9e79846ba2d68c/sympy-1.14.0.tar.gz", hash = "sha256:d3d3fe8df1e5a0b42f0e7bdf50541697dbe7d23746e894990c030e2b05e72517", size = 7793921 }
 wheels = [
@@ -1347,16 +1255,16 @@ name = "tensorboard"
 version = "2.20.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "absl-py" },
-    { name = "grpcio" },
-    { name = "markdown" },
-    { name = "numpy" },
-    { name = "packaging" },
-    { name = "pillow" },
-    { name = "protobuf" },
-    { name = "setuptools" },
-    { name = "tensorboard-data-server" },
-    { name = "werkzeug" },
+    { name = "absl-py", marker = "sys_platform == 'linux'" },
+    { name = "grpcio", marker = "sys_platform == 'linux'" },
+    { name = "markdown", marker = "sys_platform == 'linux'" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
+    { name = "pillow", marker = "sys_platform == 'linux'" },
+    { name = "protobuf", marker = "sys_platform == 'linux'" },
+    { name = "setuptools", marker = "sys_platform == 'linux'" },
+    { name = "tensorboard-data-server", marker = "sys_platform == 'linux'" },
+    { name = "werkzeug", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/9c/d9/a5db55f88f258ac669a92858b70a714bbbd5acd993820b41ec4a96a4d77f/tensorboard-2.20.0-py3-none-any.whl", hash = "sha256:9dc9f978cb84c0723acf9a345d96c184f0293d18f166bb8d59ee098e6cfaaba6", size = 5525680 },
@@ -1368,7 +1276,6 @@ version = "0.7.2"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/7a/13/e503968fefabd4c6b2650af21e110aa8466fe21432cd7c43a84577a89438/tensorboard_data_server-0.7.2-py3-none-any.whl", hash = "sha256:7e0610d205889588983836ec05dc098e80f97b7e7bbff7e994ebb78f578d0ddb", size = 2356 },
-    { url = "https://files.pythonhosted.org/packages/b7/85/dabeaf902892922777492e1d253bb7e1264cadce3cea932f7ff599e53fea/tensorboard_data_server-0.7.2-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:9fe5d24221b29625dbc7328b0436ca7fc1c23de4acf4d272f1180856e32f9f60", size = 4823598 },
     { url = "https://files.pythonhosted.org/packages/73/c6/825dab04195756cf8ff2e12698f22513b3db2f64925bdd41671bfb33aaa5/tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl", hash = "sha256:ef687163c24185ae9754ed5650eb5bc4d84ff257aabdc33f0cc6f74d8ba54530", size = 6590363 },
 ]
 
@@ -1377,33 +1284,31 @@ name = "tensorflow"
 version = "2.20.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "absl-py" },
-    { name = "astunparse" },
-    { name = "flatbuffers" },
-    { name = "gast" },
-    { name = "google-pasta" },
-    { name = "grpcio" },
-    { name = "h5py" },
-    { name = "keras" },
-    { name = "libclang" },
-    { name = "ml-dtypes" },
-    { name = "numpy" },
-    { name = "opt-einsum" },
-    { name = "packaging" },
-    { name = "protobuf" },
-    { name = "requests" },
-    { name = "setuptools" },
-    { name = "six" },
-    { name = "tensorboard" },
-    { name = "termcolor" },
-    { name = "typing-extensions" },
-    { name = "wrapt" },
-]
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/35/31/47712f425c09cc8b8dba39c6c45aee939c4636a6feb8c81376a4eae653e0/tensorflow-2.20.0-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:52b122f0232fd7ab10f28d537ce08470d0b6dcac7fff9685432daac7f8a06c8f", size = 200540302 },
+    { name = "absl-py", marker = "sys_platform == 'linux'" },
+    { name = "astunparse", marker = "sys_platform == 'linux'" },
+    { name = "flatbuffers", marker = "sys_platform == 'linux'" },
+    { name = "gast", marker = "sys_platform == 'linux'" },
+    { name = "google-pasta", marker = "sys_platform == 'linux'" },
+    { name = "grpcio", marker = "sys_platform == 'linux'" },
+    { name = "h5py", marker = "sys_platform == 'linux'" },
+    { name = "keras", marker = "sys_platform == 'linux'" },
+    { name = "libclang", marker = "sys_platform == 'linux'" },
+    { name = "ml-dtypes", marker = "sys_platform == 'linux'" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "opt-einsum", marker = "sys_platform == 'linux'" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
+    { name = "protobuf", marker = "sys_platform == 'linux'" },
+    { name = "requests", marker = "sys_platform == 'linux'" },
+    { name = "setuptools", marker = "sys_platform == 'linux'" },
+    { name = "six", marker = "sys_platform == 'linux'" },
+    { name = "tensorboard", marker = "sys_platform == 'linux'" },
+    { name = "termcolor", marker = "sys_platform == 'linux'" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
+    { name = "wrapt", marker = "sys_platform == 'linux'" },
+]
+wheels = [
     { url = "https://files.pythonhosted.org/packages/ec/b4/f028a5de27d0fda10ba6145bc76e40c37ff6d2d1e95b601adb5ae17d635e/tensorflow-2.20.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2bfbfb3dd0e22bffc45fe1e922390d27753e99261fab8a882e802cf98a0e078f", size = 259533109 },
     { url = "https://files.pythonhosted.org/packages/9c/d1/6aa15085d672056d5f08b5f28b1c7ce01c4e12149a23b0c98e3c79d04441/tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:25265b0bc527e0d54b1e9cc60c44a24f44a809fe27666b905f0466471f9c52ec", size = 620682547 },
-    { url = "https://files.pythonhosted.org/packages/f9/37/b97abb360b551fbf5870a0ee07e39ff9c655e6e3e2f839bc88be81361842/tensorflow-2.20.0-cp312-cp312-win_amd64.whl", hash = "sha256:1590cbf87b6bcbd34d8e9ad70d0c696135e0aa71be31803b27358cf7ed63f8fc", size = 331887041 },
 ]
 
 [[package]]
@@ -1422,24 +1327,22 @@ source = { registry = "https://pypi.org/simple" }
 dependencies = [
     { name = "cuda-bindings", marker = "sys_platform == 'linux'" },
     { name = "cuda-toolkit", extra = ["cublas", "cudart", "cufft", "cufile", "cupti", "curand", "cusolver", "cusparse", "nvjitlink", "nvrtc", "nvtx"], marker = "sys_platform == 'linux'" },
-    { name = "filelock" },
-    { name = "fsspec" },
-    { name = "jinja2" },
-    { name = "networkx" },
+    { name = "filelock", marker = "sys_platform == 'linux'" },
+    { name = "fsspec", marker = "sys_platform == 'linux'" },
+    { name = "jinja2", marker = "sys_platform == 'linux'" },
+    { name = "networkx", marker = "sys_platform == 'linux'" },
     { name = "nvidia-cudnn-cu13", marker = "sys_platform == 'linux'" },
     { name = "nvidia-cusparselt-cu13", marker = "sys_platform == 'linux'" },
     { name = "nvidia-nccl-cu13", marker = "sys_platform == 'linux'" },
     { name = "nvidia-nvshmem-cu13", marker = "sys_platform == 'linux'" },
-    { name = "setuptools" },
-    { name = "sympy" },
+    { name = "setuptools", marker = "sys_platform == 'linux'" },
+    { name = "sympy", marker = "sys_platform == 'linux'" },
     { name = "triton", marker = "sys_platform == 'linux'" },
-    { name = "typing-extensions" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/6f/8b/69e3008d78e5cee2b30183340cc425081b78afc5eff3d080daab0adda9aa/torch-2.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4b5866312ee6e52ea625cd211dcb97d6a2cdc1131a5f15cc0d87eec948f6dd34", size = 80606338 },
     { url = "https://files.pythonhosted.org/packages/13/16/42e5915ebe4868caa6bac83a8ed59db57f12e9a61b7d749d584776ed53d5/torch-2.11.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:f99924682ef0aa6a4ab3b1b76f40dc6e273fca09f367d15a524266db100a723f", size = 419731115 },
     { url = "https://files.pythonhosted.org/packages/1a/c9/82638ef24d7877510f83baf821f5619a61b45568ce21c0a87a91576510aa/torch-2.11.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:0f68f4ac6d95d12e896c3b7a912b5871619542ec54d3649cf48cc1edd4dd2756", size = 530712279 },
-    { url = "https://files.pythonhosted.org/packages/1c/ff/6756f1c7ee302f6d202120e0f4f05b432b839908f9071157302cedfc5232/torch-2.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:fbf39280699d1b869f55eac536deceaa1b60bd6788ba74f399cc67e60a5fab10", size = 114556047 },
 ]
 
 [[package]]
@@ -1447,10 +1350,8 @@ name = "torchaudio"
 version = "2.11.0"
 source = { registry = "https://pypi.org/simple" }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/f1/b1/77658817acacd01a72b714440c62f419efc4d90170e704e8e7a2c0918988/torchaudio-2.11.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a1cf1acc883bee9cb906a933572fed6a8a933f86ef34e9ea7d803f72317e8c1b", size = 684226 },
     { url = "https://files.pythonhosted.org/packages/78/28/c7adc053039f286c2aca0038b766cbe3294e66fec6b29a820e95128f9ede/torchaudio-2.11.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:bc653defca1c16154398517a1adc98d0fb7f1dd08e58ced217558d213c2c6e29", size = 1626670 },
     { url = "https://files.pythonhosted.org/packages/88/d8/d6d0f896e064aa67377484efef4911cdcc07bce2929474e1417cc0af18c2/torchaudio-2.11.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:6503c0bdb29daf2e6281bb70ea2dfe2c3553b782b619eb5d73bdadd8a3f7cecf", size = 1771992 },
-    { url = "https://files.pythonhosted.org/packages/23/a8/941277ecc39f7a0a169d554302a1f1afd87c1d94a8aec828891916cea59a/torchaudio-2.11.0-cp312-cp312-win_amd64.whl", hash = "sha256:478110f981e5d40a8d82221732c57a56c85a1d5895fb8fe646e86ee15eded3bd", size = 328663 },
 ]
 
 [[package]]
@@ -1458,15 +1359,13 @@ name = "torchvision"
 version = "0.26.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "numpy" },
-    { name = "pillow" },
-    { name = "torch" },
+    { name = "numpy", marker = "sys_platform == 'linux'" },
+    { name = "pillow", marker = "sys_platform == 'linux'" },
+    { name = "torch", marker = "sys_platform == 'linux'" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/ae/e7/56b47cc3b132aea90ccce22bcb8975dec688b002150012acc842846039d0/torchvision-0.26.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c409e1c3fdebec7a3834465086dbda8bf7680eff79abf7fd2f10c6b59520a7a4", size = 1863502 },
     { url = "https://files.pythonhosted.org/packages/f4/ec/5c31c92c08b65662fe9604a4067ae8232582805949f11ddc042cebe818ed/torchvision-0.26.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:406557718e62fdf10f5706e88d8a5ec000f872da913bf629aab9297622585547", size = 7767944 },
     { url = "https://files.pythonhosted.org/packages/f5/d8/cb6ccda1a1f35a6597645818641701207b3e8e13553e75fce5d86bac74b2/torchvision-0.26.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:d61a5abb6b42a0c0c311996c2ac4b83a94418a97182c83b055a2a4ae985e05aa", size = 7522205 },
-    { url = "https://files.pythonhosted.org/packages/1c/a9/c272623a0f735c35f0f6cd6dc74784d4f970e800cf063bb76687895a2ab9/torchvision-0.26.0-cp312-cp312-win_amd64.whl", hash = "sha256:7993c01648e7c61d191b018e84d38fe0825c8fcb2720cd0f37caf7ba14404aa1", size = 4255155 },
 ]
 
 [[package]]
@@ -1492,22 +1391,13 @@ name = "typing-inspection"
 version = "0.4.2"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "typing-extensions" },
+    { name = "typing-extensions", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/55/e3/70399cb7dd41c10ac53367ae42139cf4b1ca5f36bb3dc6c9d33acdb43655/typing_inspection-0.4.2.tar.gz", hash = "sha256:ba561c48a67c5958007083d386c3295464928b01faa735ab8547c5692e87f464", size = 75949 }
 wheels = [
     { url = "https://files.pythonhosted.org/packages/dc/9b/47798a6c91d8bdb567fe2698fe81e0c6b7cb7ef4d13da4114b41d239f65d/typing_inspection-0.4.2-py3-none-any.whl", hash = "sha256:4ed1cacbdc298c220f1bd249ed5287caa16f34d44ef4e9c3d0cbad5b521545e7", size = 14611 },
 ]
 
-[[package]]
-name = "tzdata"
-version = "2026.1"
-source = { registry = "https://pypi.org/simple" }
-sdist = { url = "https://files.pythonhosted.org/packages/19/f5/cd531b2d15a671a40c0f66cf06bc3570a12cd56eef98960068ebbad1bf5a/tzdata-2026.1.tar.gz", hash = "sha256:67658a1903c75917309e753fdc349ac0efd8c27db7a0cb406a25be4840f87f98", size = 197639 }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/b0/70/d460bd685a170790ec89317e9bd33047988e4bce507b831f5db771e142de/tzdata-2026.1-py2.py3-none-any.whl", hash = "sha256:4b1d2be7ac37ceafd7327b961aa3a54e467efbdb563a23655fbfe0d39cfc42a9", size = 348952 },
-]
-
 [[package]]
 name = "urllib3"
 version = "2.6.3"
@@ -1522,7 +1412,7 @@ name = "vcs-versioning"
 version = "1.1.1"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "packaging" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/49/42/d97a7795055677961c63a1eef8e7b19d5968ed992ed3a70ab8eb012efad8/vcs_versioning-1.1.1.tar.gz", hash = "sha256:fabd75a3cab7dd8ac02fe24a3a9ba936bf258667b5a62ed468c9a1da0f5775bc", size = 97575 }
 wheels = [
@@ -1534,7 +1424,7 @@ name = "werkzeug"
 version = "3.1.8"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "markupsafe" },
+    { name = "markupsafe", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/dd/b2/381be8cfdee792dd117872481b6e378f85c957dd7c5bca38897b08f765fd/werkzeug-3.1.8.tar.gz", hash = "sha256:9bad61a4268dac112f1c5cd4630a56ede601b6ed420300677a869083d70a4c44", size = 875852 }
 wheels = [
@@ -1546,7 +1436,7 @@ name = "wheel"
 version = "0.46.3"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "packaging" },
+    { name = "packaging", marker = "sys_platform == 'linux'" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/89/24/a2eb353a6edac9a0303977c4cb048134959dd2a51b48a269dfc9dde00c8a/wheel-0.46.3.tar.gz", hash = "sha256:e3e79874b07d776c40bd6033f8ddf76a7dad46a7b8aa1b2787a83083519a1803", size = 60605 }
 wheels = [
@@ -1559,17 +1449,12 @@ version = "2.1.2"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/2e/64/925f213fdcbb9baeb1530449ac71a4d57fc361c053d06bf78d0c5c7cd80c/wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e", size = 81678 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/4c/b6/1db817582c49c7fcbb7df6809d0f515af29d7c2fbf57eb44c36e98fb1492/wrapt-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ff2aad9c4cda28a8f0653fc2d487596458c2a3f475e56ba02909e950a9efa6a9", size = 61255 },
-    { url = "https://files.pythonhosted.org/packages/a2/16/9b02a6b99c09227c93cd4b73acc3678114154ec38da53043c0ddc1fba0dc/wrapt-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6433ea84e1cfacf32021d2a4ee909554ade7fd392caa6f7c13f1f4bf7b8e8748", size = 61848 },
     { url = "https://files.pythonhosted.org/packages/af/aa/ead46a88f9ec3a432a4832dfedb84092fc35af2d0ba40cd04aea3889f247/wrapt-2.1.2-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c20b757c268d30d6215916a5fa8461048d023865d888e437fab451139cad6c8e", size = 121433 },
     { url = "https://files.pythonhosted.org/packages/3a/9f/742c7c7cdf58b59085a1ee4b6c37b013f66ac33673a7ef4aaed5e992bc33/wrapt-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79847b83eb38e70d93dc392c7c5b587efe65b3e7afcc167aa8abd5d60e8761c8", size = 123013 },
     { url = "https://files.pythonhosted.org/packages/e8/44/2c3dd45d53236b7ed7c646fcf212251dc19e48e599debd3926b52310fafb/wrapt-2.1.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f8fba1bae256186a83d1875b2b1f4e2d1242e8fac0f58ec0d7e41b26967b965c", size = 117326 },
     { url = "https://files.pythonhosted.org/packages/74/e2/b17d66abc26bd96f89dec0ecd0ef03da4a1286e6ff793839ec431b9fae57/wrapt-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e3d3b35eedcf5f7d022291ecd7533321c4775f7b9cd0050a31a68499ba45757c", size = 121444 },
     { url = "https://files.pythonhosted.org/packages/3c/62/e2977843fdf9f03daf1586a0ff49060b1b2fc7ff85a7ea82b6217c1ae36e/wrapt-2.1.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:6f2c5390460de57fa9582bc8a1b7a6c86e1a41dfad74c5225fc07044c15cc8d1", size = 116237 },
     { url = "https://files.pythonhosted.org/packages/88/dd/27fc67914e68d740bce512f11734aec08696e6b17641fef8867c00c949fc/wrapt-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7dfa9f2cf65d027b951d05c662cc99ee3bd01f6e4691ed39848a7a5fffc902b2", size = 120563 },
-    { url = "https://files.pythonhosted.org/packages/ec/9f/b750b3692ed2ef4705cb305bd68858e73010492b80e43d2a4faa5573cbe7/wrapt-2.1.2-cp312-cp312-win32.whl", hash = "sha256:eba8155747eb2cae4a0b913d9ebd12a1db4d860fc4c829d7578c7b989bd3f2f0", size = 58198 },
-    { url = "https://files.pythonhosted.org/packages/8e/b2/feecfe29f28483d888d76a48f03c4c4d8afea944dbee2b0cd3380f9df032/wrapt-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1c51c738d7d9faa0b3601708e7e2eda9bf779e1b601dce6c77411f2a1b324a63", size = 60441 },
-    { url = "https://files.pythonhosted.org/packages/44/e1/e328f605d6e208547ea9fd120804fcdec68536ac748987a68c47c606eea8/wrapt-2.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:c8e46ae8e4032792eb2f677dbd0d557170a8e5524d22acc55199f43efedd39bf", size = 58836 },
     { url = "https://files.pythonhosted.org/packages/1a/c7/8528ac2dfa2c1e6708f647df7ae144ead13f0a31146f43c7264b4942bf12/wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8", size = 43993 },
 ]
 
@@ -1579,8 +1464,6 @@ version = "3.6.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/02/84/30869e01909fb37a6cc7e18688ee8bf1e42d57e7e0777636bd47524c43c7/xxhash-3.6.0.tar.gz", hash = "sha256:f0162a78b13a0d7617b2845b90c763339d1f1d82bb04a4b07f4ab535cc5e05d6", size = 85160 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/9a/07/d9412f3d7d462347e4511181dea65e47e0d0e16e26fbee2ea86a2aefb657/xxhash-3.6.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:01362c4331775398e7bb34e3ab403bc9ee9f7c497bc7dee6272114055277dd3c", size = 32744 },
-    { url = "https://files.pythonhosted.org/packages/79/35/0429ee11d035fc33abe32dca1b2b69e8c18d236547b9a9b72c1929189b9a/xxhash-3.6.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b7b2df81a23f8cb99656378e72501b2cb41b1827c0f5a86f87d6b06b69f9f204", size = 30816 },
     { url = "https://files.pythonhosted.org/packages/b7/f2/57eb99aa0f7d98624c0932c5b9a170e1806406cdbcdb510546634a1359e0/xxhash-3.6.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:dc94790144e66b14f67b10ac8ed75b39ca47536bf8800eb7c24b50271ea0c490", size = 194035 },
     { url = "https://files.pythonhosted.org/packages/4c/ed/6224ba353690d73af7a3f1c7cdb1fc1b002e38f783cb991ae338e1eb3d79/xxhash-3.6.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:93f107c673bccf0d592cdba077dedaf52fe7f42dcd7676eba1f6d6f0c3efffd2", size = 212914 },
     { url = "https://files.pythonhosted.org/packages/38/86/fb6b6130d8dd6b8942cc17ab4d90e223653a89aa32ad2776f8af7064ed13/xxhash-3.6.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2aa5ee3444c25b69813663c9f8067dcfaa2e126dc55e8dddf40f4d1c25d7effa", size = 212163 },
@@ -1591,9 +1474,6 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/58/ca/faa05ac19b3b622c7c9317ac3e23954187516298a091eb02c976d0d3dd45/xxhash-3.6.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:843b52f6d88071f87eba1631b684fcb4b2068cd2180a0224122fe4ef011a9374", size = 210655 },
     { url = "https://files.pythonhosted.org/packages/d4/7a/06aa7482345480cc0cb597f5c875b11a82c3953f534394f620b0be2f700c/xxhash-3.6.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7d14a6cfaf03b1b6f5f9790f76880601ccc7896aff7ab9cd8978a939c1eb7e0d", size = 414001 },
     { url = "https://files.pythonhosted.org/packages/23/07/63ffb386cd47029aa2916b3d2f454e6cc5b9f5c5ada3790377d5430084e7/xxhash-3.6.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:418daf3db71e1413cfe211c2f9a528456936645c17f46b5204705581a45390ae", size = 191431 },
-    { url = "https://files.pythonhosted.org/packages/0f/93/14fde614cadb4ddf5e7cebf8918b7e8fac5ae7861c1875964f17e678205c/xxhash-3.6.0-cp312-cp312-win32.whl", hash = "sha256:50fc255f39428a27299c20e280d6193d8b63b8ef8028995323bf834a026b4fbb", size = 30617 },
-    { url = "https://files.pythonhosted.org/packages/13/5d/0d125536cbe7565a83d06e43783389ecae0c0f2ed037b48ede185de477c0/xxhash-3.6.0-cp312-cp312-win_amd64.whl", hash = "sha256:c0f2ab8c715630565ab8991b536ecded9416d615538be8ecddce43ccf26cbc7c", size = 31534 },
-    { url = "https://files.pythonhosted.org/packages/54/85/6ec269b0952ec7e36ba019125982cf11d91256a778c7c3f98a4c5043d283/xxhash-3.6.0-cp312-cp312-win_arm64.whl", hash = "sha256:eae5c13f3bc455a3bbb68bdc513912dc7356de7e2280363ea235f71f54064829", size = 27876 },
 ]
 
 [[package]]
@@ -1602,8 +1482,6 @@ version = "0.25.0"
 source = { registry = "https://pypi.org/simple" }
 sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513 }
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/82/fc/f26eb6ef91ae723a03e16eddb198abcfce2bc5a42e224d44cc8b6765e57e/zstandard-0.25.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7b3c3a3ab9daa3eed242d6ecceead93aebbb8f5f84318d82cee643e019c4b73b", size = 795738 },
-    { url = "https://files.pythonhosted.org/packages/aa/1c/d920d64b22f8dd028a8b90e2d756e431a5d86194caa78e3819c7bf53b4b3/zstandard-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:913cbd31a400febff93b564a23e17c3ed2d56c064006f54efec210d586171c00", size = 640436 },
     { url = "https://files.pythonhosted.org/packages/53/6c/288c3f0bd9fcfe9ca41e2c2fbfd17b2097f6af57b62a81161941f09afa76/zstandard-0.25.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:011d388c76b11a0c165374ce660ce2c8efa8e5d87f34996aa80f9c0816698b64", size = 5343019 },
     { url = "https://files.pythonhosted.org/packages/1e/15/efef5a2f204a64bdb5571e6161d49f7ef0fffdbca953a615efbec045f60f/zstandard-0.25.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6dffecc361d079bb48d7caef5d673c88c8988d3d33fb74ab95b7ee6da42652ea", size = 5063012 },
     { url = "https://files.pythonhosted.org/packages/b7/37/a6ce629ffdb43959e92e87ebdaeebb5ac81c944b6a75c9c47e300f85abdf/zstandard-0.25.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:7149623bba7fdf7e7f24312953bcf73cae103db8cae49f8154dd1eadc8a29ecb", size = 5394148 },
@@ -1616,7 +1494,4 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/1e/73/316e4010de585ac798e154e88fd81bb16afc5c5cb1a72eeb16dd37e8024a/zstandard-0.25.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f27662e4f7dbf9f9c12391cb37b4c4c3cb90ffbd3b1fb9284dadbbb8935fa708", size = 5433517 },
     { url = "https://files.pythonhosted.org/packages/5b/60/dd0f8cfa8129c5a0ce3ea6b7f70be5b33d2618013a161e1ff26c2b39787c/zstandard-0.25.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:99c0c846e6e61718715a3c9437ccc625de26593fea60189567f0118dc9db7512", size = 5814292 },
     { url = "https://files.pythonhosted.org/packages/fc/5f/75aafd4b9d11b5407b641b8e41a57864097663699f23e9ad4dbb91dc6bfe/zstandard-0.25.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:474d2596a2dbc241a556e965fb76002c1ce655445e4e3bf38e5477d413165ffa", size = 5360237 },
-    { url = "https://files.pythonhosted.org/packages/ff/8d/0309daffea4fcac7981021dbf21cdb2e3427a9e76bafbcdbdf5392ff99a4/zstandard-0.25.0-cp312-cp312-win32.whl", hash = "sha256:23ebc8f17a03133b4426bcc04aabd68f8236eb78c3760f12783385171b0fd8bd", size = 436922 },
-    { url = "https://files.pythonhosted.org/packages/79/3b/fa54d9015f945330510cb5d0b0501e8253c127cca7ebe8ba46a965df18c5/zstandard-0.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffef5a74088f1e09947aecf91011136665152e0b4b359c42be3373897fb39b01", size = 506276 },
-    { url = "https://files.pythonhosted.org/packages/ea/6b/8b51697e5319b1f9ac71087b0af9a40d8a6288ff8025c36486e0c12abcc4/zstandard-0.25.0-cp312-cp312-win_arm64.whl", hash = "sha256:181eb40e0b6a29b3cd2849f825e0fa34397f649170673d385f3598ae17cca2e9", size = 462679 },
 ]