diff --git a/docs/OBJECT_STORAGE_GUIDE.md b/docs/OBJECT_STORAGE_GUIDE.md index 86ebf521..a726cceb 100644 --- a/docs/OBJECT_STORAGE_GUIDE.md +++ b/docs/OBJECT_STORAGE_GUIDE.md @@ -158,12 +158,84 @@ automatically injected (you do **not** need to pass them via `--params`): | `storage.storage_root` | `$BUCKET` | Bucket name from `.env` | | `storage.storage_options.storage_library` | `$STORAGE_LIBRARY` | Library (default: `s3dlio`) | | `storage.s3_force_path_style` | `true` | Required for non-AWS endpoints (path-style URLs) | +| `dataset.skip_listing` | `True` | Skip S3 object listing; generate URIs deterministically (see below) | +| `dataset.listing_validation_interval` | `1000` | HEAD-check every 1000th file to validate naming convention | > **Note**: These are only injected if not already present in `params_dict` > (existing `--params` overrides take precedence). --- +## skip_listing — Avoiding S3 Listing for Large Datasets + +### Why it matters + +On S3-compatible storage, discovering dataset files requires paginated +`list_objects_v2` calls: 50 M files = 50,000 pages × 100 ms/page = **83 minutes +minimum**, observed in the field as **12+ hours** on busy or WAN-connected +endpoints. + +mlp-storage enables `skip_listing=True` automatically for all `--object` runs. +Instead of listing, DLIO reconstructs file URIs from its own naming convention: + +``` +{file_prefix}_{index:0N}_of_{num_files}.{format} +``` + +Each MPI rank generates only its own shard — zero S3 API calls, zero MPI +communication. A sampling validation (rank 0) confirms the convention matches +by HEAD-checking the first file, last file, and every 1000th file. + +### Is the listing phase timed / does it affect the score? + +**No.** File discovery runs inside `DLIOBenchmark.initialize()`, which +completes before the scored benchmark window opens at `stats.start_run()` in +`run()`. AU and throughput scores are identical whether `skip_listing` is True +or False. This is intentional: the benchmark measures storage I/O throughput, +not metadata-API performance. + +For fair comparability, mlp-storage defaults `skip_listing=True` so no +submission is penalised by slow object-store metadata. + +### Requirements + +`skip_listing=True` only works when filenames follow DLIO's standard convention. +All data generated by `dlio_benchmark generate_data` satisfies this +automatically. If your dataset was generated by an external tool with different +naming, set `skip_listing=False`. + +### Overriding the default + +```bash +# Disable skip_listing (e.g. externally generated dataset): +uv run mlpstorage training run --object ... \ + --params dataset.skip_listing=False + +# Disable the validation sampling (faster startup, no safety check): +uv run mlpstorage training run --object ... \ + --params dataset.listing_validation_interval=0 + +# Increase validation density (check every 100th file instead of every 1000th): +uv run mlpstorage training run --object ... \ + --params dataset.listing_validation_interval=100 +``` + +### Validation output + +When validation runs you will see lines like: + +``` +skip_listing [train]: validating 50,001 of 50,000,000 files + (first, last, every 1,000) via HEAD requests ... +5,000/50,001 checked (10%) — 483 checks/s — ETA 93s — 0 failed so far +skip_listing [train]: validation complete — all 50,001 samples exist (103.6s) +``` + +If any file is missing, DLIO raises an error listing the failed URIs and +suggests either fixing the dataset or setting `skip_listing=False`. + +--- + ## MPI Workaround (single-node) On a single machine, OpenMPI's default shared-memory transport (`vader` BTL) can diff --git a/mlpstorage_py/benchmarks/dlio.py b/mlpstorage_py/benchmarks/dlio.py index c76d54c1..e7b0d2ab 100755 --- a/mlpstorage_py/benchmarks/dlio.py +++ b/mlpstorage_py/benchmarks/dlio.py @@ -217,6 +217,63 @@ def _apply_object_storage_params(self): f'uri_scheme={uri_scheme}, force_path_style={is_http_scheme and bool(endpoint_url)})' ) + @staticmethod + def _compute_validation_interval(num_files: int) -> int: + """Return a validation-sample interval scaled to dataset size. + + Smaller datasets are checked exhaustively; larger datasets are sampled + geometrically so startup HEAD-check time stays bounded at any scale. + + < 10,000 files → interval 1 (every file) + 10,000 files → interval 10 + 100,000 files → interval 100 + 1,000,000 files → interval 1,000 + 10,000,000+ files → interval 10,000 + """ + if num_files < 10_000: + return 1 + if num_files < 100_000: + return 10 + if num_files < 1_000_000: + return 100 + if num_files < 10_000_000: + return 1_000 + return 10_000 + + def _apply_skip_listing_params(self): + """Inject skip_listing=True and an adaptive listing_validation_interval. + + Applies to both file and object storage. skip_listing is safe whenever + data was generated by DLIO, which always uses the standard naming + convention: {prefix}_{idx:0N}_of_{total}.{format}. Each MPI rank + independently reconstructs its own shard — zero storage API calls, + zero MPI communication, and no process ever holds the full file list. + + The validation interval is derived from num_files_train so that small + datasets are validated exhaustively (every file) while large datasets + are sampled geometrically — keeping HEAD-check overhead bounded to + ~100 s even at 50 M files. + + Both params respect user --params overrides. + """ + if 'dataset.skip_listing' not in self.params_dict: + self.params_dict['dataset.skip_listing'] = 'True' + + if 'dataset.listing_validation_interval' not in self.params_dict: + raw = (self.combined_params or {}).get('dataset', {}).get('num_files_train', 0) + try: + num_files = int(raw) + except (ValueError, TypeError): + num_files = 0 + interval = self._compute_validation_interval(num_files) + self.params_dict['dataset.listing_validation_interval'] = str(interval) + checks = (num_files // interval) + 2 if interval > 0 and num_files > 0 else num_files + self.logger.info( + f'skip_listing enabled: {num_files:,} train files → ' + f'validation_interval={interval:,} ' + f'(~{checks:,} HEAD checks at startup)' + ) + @staticmethod def _strip_uri_scheme(value): # DLIO obj_store_lib treats storage_root as a bare bucket/prefix and @@ -331,6 +388,9 @@ def __init__(self, args, **kwargs): # Inject object storage params before add_datadir_param (which reads storage_type # from params_dict to decide whether to create local directories). self._apply_object_storage_params() + # Enable skip_listing for all storage types (file and object). Must be + # called after _apply_object_storage_params so combined_params is final. + self._apply_skip_listing_params() if self.args.command not in ("datagen", "datasize"): self.verify_benchmark()