From c753b7f7c9b3795edd9452969a00ee3e9de8e7ba Mon Sep 17 00:00:00 2001
From: Curtis Anderson
Date: Tue, 16 Jun 2026 14:55:25 -0700
Subject: [PATCH] fix: bump DLIO pin for storage #391 (sudo prompt) + #448
 (per-node memory budget)

Pin moves from 60fd3b8 to 814f3ff (FileSystemGuy-combined-391-448 in
mlcommons/DLIO_local_changes), which adds two fixes on top of the
existing pin:

- DLIO PR #23 (storage #391 follow-up): per-epoch page-cache flush uses
  `sudo -n` instead of interactive sudo, and disables itself after the
  first failure with a one-line remediation warning. Stops the silent
  30s-per-epoch stall (and the hours-long hang the storage #391 reporter
  saw) on hosts without NOPASSWD sudo.
- DLIO PR #24 (storage #448): worker-memory budget guard now scopes to
  per-node (read_threads x ranks_per_node), uses
  psutil.virtual_memory().available with a 90% safety margin instead of
  .total, and reports hostname + local_ranks in the error/warning text.
  Removes the false-positive that rejected valid multi-node
  configurations and collapsed max_threads to 0 at ~100 nodes.

Once both DLIO PRs merge into main, revert the pin to branch = "main"
and delete the combined branch.

Refs #391, #448
---
 pyproject.toml | 10 +++++++---
 uv.lock        |  6 +++---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8ca11ae9..d3d4bf20 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -122,9 +122,13 @@ environments = ["sys_platform == 'linux'"]
 torch = [{ index = "pytorch-cpu" }]
 torchvision = [{ index = "pytorch-cpu" }]
 torchaudio = [{ index = "pytorch-cpu" }]
-# Pinned to PR #22 head (fix for issue #415: mpi4py auto-init aborts in PyTorch
-# DataLoader spawn-workers). Revert to branch = "main" once the PR merges upstream.
-dlio-benchmark = { git = "https://github.com/mlcommons/DLIO_local_changes.git", rev = "60fd3b8e7ae9cc8be644b47df0661366ac2c8bd6" }
+# Pinned to FileSystemGuy-combined-391-448 head — carries:
+# - DLIO PR #21 fix for storage #391 part 1 (TorchIterableDatasetSimple gating)
+# - DLIO PR #22 fix for storage #415 (mpi4py auto-init in spawn workers)
+# - DLIO PR #23 fix for storage #391 part 2 (sudo -n + warn-once for drop_caches)
+# - DLIO PR #24 fix for storage #448 (per-node worker-memory budget)
+# Revert to branch = "main" once PRs #23 and #24 merge upstream.
+dlio-benchmark = { git = "https://github.com/mlcommons/DLIO_local_changes.git", rev = "814f3ff400c865a294bfc70714a3289efbfa83ac" }
 
 [dependency-groups]
 dev = [
diff --git a/uv.lock b/uv.lock
index ef8236da..5137a3b7 100644
--- a/uv.lock
+++ b/uv.lock
@@ -237,7 +237,7 @@ wheels = [
 [[package]]
 name = "dlio-benchmark"
 version = "3.0.2"
-source = { git = "https://github.com/mlcommons/DLIO_local_changes.git?rev=60fd3b8e7ae9cc8be644b47df0661366ac2c8bd6#60fd3b8e7ae9cc8be644b47df0661366ac2c8bd6" }
+source = { git = "https://github.com/mlcommons/DLIO_local_changes.git?rev=814f3ff400c865a294bfc70714a3289efbfa83ac#814f3ff400c865a294bfc70714a3289efbfa83ac" }
 dependencies = [
     { name = "dgen-py", marker = "sys_platform == 'linux'" },
     { name = "h5py", marker = "sys_platform == 'linux'" },
@@ -583,8 +583,8 @@ dev = [
 
 [package.metadata]
 requires-dist = [
-    { name = "dlio-benchmark", git = "https://github.com/mlcommons/DLIO_local_changes.git?rev=60fd3b8e7ae9cc8be644b47df0661366ac2c8bd6" },
-    { name = "dlio-benchmark", marker = "extra == 'full'", git = "https://github.com/mlcommons/DLIO_local_changes.git?rev=60fd3b8e7ae9cc8be644b47df0661366ac2c8bd6" },
+    { name = "dlio-benchmark", git = "https://github.com/mlcommons/DLIO_local_changes.git?rev=814f3ff400c865a294bfc70714a3289efbfa83ac" },
+    { name = "dlio-benchmark", marker = "extra == 'full'", git = "https://github.com/mlcommons/DLIO_local_changes.git?rev=814f3ff400c865a294bfc70714a3289efbfa83ac" },
     { name = "elasticsearch", marker = "extra == 'vectordb'", specifier = ">=8.0" },
     { name = "elasticsearch", marker = "extra == 'vectordb-elasticsearch'", specifier = ">=8.0" },
     { name = "minio", specifier = ">=7.2.20" },