From 8332f0582efd5d3c6ce6eedbffd6d41b46aa420d Mon Sep 17 00:00:00 2001
From: Curtis Anderson
Date: Fri, 12 Jun 2026 11:56:11 -0700
Subject: [PATCH] fix: pin dlio-benchmark to PR #22 head to resolve issue #415

PyTorch DataLoader workers were aborting with "MPI_Init_thread on a NULL
communicator" whenever a DLIO workload used reader.read_threads > 0
(e.g. retinanet_b200, unet3d_*).

Root cause is in dlio_benchmark: the top-level `from mpi4py import MPI`
in statscounter.py triggers MPI_Init_thread() in every spawn-context
DataLoader child, which has no PMIX/ORTE environment because it was not
launched by mpirun.

The upstream comment in dlio_benchmark/utils/utility.py:176 already
documented the hazard, but the mpi4py.rc.initialize / rc.finalize flags
that actually disable the auto-init were never set.
mlcommons/DLIO_local_changes#22 adds those two lines.

Pin to that PR's head commit (60fd3b8e) so storage picks up the fix
immediately. Revert to branch = "main" once #22 merges upstream.
---
 pyproject.toml | 6 ++++--
 uv.lock        | 8 ++++----
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 95215e2e..8ca11ae9 100755
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "mlpstorage"
-version = "3.0.7"
+version = "3.0.9"
 description = "MLPerf Storage Benchmark Suite"
 readme = "README.md"
 license = {text = "Apache-2.0"}
@@ -122,7 +122,9 @@ environments = ["sys_platform == 'linux'"]
 torch = [{ index = "pytorch-cpu" }]
 torchvision = [{ index = "pytorch-cpu" }]
 torchaudio = [{ index = "pytorch-cpu" }]
-dlio-benchmark = { git = "https://github.com/mlcommons/DLIO_local_changes.git", branch = "main" }
+# Pinned to PR #22 head (fix for issue #415: mpi4py auto-init aborts in PyTorch
+# DataLoader spawn-workers). Revert to branch = "main" once the PR merges upstream.
+dlio-benchmark = { git = "https://github.com/mlcommons/DLIO_local_changes.git", rev = "60fd3b8e7ae9cc8be644b47df0661366ac2c8bd6" }
 
 [dependency-groups]
 dev = [
diff --git a/uv.lock b/uv.lock
index 3f339085..ef8236da 100644
--- a/uv.lock
+++ b/uv.lock
@@ -237,7 +237,7 @@ wheels = [
 [[package]]
 name = "dlio-benchmark"
 version = "3.0.2"
-source = { git = "https://github.com/mlcommons/DLIO_local_changes.git?branch=main#e4c9b7aa3ffa96fae0054b92ea6629996dbd5f23" }
+source = { git = "https://github.com/mlcommons/DLIO_local_changes.git?rev=60fd3b8e7ae9cc8be644b47df0661366ac2c8bd6#60fd3b8e7ae9cc8be644b47df0661366ac2c8bd6" }
 dependencies = [
     { name = "dgen-py", marker = "sys_platform == 'linux'" },
     { name = "h5py", marker = "sys_platform == 'linux'" },
@@ -518,7 +518,7 @@ wheels = [
 
 [[package]]
 name = "mlpstorage"
-version = "3.0.7"
+version = "3.0.9"
 source = { editable = "." }
 dependencies = [
     { name = "dlio-benchmark", marker = "sys_platform == 'linux'" },
@@ -583,8 +583,8 @@ dev = [
 
 [package.metadata]
 requires-dist = [
-    { name = "dlio-benchmark", git = "https://github.com/mlcommons/DLIO_local_changes.git?branch=main" },
-    { name = "dlio-benchmark", marker = "extra == 'full'", git = "https://github.com/mlcommons/DLIO_local_changes.git?branch=main" },
+    { name = "dlio-benchmark", git = "https://github.com/mlcommons/DLIO_local_changes.git?rev=60fd3b8e7ae9cc8be644b47df0661366ac2c8bd6" },
+    { name = "dlio-benchmark", marker = "extra == 'full'", git = "https://github.com/mlcommons/DLIO_local_changes.git?rev=60fd3b8e7ae9cc8be644b47df0661366ac2c8bd6" },
     { name = "elasticsearch", marker = "extra == 'vectordb'", specifier = ">=8.0" },
     { name = "elasticsearch", marker = "extra == 'vectordb-elasticsearch'", specifier = ">=8.0" },
     { name = "minio", specifier = ">=7.2.20" },