From 60fd3b8e7ae9cc8be644b47df0661366ac2c8bd6 Mon Sep 17 00:00:00 2001
From: Wolfgang De Salvador
Date: Tue, 9 Jun 2026 12:52:28 +0000
Subject: [PATCH] fix: disable mpi4py auto-init to prevent spawn-worker MPI_Init abort

The existing comment in utility.py already documented this issue:

    "MPI cannot be initialized automatically, or read_thread spawn/forkserver
    child processes will abort trying to open a non-existant PMI_fd file."

However the rc flags that actually disable mpi4py's auto-initialization
were never set, so a bare "from mpi4py import MPI" still triggered
MPI_Init_thread() at module import time.

When PyTorch DataLoader workers are created with the default 'spawn'
multiprocessing context, each child re-imports dlio_benchmark modules.
Outside mpirun's PMIX namespace, the auto MPI_Init aborts with:

    orte_ess_init failed
    --> Returned value No permission (-17) instead of ORTE_SUCCESS
    *** An error occurred in MPI_Init_thread
    on a NULL communicator

Setting mpi4py.rc.initialize = False and mpi4py.rc.finalize = False
before any "from mpi4py import MPI" prevents the auto-init. Main MPI
ranks still get initialized explicitly via DLIOMPI.initialize() which
calls MPI.Init() when MPI.Is_initialized() is False.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 dlio_benchmark/utils/utility.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/dlio_benchmark/utils/utility.py b/dlio_benchmark/utils/utility.py
index 890cd728..f937c139 100644
--- a/dlio_benchmark/utils/utility.py
+++ b/dlio_benchmark/utils/utility.py
@@ -176,6 +176,8 @@ def reset():
 # MPI cannot be initialized automatically, or read_thread spawn/forkserver
 # child processes will abort trying to open a non-existant PMI_fd file.
 import mpi4py
+mpi4py.rc.initialize = False
+mpi4py.rc.finalize = False
 p = psutil.Process()