From 47ad9f5ffc61894ace599ca7652806574286751d Mon Sep 17 00:00:00 2001
From: Curtis Anderson <curtis@mlcommons.org>
Date: Tue, 16 Jun 2026 14:51:17 -0700
Subject: [PATCH] fix(config): scope worker-memory budget to per-node (#448)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The memory-budget guard in ConfigArguments.validate compared a global
worker count (read_threads × comm_size) against one node's RAM
(psutil.virtual_memory().total). Operands in incompatible units, so
valid multi-node configurations were rejected with a misleading
"reduce read_threads to N" suggestion where N collapsed to 0 at
~100 nodes — leaving the user with no valid setting.

Changes:

- Worker count is now per-node: read_threads × DLIOMPI.ranks_per_node().
  ranks_per_node() already exists (used by the auto-sizing path,
  config.py:774) and falls back to comm_size in CHILD_INITIALIZED state,
  matching the conservative behavior elsewhere in this file.
- Budget basis is now psutil.virtual_memory().available with a 90%
  safety margin (so already-used RAM is respected). The previous
  .total basis blocked large machines for fresh RAM that was never
  going to be there.
- Error and warning messages now name the host, report local_ranks
  instead of comm_size, and the max_threads suggestion is derived
  from per-node arithmetic.
- The 50% available-RAM warning gets the same per-node treatment.

Worked example from the issue: 100 nodes × 12 ranks × read_threads=16,
256 GiB/node — per-node demand is 96 GB (16 × 12 × 0.5), well under
budget. Old code computed 9.6 TB against 256 GB and rejected. New
code computes 96 GB against 256 GB and passes.

Refs mlcommons/storage#448
---
 dlio_benchmark/utils/config.py | 52 +++++++++++++++++++++-------------
 1 file changed, 32 insertions(+), 20 deletions(-)

diff --git a/dlio_benchmark/utils/config.py b/dlio_benchmark/utils/config.py
index effecbd4..cc6f9a80 100644
--- a/dlio_benchmark/utils/config.py
+++ b/dlio_benchmark/utils/config.py
@@ -384,36 +384,48 @@ def validate(self):
                     self.logger.warning(
                         f"Running DLIO with {self.read_threads} threads for I/O but core available {cores_available} "
                         f"are insufficient and can lead to lower performance.")
-        # Memory budget guard: spawned worker processes must not exhaust system RAM.
-        # Each worker loads Python + framework + reader libraries (~512 MB RSS minimum).
-        # The hard cap is 32 GB so these benchmarks run on any compliant system.
-        # This check runs on all ranks so every rank refuses before workers are spawned.
+        # Memory budget guard: spawned worker processes must not exhaust this
+        # node's RAM.  Worker counts are scoped to *this* node — not the world
+        # communicator — because RAM is a per-node resource.  Prior code multiplied
+        # by self.comm_size and compared against one node's RAM, which produced
+        # false positives at multi-node scale (issue mlcommons/storage#448:
+        # the suggested max_threads collapsed to 0 at 100+ nodes even though the
+        # per-node load was well under capacity).
         if self.read_threads > 0 and self.data_loader in [
             DataLoaderType.PYTORCH, DataLoaderType.DALI
         ]:
+            import platform
+            import socket
             import psutil
-            total_workers = self.read_threads * self.comm_size
-            # 512 MB per spawned worker is the minimum observed RSS (framework imports only).
+            ranks_per_node = DLIOMPI.get_instance().ranks_per_node()
+            local_workers = self.read_threads * ranks_per_node
+            # 512 MB per spawned worker is the minimum observed RSS (framework
+            # imports only).  Compare against psutil.virtual_memory().available
+            # with a 90% safety margin so already-used RAM is respected.
             per_worker_mb = 512
-            # Use actual installed RAM so large machines aren't blocked (#372).
-            # Spawning more workers than can fit in RAM is still an error.
-            BUDGET_MB = psutil.virtual_memory().total // (1024 * 1024)
-            estimated_mb = per_worker_mb * total_workers
+            vm = psutil.virtual_memory()
+            available_mb = vm.available // (1024 * 1024)
+            BUDGET_MB = int(available_mb * 0.9)
+            estimated_mb = per_worker_mb * local_workers
+            hostname = socket.gethostname()
             if estimated_mb > BUDGET_MB:
-                max_threads = BUDGET_MB // per_worker_mb // max(1, self.comm_size)
+                max_threads = BUDGET_MB // per_worker_mb // max(1, ranks_per_node)
                 raise Exception(
-                    f"Memory budget exceeded: reader.read_threads={self.read_threads} "
-                    f"x comm_size={self.comm_size} = {total_workers} worker processes, "
-                    f"estimated ~{estimated_mb // 1024} GB (host RAM: {BUDGET_MB // 1024} GB). "
-                    f"Reduce reader.read_threads to at most {max_threads} for this run."
+                    f"Per-node memory budget exceeded on host {hostname}: "
+                    f"reader.read_threads={self.read_threads} x local_ranks="
+                    f"{ranks_per_node} = {local_workers} worker processes, "
+                    f"estimated ~{estimated_mb // 1024} GB "
+                    f"(available RAM on this node: {available_mb // 1024} GB; "
+                    f"total: {vm.total // (1024**3)} GB). "
+                    f"Reduce reader.read_threads to at most {max_threads}."
                 )
-            # Also warn if estimated usage exceeds 50% of available RAM on this machine
-            available_mb = psutil.virtual_memory().available // (1024 * 1024)
+            # Also warn if estimated usage exceeds 50% of available RAM on this node
             if estimated_mb > available_mb * 0.5:
                 self.logger.warning(
-                    f"reader.read_threads={self.read_threads} x comm_size={self.comm_size} "
-                    f"= {total_workers} workers, estimated ~{estimated_mb // 1024} GB — "
-                    f"exceeds 50% of available RAM ({available_mb // 1024} GB). "
+                    f"On host {hostname}: reader.read_threads={self.read_threads} "
+                    f"x local_ranks={ranks_per_node} = {local_workers} workers, "
+                    f"estimated ~{estimated_mb // 1024} GB — exceeds 50% of "
+                    f"available RAM on this node ({available_mb // 1024} GB). "
                     f"Consider reducing read_threads to avoid OOM."
                 )
         if self.num_layers > 0 and self.num_layers < self.pipeline_parallelism: