From ffe6f9bfacc36bbae22204b914a1f042e731c5bf Mon Sep 17 00:00:00 2001 From: Curtis Anderson Date: Tue, 16 Jun 2026 14:49:44 -0700 Subject: [PATCH] fix(main): use sudo -n for page-cache flush; warn once on failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-epoch drop_caches call invoked sudo interactively, inheriting the user's TTY from mpirun's stdin. When sudo's NOPASSWD wasn't configured, sudo blocked at the password prompt — and when mlpstorage_py's rich progress bar overwrote the prompt line, users could not see or respond to it. The bare `except Exception: pass` then swallowed the 30s timeout silently every epoch. Reporter of mlcommons/storage#391 sat on a hung training run for ~16 hours. Two changes: 1. `sudo -n` (non-interactive) plus stdin redirected to /dev/null — sudo fails immediately instead of prompting for a password it can't read. 2. A `drop_caches_disabled` flag set on the first failure suppresses subsequent attempts and emits a single explanatory warning telling the user how to enable passwordless sudo for the flush. No more silent 30s/epoch stalls, no more log spam. Behavior on a properly-configured host (NOPASSWD sudo) is unchanged. Refs mlcommons/storage#391 --- dlio_benchmark/main.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/dlio_benchmark/main.py b/dlio_benchmark/main.py index 86733fd2..f7fdb118 100644 --- a/dlio_benchmark/main.py +++ b/dlio_benchmark/main.py @@ -444,18 +444,35 @@ def run(self): if self.do_eval: self.framework.get_loader(dataset_type=DatasetType.VALID).read() self.comm.barrier() + # Skip the per-epoch page-cache flush after the first failure so a host + # without NOPASSWD sudo doesn't pay the failure cost on every epoch and + # the warning fires exactly once. See mlcommons/storage issue #391. 
+ drop_caches_disabled = False for epoch in dft_ai.pipeline.epoch.iter(range(1, self.epochs + 1), include_iter=False): # Flush page cache before each epoch so reads bypass the OS buffer cache. - # Rank 0 does the flush via sudo; all ranks barrier-wait so no rank starts - # reading stale cached data. - if self.my_rank == 0: + # Rank 0 does the flush via sudo -n (non-interactive); all ranks barrier- + # wait so no rank starts reading stale cached data. If sudo requires a + # password, -n makes it exit non-zero immediately (a missing sudo raises + # instead); either way we log one warning and stop trying. This avoids the + # interactive password prompt that hung the #391 run for ~16 hours. + if self.my_rank == 0 and not drop_caches_disabled: try: subprocess.run( - ["sudo", "sh", "-c", "echo 3 > /proc/sys/vm/drop_caches"], - check=True, timeout=30 + ["sudo", "-n", "sh", "-c", "echo 3 > /proc/sys/vm/drop_caches"], + check=True, timeout=30, + stdin=subprocess.DEVNULL, + stdout=subprocess.DEVNULL, + stderr=subprocess.PIPE, + ) + except Exception as exc: + drop_caches_disabled = True + self.logger.warning( + f"Could not flush page cache between epochs: {exc}. " + "Per-epoch reads may be served from the OS buffer cache, " + "inflating throughput numbers. To enable, configure " + "passwordless sudo for `sh -c 'echo 3 > /proc/sys/vm/" + "drop_caches'`." ) - except Exception: - pass self.comm.barrier() self.stats.start_epoch(epoch) self.next_checkpoint_step = self.steps_between_checkpoints