From dec4d422873b60a4ac6a03dacd23b9a9b24f2a38 Mon Sep 17 00:00:00 2001 From: Curtis Anderson Date: Tue, 26 May 2026 11:57:32 -0700 Subject: [PATCH 1/5] Fix two silent hang causes in unet3d training benchmark - utils.py: replace readline() with buffer.read1() in CommandExecutor to prevent blocking on partial/\r-terminated output from DLIO/MPI processes - cluster_collector.py: wrap collect_local_info() in try/except inside MPI_COLLECTOR_SCRIPT so every rank always reaches comm.gather(), preventing a deadlock when any rank exits early due to a collection error - tests/unit/test_cluster_collector.py: add TestMPICollectorScriptMain with 6 tests covering the gather-always-called invariant --- mlpstorage_py/cluster_collector.py | 16 ++- mlpstorage_py/utils.py | 7 +- tests/unit/test_cluster_collector.py | 144 +++++++++++++++++++++++++++ 3 files changed, 161 insertions(+), 6 deletions(-) diff --git a/mlpstorage_py/cluster_collector.py b/mlpstorage_py/cluster_collector.py index a4a1ea4e..bc1a12b9 100644 --- a/mlpstorage_py/cluster_collector.py +++ b/mlpstorage_py/cluster_collector.py @@ -1128,11 +1128,19 @@ def main(): json.dump(error_output, f, indent=2) sys.exit(1) - # Collect local info - local_info = collect_local_info() - local_info['mpi_rank'] = rank + # Collect local info — wrap in try/except so every rank always reaches + # comm.gather(); an early exit from any rank would deadlock all others. + try: + local_info = collect_local_info() + local_info['mpi_rank'] = rank + except Exception as e: + local_info = { + 'hostname': socket.gethostname(), + 'mpi_rank': rank, + '_collection_error': str(e), + } - # Gather all info to rank 0 + # Gather all info to rank 0 — every rank must reach this call all_info = comm.gather(local_info, root=0) if rank == 0: diff --git a/mlpstorage_py/utils.py b/mlpstorage_py/utils.py index 7c4c80b8..21fe8935 100755 --- a/mlpstorage_py/utils.py +++ b/mlpstorage_py/utils.py @@ -387,9 +387,12 @@ def execute(self, ) for stream in readable: - line = stream.readline() - if not line: # EOF + # read1() returns whatever bytes are in the pipe buffer without + # blocking for '\n', preventing a hang on \r-terminated output. + raw = stream.buffer.read1(65536) + if not raw: # EOF continue + line = raw.decode('utf-8', errors='replace') if stream.fileno() == stdout_fd: stdout_buffer.write(line) diff --git a/tests/unit/test_cluster_collector.py b/tests/unit/test_cluster_collector.py index b8042e0c..6feca9ed 100755 --- a/tests/unit/test_cluster_collector.py +++ b/tests/unit/test_cluster_collector.py @@ -1210,3 +1210,147 @@ def test_handles_unreachable_host_gracefully(self): if bad_host_samples: # If we got samples, they should have errors assert any('errors' in s for s in bad_host_samples) + + +class TestMPICollectorScriptMain: + """Tests for the main() function embedded in MPI_COLLECTOR_SCRIPT. + + Verifies that every rank always calls comm.gather() even when + collect_local_info() raises, preventing a deadlock on surviving ranks. + """ + + @staticmethod + def _load_script_ns(): + """Exec MPI_COLLECTOR_SCRIPT into a fresh namespace and return it.""" + from mlpstorage_py.cluster_collector import MPI_COLLECTOR_SCRIPT + ns = {'__name__': 'mlps_collector'} + exec(MPI_COLLECTOR_SCRIPT, ns) + return ns + + @staticmethod + def _mock_mpi(mock_comm): + """Return a sys.modules patch dict wiring mock_comm as MPI.COMM_WORLD.""" + mock_mpi = MagicMock() + mock_mpi.COMM_WORLD = mock_comm + mock_mpi4py = MagicMock() + mock_mpi4py.MPI = mock_mpi + return {'mpi4py': mock_mpi4py} + + def test_gather_called_on_successful_collection(self, tmp_path): + """Normal path: gather is called with local info dict including mpi_rank.""" + output_file = str(tmp_path / 'out.json') + mock_comm = MagicMock() + mock_comm.Get_rank.return_value = 1 + mock_comm.Get_size.return_value = 2 + mock_comm.gather.return_value = None + + ns = self._load_script_ns() + ns['collect_local_info'] = MagicMock(return_value={'hostname': 'node1'}) + + with patch.dict('sys.modules', self._mock_mpi(mock_comm)), \ + patch('sys.argv', ['script', output_file]): + ns['main']() + + mock_comm.gather.assert_called_once() + gathered_info = mock_comm.gather.call_args[0][0] + assert gathered_info['hostname'] == 'node1' + assert gathered_info['mpi_rank'] == 1 + assert '_collection_error' not in gathered_info + + def test_gather_still_called_when_collection_raises(self, tmp_path): + """Error path: gather is called even when collect_local_info() raises.""" + output_file = str(tmp_path / 'out.json') + mock_comm = MagicMock() + mock_comm.Get_rank.return_value = 1 + mock_comm.Get_size.return_value = 2 + mock_comm.gather.return_value = None + + ns = self._load_script_ns() + ns['collect_local_info'] = MagicMock(side_effect=RuntimeError('disk read failed')) + + with patch.dict('sys.modules', self._mock_mpi(mock_comm)), \ + patch('sys.argv', ['script', output_file]): + ns['main']() + + mock_comm.gather.assert_called_once() + + def test_sentinel_has_collection_error_key(self, tmp_path): + """Error sentinel must contain _collection_error so callers can detect failures.""" + output_file = str(tmp_path / 'out.json') + mock_comm = MagicMock() + mock_comm.Get_rank.return_value = 1 + mock_comm.Get_size.return_value = 2 + mock_comm.gather.return_value = None + + ns = self._load_script_ns() + ns['collect_local_info'] = MagicMock(side_effect=RuntimeError('disk read failed')) + + with patch.dict('sys.modules', self._mock_mpi(mock_comm)), \ + patch('sys.argv', ['script', output_file]): + ns['main']() + + gathered_info = mock_comm.gather.call_args[0][0] + assert '_collection_error' in gathered_info + assert 'disk read failed' in gathered_info['_collection_error'] + + def test_sentinel_has_hostname_and_rank(self, tmp_path): + """Error sentinel must carry hostname and mpi_rank so rank 0 can identify the source.""" + output_file = str(tmp_path / 'out.json') + mock_comm = MagicMock() + mock_comm.Get_rank.return_value = 2 + mock_comm.Get_size.return_value = 4 + mock_comm.gather.return_value = None + + ns = self._load_script_ns() + ns['collect_local_info'] = MagicMock(side_effect=OSError('permission denied')) + + with patch.dict('sys.modules', self._mock_mpi(mock_comm)), \ + patch('sys.argv', ['script', output_file]): + ns['main']() + + gathered_info = mock_comm.gather.call_args[0][0] + assert gathered_info['mpi_rank'] == 2 + assert 'hostname' in gathered_info + + def test_rank_zero_writes_output_file(self, tmp_path): + """Rank 0 writes the JSON output file when collection succeeds.""" + output_file = str(tmp_path / 'out.json') + mock_comm = MagicMock() + mock_comm.Get_rank.return_value = 0 + mock_comm.Get_size.return_value = 1 + mock_comm.gather.return_value = [{'hostname': 'node0', 'mpi_rank': 0}] + + ns = self._load_script_ns() + ns['collect_local_info'] = MagicMock(return_value={'hostname': 'node0'}) + + with patch.dict('sys.modules', self._mock_mpi(mock_comm)), \ + patch('sys.argv', ['script', output_file]): + ns['main']() + + with open(output_file) as f: + data = json.load(f) + assert 'node0' in data + + def test_rank_zero_writes_output_when_another_rank_sent_sentinel(self, tmp_path): + """Rank 0 writes JSON even when another rank's payload is an error sentinel.""" + output_file = str(tmp_path / 'out.json') + mock_comm = MagicMock() + mock_comm.Get_rank.return_value = 0 + mock_comm.Get_size.return_value = 2 + mock_comm.gather.return_value = [ + {'hostname': 'node0', 'mpi_rank': 0}, + {'hostname': 'node1', 'mpi_rank': 1, '_collection_error': 'disk read failed'}, + ] + + ns = self._load_script_ns() + ns['collect_local_info'] = MagicMock(return_value={'hostname': 'node0'}) + + with patch.dict('sys.modules', self._mock_mpi(mock_comm)), \ + patch('sys.argv', ['script', output_file]): + ns['main']() + + with open(output_file) as f: + data = json.load(f) + assert 'node0' in data + assert 'node1' in data + assert '_collection_error' in data['node1'] From 301e3f2607f7c366798857e7b1a0239e0769c8bb Mon Sep 17 00:00:00 2001 From: Curtis Anderson Date: Tue, 26 May 2026 12:53:39 -0700 Subject: [PATCH 2/5] =?UTF-8?q?wip:=20fork-after-mpi-init=20fix=20paused?= =?UTF-8?q?=20=E2=80=94=20utils.py=20change=20uncommitted?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Handoff for the third unet3d hang fix. The pipe-drain blocking .read() replacement (select+read1 with 0.5s timeout) is applied but not yet committed. See .planning/.continue-here.md for full context. Co-Authored-By: Claude Sonnet 4.6 --- .planning/.continue-here.md | 85 +++++++++++++++++++++++++++++++++++++ .planning/HANDOFF.json | 76 +++++++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+) create mode 100644 .planning/.continue-here.md create mode 100644 .planning/HANDOFF.json diff --git a/.planning/.continue-here.md b/.planning/.continue-here.md new file mode 100644 index 00000000..e0cbe9a4 --- /dev/null +++ b/.planning/.continue-here.md @@ -0,0 +1,85 @@ +--- +context: default +phase: null +task: null +total_tasks: null +status: in_progress +branch: FileSystemGuy-une3d-hang +last_updated: 2026-05-26T19:52:28Z +--- + +# BLOCKING CONSTRAINTS — Read Before Anything Else + +_No constraints discovered through failure this session. Section removed._ + + +Branch `FileSystemGuy-une3d-hang`. This session reviewed the unet3d training +hang investigation and applied a third surgical fix to `mlpstorage_py/utils.py`. +The fix is **uncommitted**. The branch already has two committed fixes in `dec4d42`. + +The file that needs committing: +- `mlpstorage_py/utils.py` — drain loop replacing blocking `.read()` calls + + + + +1. **Reviewed all changes in `dec4d42`** (the existing PR commit): + - `utils.py`: `readline()` → `buffer.read1()` — prevents hang on `\r`-terminated DLIO/tqdm output + - `cluster_collector.py`: `try/except` around `collect_local_info()` so every MPI rank always reaches `comm.gather()` — prevents deadlock when any rank exits early + - `test_cluster_collector.py`: 6 new tests covering gather-always invariant + +2. **Identified the fork()-after-MPI-init hang** (third issue, not yet in PR): + - Root cause chain: mpirun → DLIO rank (calls `MPI_Init()`) → PyTorch DataLoader `fork()`s workers → forked workers inherit pipe write-ends AND corrupted MPI state → may hang/never exit → mpirun exits (its direct children are done) → `while process.poll() is None` loop exits → **`self.process.stdout.read()` blocks indefinitely** because orphaned DataLoader grandchildren still hold the pipe write-end. + +3. **Applied surgical fix** to `CommandExecutor.execute()` in `utils.py` (lines 408-421): + - Replaced two `TextIOWrapper.read()` blocking calls with a `select()+read1()` drain loop + - `0.5s` timeout: EOF case (no orphans) completes immediately; orphan case times out and un-sticks orchestrator + - 176 unit tests pass (`tests/unit/test_cluster_collector.py`, `tests/unit/test_cli.py`) + + + + +- **Commit `mlpstorage_py/utils.py`** with an appropriate message covering the fork()-after-MPI-init fix +- Optional (out of scope for this PR): eliminate the redundant double cluster collection — `DLIOBenchmark.__init__` → `accumulate_host_info()` → `_collect_cluster_information()` AND `base.Benchmark.run()` → `_collect_cluster_start()` → `_collect_cluster_information()` both run `mpirun python3 mlps_collector.py` before the DLIO job starts. Not a hang, just wasted time. + + + + +- **select()+read1() with 0.5s timeout** chosen over `os.set_blocking()` or process-group management — minimal surface area, correct behavior in both normal and orphan cases +- **Do NOT add `start_new_session=True`** to Popen — changes mpirun's session/signal handling, too invasive for late release +- **Double cluster collection left out of scope** — late in release cycle, user wants minimum required change + + + +None. + + +## Required Reading (in order) + +1. `mlpstorage_py/utils.py` lines 360-450 — `CommandExecutor.execute()` — the full context for the fix +2. `mlpstorage_py/cluster_collector.py` lines 1108-1162 — `MPI_COLLECTOR_SCRIPT main()` — the existing gather-always fix +3. `git show dec4d42` — the two existing PR fixes for context before committing the third + +## Critical Anti-Patterns (do NOT repeat these) + +_None discovered this session._ + +## Infrastructure State + +- Branch: `FileSystemGuy-une3d-hang` +- One uncommitted modification: `mlpstorage_py/utils.py` +- `.planning/` directory is untracked (not in git) +- `pyarrow` and `psutil` not installed in this environment — only `test_cluster_collector.py` and `test_cli.py` are runnable + + +User is late in the v2.0 release cycle. The PR (`FileSystemGuy-une3d-hang`) already +has two fixes committed. This session identified and fixed a third hang: the pipe-drain +hang caused by PyTorch DataLoader workers that are forked inside DLIO MPI ranks (fork +after MPI_Init). The fix is minimal — only the post-loop drain section changed, and +it correctly adds zero latency to the normal (no-orphan) case. + + + +Start with: `git add mlpstorage_py/utils.py && git commit` with a message like: +"Fix third unet3d hang: drain pipe with select()+read1() timeout to avoid blocking on orphaned DataLoader workers (fork-after-MPI-init)" + diff --git a/.planning/HANDOFF.json b/.planning/HANDOFF.json new file mode 100644 index 00000000..55809811 --- /dev/null +++ b/.planning/HANDOFF.json @@ -0,0 +1,76 @@ +{ + "version": "1.0", + "timestamp": "2026-05-26T19:52:28Z", + "phase": null, + "phase_name": "fork-after-mpi-init-fix", + "phase_dir": null, + "plan": null, + "task": null, + "total_tasks": null, + "status": "paused", + "context": "ad-hoc code review and surgical fix", + "branch": "FileSystemGuy-une3d-hang", + "completed_tasks": [ + { + "id": 1, + "name": "Review unet3d training support in current PR", + "status": "done", + "notes": "Reviewed all three files changed in dec4d42: utils.py (read1 fix), cluster_collector.py (gather-always fix), test_cluster_collector.py (6 new tests)" + }, + { + "id": 2, + "name": "Identify remaining hang cause: fork()-after-MPI-init", + "status": "done", + "notes": "Root cause: mpirun->DLIO rank (MPI_Init) -> DataLoader fork() -> orphan workers hold pipe write-end -> TextIOWrapper.read() blocks indefinitely after mpirun exits" + }, + { + "id": 3, + "name": "Apply surgical fix to CommandExecutor.execute() drain section", + "status": "done", + "commit": "uncommitted", + "notes": "Replaced two blocking .read() calls with select()+read1() loop using 0.5s timeout. 176 unit tests pass." + } + ], + "remaining_tasks": [ + { + "id": 4, + "name": "Commit the fix", + "status": "not_started", + "notes": "mlpstorage_py/utils.py has the change staged but not committed" + }, + { + "id": 5, + "name": "Optionally: address double cluster collection in DLIOBenchmark.__init__ vs base.Benchmark.run()", + "status": "not_started", + "notes": "Both call _collect_cluster_information() redundantly. Not a hang, just wasted time. Out of scope for this PR per user's late-release constraint." + } + ], + "blockers": [], + "human_actions_pending": [ + { + "action": "Review and commit the utils.py fix", + "context": "One uncommitted file: mlpstorage_py/utils.py with the select-based drain replacing blocking .read() calls", + "blocking": false + } + ], + "decisions": [ + { + "decision": "Use select()+read1() with 0.5s timeout instead of blocking .read()", + "rationale": "When pipe write-end is closed (normal case) select() returns immediately on EOF — zero latency. When orphaned grandchildren hold it open, times out after 0.5s and un-sticks the orchestrator.", + "phase": "n/a" + }, + { + "decision": "Do NOT add process-group management (start_new_session=True + os.killpg)", + "rationale": "More invasive, changes mpirun's session/signal handling. The drain timeout achieves the same unblock with minimal surface area.", + "phase": "n/a" + }, + { + "decision": "Keep double cluster collection (DLIOBenchmark.__init__ + base.Benchmark.run()) out of scope", + "rationale": "User is late in release cycle; only fix minimum required. Double collection is a redundancy/perf issue, not a hang.", + "phase": "n/a" + } + ], + "uncommitted_files": ["mlpstorage_py/utils.py"], + "next_action": "git add mlpstorage_py/utils.py && git commit with message describing the fork()-after-MPI-init fix", + "context_notes": "This is on branch FileSystemGuy-une3d-hang. The PR already has two fixes in dec4d42 (read1 for readline hang, gather-always for MPI deadlock). This session added a third fix for the pipe drain hang caused by orphaned DataLoader workers. All 176 runnable unit tests pass." +} From 2e74360586f62ded9be37221c84cca7376b3922a Mon Sep 17 00:00:00 2001 From: Curtis Anderson Date: Tue, 26 May 2026 12:56:54 -0700 Subject: [PATCH 3/5] Fix unet3d hang: drain stdout/stderr with select()+read1() to avoid blocking on orphaned DataLoader workers PyTorch DataLoader workers are forked inside DLIO MPI ranks after MPI_Init. When mpirun exits (its direct children done), those orphaned grandchildren still hold the pipe write-end open, causing TextIOWrapper.read() in the post-loop drain to block indefinitely. Replacing the blocking .read() calls with a select()+read1() loop (0.5s timeout) un-sticks the orchestrator; the normal (no-orphan) case completes immediately on EOF with zero added latency. --- mlpstorage_py/utils.py | 37 +++++++++++++++++++++++-------------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/mlpstorage_py/utils.py b/mlpstorage_py/utils.py index 21fe8935..065f893a 100755 --- a/mlpstorage_py/utils.py +++ b/mlpstorage_py/utils.py @@ -405,20 +405,29 @@ def execute(self, sys.stderr.write(line) sys.stderr.flush() - # Read any remaining output - stdout_remainder = self.process.stdout.read() - if stdout_remainder: - stdout_buffer.write(stdout_remainder) - if print_stdout: - sys.stdout.write(stdout_remainder) - sys.stdout.flush() - - stderr_remainder = self.process.stderr.read() - if stderr_remainder: - stderr_buffer.write(stderr_remainder) - if print_stderr: - sys.stderr.write(stderr_remainder) - sys.stderr.flush() + # Drain any remaining output. TextIOWrapper.read() blocks until + # EOF, which never arrives if orphaned grandchild processes + # (e.g. PyTorch DataLoader workers forked inside a DLIO MPI rank + # *after* MPI_Init) still hold the pipe write-end open. + # select() + read1() with a short timeout avoids that hang: + # when the write-end is fully closed select() returns immediately + # (EOF is readable), so the normal path has no added latency. + for stream, buf, print_flag, sys_out in [ + (self.process.stdout, stdout_buffer, print_stdout, sys.stdout), + (self.process.stderr, stderr_buffer, print_stderr, sys.stderr), + ]: + while True: + ready, _, _ = select.select([stream], [], [], 0.5) + if not ready: + break + chunk = stream.buffer.read1(65536) + if not chunk: + break + text = chunk.decode('utf-8', errors='replace') + buf.write(text) + if print_flag: + sys_out.write(text) + sys_out.flush() # Get the return code return_code = self.process.poll() From 3d9a9a8da08abae85e78f14cffb40bd5b7bf5f29 Mon Sep 17 00:00:00 2001 From: Curtis Anderson <99758333+FileSystemGuy@users.noreply.github.com> Date: Tue, 26 May 2026 12:58:45 -0700 Subject: [PATCH 4/5] Delete .planning/.continue-here.md --- .planning/.continue-here.md | 85 ------------------------------------- 1 file changed, 85 deletions(-) delete mode 100644 .planning/.continue-here.md diff --git a/.planning/.continue-here.md b/.planning/.continue-here.md deleted file mode 100644 index e0cbe9a4..00000000 --- a/.planning/.continue-here.md +++ /dev/null @@ -1,85 +0,0 @@ ---- -context: default -phase: null -task: null -total_tasks: null -status: in_progress -branch: FileSystemGuy-une3d-hang -last_updated: 2026-05-26T19:52:28Z ---- - -# BLOCKING CONSTRAINTS — Read Before Anything Else - -_No constraints discovered through failure this session. Section removed._ - - -Branch `FileSystemGuy-une3d-hang`. This session reviewed the unet3d training -hang investigation and applied a third surgical fix to `mlpstorage_py/utils.py`. -The fix is **uncommitted**. The branch already has two committed fixes in `dec4d42`. - -The file that needs committing: -- `mlpstorage_py/utils.py` — drain loop replacing blocking `.read()` calls - - - - -1. **Reviewed all changes in `dec4d42`** (the existing PR commit): - - `utils.py`: `readline()` → `buffer.read1()` — prevents hang on `\r`-terminated DLIO/tqdm output - - `cluster_collector.py`: `try/except` around `collect_local_info()` so every MPI rank always reaches `comm.gather()` — prevents deadlock when any rank exits early - - `test_cluster_collector.py`: 6 new tests covering gather-always invariant - -2. **Identified the fork()-after-MPI-init hang** (third issue, not yet in PR): - - Root cause chain: mpirun → DLIO rank (calls `MPI_Init()`) → PyTorch DataLoader `fork()`s workers → forked workers inherit pipe write-ends AND corrupted MPI state → may hang/never exit → mpirun exits (its direct children are done) → `while process.poll() is None` loop exits → **`self.process.stdout.read()` blocks indefinitely** because orphaned DataLoader grandchildren still hold the pipe write-end. - -3. **Applied surgical fix** to `CommandExecutor.execute()` in `utils.py` (lines 408-421): - - Replaced two `TextIOWrapper.read()` blocking calls with a `select()+read1()` drain loop - - `0.5s` timeout: EOF case (no orphans) completes immediately; orphan case times out and un-sticks orchestrator - - 176 unit tests pass (`tests/unit/test_cluster_collector.py`, `tests/unit/test_cli.py`) - - - - -- **Commit `mlpstorage_py/utils.py`** with an appropriate message covering the fork()-after-MPI-init fix -- Optional (out of scope for this PR): eliminate the redundant double cluster collection — `DLIOBenchmark.__init__` → `accumulate_host_info()` → `_collect_cluster_information()` AND `base.Benchmark.run()` → `_collect_cluster_start()` → `_collect_cluster_information()` both run `mpirun python3 mlps_collector.py` before the DLIO job starts. Not a hang, just wasted time. - - - - -- **select()+read1() with 0.5s timeout** chosen over `os.set_blocking()` or process-group management — minimal surface area, correct behavior in both normal and orphan cases -- **Do NOT add `start_new_session=True`** to Popen — changes mpirun's session/signal handling, too invasive for late release -- **Double cluster collection left out of scope** — late in release cycle, user wants minimum required change - - - -None. - - -## Required Reading (in order) - -1. `mlpstorage_py/utils.py` lines 360-450 — `CommandExecutor.execute()` — the full context for the fix -2. `mlpstorage_py/cluster_collector.py` lines 1108-1162 — `MPI_COLLECTOR_SCRIPT main()` — the existing gather-always fix -3. `git show dec4d42` — the two existing PR fixes for context before committing the third - -## Critical Anti-Patterns (do NOT repeat these) - -_None discovered this session._ - -## Infrastructure State - -- Branch: `FileSystemGuy-une3d-hang` -- One uncommitted modification: `mlpstorage_py/utils.py` -- `.planning/` directory is untracked (not in git) -- `pyarrow` and `psutil` not installed in this environment — only `test_cluster_collector.py` and `test_cli.py` are runnable - - -User is late in the v2.0 release cycle. The PR (`FileSystemGuy-une3d-hang`) already -has two fixes committed. This session identified and fixed a third hang: the pipe-drain -hang caused by PyTorch DataLoader workers that are forked inside DLIO MPI ranks (fork -after MPI_Init). The fix is minimal — only the post-loop drain section changed, and -it correctly adds zero latency to the normal (no-orphan) case. - - - -Start with: `git add mlpstorage_py/utils.py && git commit` with a message like: -"Fix third unet3d hang: drain pipe with select()+read1() timeout to avoid blocking on orphaned DataLoader workers (fork-after-MPI-init)" - From b8a4dee3e1bad2456277265d895e23c97abfe48f Mon Sep 17 00:00:00 2001 From: Curtis Anderson <99758333+FileSystemGuy@users.noreply.github.com> Date: Tue, 26 May 2026 13:01:16 -0700 Subject: [PATCH 5/5] Delete .planning/HANDOFF.json --- .planning/HANDOFF.json | 76 ------------------------------------------ 1 file changed, 76 deletions(-) delete mode 100644 .planning/HANDOFF.json diff --git a/.planning/HANDOFF.json b/.planning/HANDOFF.json deleted file mode 100644 index 55809811..00000000 --- a/.planning/HANDOFF.json +++ /dev/null @@ -1,76 +0,0 @@ -{ - "version": "1.0", - "timestamp": "2026-05-26T19:52:28Z", - "phase": null, - "phase_name": "fork-after-mpi-init-fix", - "phase_dir": null, - "plan": null, - "task": null, - "total_tasks": null, - "status": "paused", - "context": "ad-hoc code review and surgical fix", - "branch": "FileSystemGuy-une3d-hang", - "completed_tasks": [ - { - "id": 1, - "name": "Review unet3d training support in current PR", - "status": "done", - "notes": "Reviewed all three files changed in dec4d42: utils.py (read1 fix), cluster_collector.py (gather-always fix), test_cluster_collector.py (6 new tests)" - }, - { - "id": 2, - "name": "Identify remaining hang cause: fork()-after-MPI-init", - "status": "done", - "notes": "Root cause: mpirun->DLIO rank (MPI_Init) -> DataLoader fork() -> orphan workers hold pipe write-end -> TextIOWrapper.read() blocks indefinitely after mpirun exits" - }, - { - "id": 3, - "name": "Apply surgical fix to CommandExecutor.execute() drain section", - "status": "done", - "commit": "uncommitted", - "notes": "Replaced two blocking .read() calls with select()+read1() loop using 0.5s timeout. 176 unit tests pass." - } - ], - "remaining_tasks": [ - { - "id": 4, - "name": "Commit the fix", - "status": "not_started", - "notes": "mlpstorage_py/utils.py has the change staged but not committed" - }, - { - "id": 5, - "name": "Optionally: address double cluster collection in DLIOBenchmark.__init__ vs base.Benchmark.run()", - "status": "not_started", - "notes": "Both call _collect_cluster_information() redundantly. Not a hang, just wasted time. Out of scope for this PR per user's late-release constraint." - } - ], - "blockers": [], - "human_actions_pending": [ - { - "action": "Review and commit the utils.py fix", - "context": "One uncommitted file: mlpstorage_py/utils.py with the select-based drain replacing blocking .read() calls", - "blocking": false - } - ], - "decisions": [ - { - "decision": "Use select()+read1() with 0.5s timeout instead of blocking .read()", - "rationale": "When pipe write-end is closed (normal case) select() returns immediately on EOF — zero latency. When orphaned grandchildren hold it open, times out after 0.5s and un-sticks the orchestrator.", - "phase": "n/a" - }, - { - "decision": "Do NOT add process-group management (start_new_session=True + os.killpg)", - "rationale": "More invasive, changes mpirun's session/signal handling. The drain timeout achieves the same unblock with minimal surface area.", - "phase": "n/a" - }, - { - "decision": "Keep double cluster collection (DLIOBenchmark.__init__ + base.Benchmark.run()) out of scope", - "rationale": "User is late in release cycle; only fix minimum required. Double collection is a redundancy/perf issue, not a hang.", - "phase": "n/a" - } - ], - "uncommitted_files": ["mlpstorage_py/utils.py"], - "next_action": "git add mlpstorage_py/utils.py && git commit with message describing the fork()-after-MPI-init fix", - "context_notes": "This is on branch FileSystemGuy-une3d-hang. The PR already has two fixes in dec4d42 (read1 for readline hang, gather-always for MPI deadlock). This session added a third fix for the pipe drain hang caused by orphaned DataLoader workers. All 176 runnable unit tests pass." -}