diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py index 990404e1c..e7186096b 100644 --- a/helion/autotuner/base_search.py +++ b/helion/autotuner/base_search.py @@ -2,6 +2,7 @@ import abc import collections +from collections.abc import Iterator import contextlib import dataclasses import datetime @@ -435,6 +436,7 @@ def __init__(self, kernel: _AutotunableKernel, args: Sequence[object]) -> None: self._precompile_tmpdir: tempfile.TemporaryDirectory[str] | None = None self._precompile_args_path: str | None = None self._precompile_result_counter = count() + self._bad_config_strs: set[str] = set() def _prepare(self) -> None: """Some initialization deferred until autotuning actually runs. @@ -531,9 +533,50 @@ def _try_load_checkpoint(self) -> bool: # load_state_dict validates required keys and raises CheckpointError for issues self.load_state_dict(state) + # Load bad configs (from subprocess crash recovery) + self._load_bad_configs() + self.log(f"Resumed at generation {self._current_generation}") return True + def _load_bad_configs(self) -> None: + """Load bad configs from _bad_configs.txt file.""" + from .subprocess_runner import load_bad_configs + + checkpoint_dir_str = self.settings.autotune_checkpoint_dir + if checkpoint_dir_str is not None: + bad_configs_path = os.path.join(checkpoint_dir_str, "_bad_configs.txt") + self._bad_config_strs |= load_bad_configs(bad_configs_path) + + if self._bad_config_strs: + self.log( + f"Loaded {len(self._bad_config_strs)} bad config(s) to skip", + ) + + @contextlib.contextmanager + def _pending_config(self, config: Config) -> Iterator[None]: + """Context manager that writes the pending-config breadcrumb on entry + and removes it on exit. + + If the body raises TritonUnrecoverableRuntimeError the pending file + is intentionally *not* cleared so the external crash-recovery script + can detect it. 
+ """ + from .subprocess_runner import clear_pending, write_pending + + checkpoint_dir_str = self.settings.autotune_checkpoint_dir + if checkpoint_dir_str is None: + yield + return + write_pending(checkpoint_dir_str, str(config)) + try: + yield + except exc.TritonUnrecoverableRuntimeError: + # Let the pending file survive for the bash crash-recovery script + raise + else: + clear_pending(checkpoint_dir_str) + def _compute_baseline( self, ) -> tuple[object, Sequence[int], Sequence[object] | None]: @@ -752,6 +795,12 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float: Returns: The performance of the configuration in ms. """ + # Skip configs that previously crashed the subprocess + config_str = str(config) + if config_str in self._bad_config_strs: + self.log.warning(f"Skipping known-bad config: {config}") + return inf + self._autotune_metrics.num_configs_tested += 1 self.counters["benchmark"] += 1 self.log.debug(lambda: f"Running benchmark for {config!r}") @@ -1089,7 +1138,8 @@ def _benchmark( ) ) # benchmark one-by-one to avoid noisy results - perf = self.benchmark_function(config, fn) + with self._pending_config(config): + perf = self.benchmark_function(config, fn) status = "ok" if math.isfinite(perf) else "error" # Log completion after benchmarking self.log.record_autotune_entry( @@ -1193,6 +1243,8 @@ def autotune(self, *, skip_cache: bool = False) -> Config: exit_stack.callback(self.cleanup) if not self._try_load_checkpoint(): + # Load bad configs even on fresh starts (subprocess recovery) + self._load_bad_configs() self._init_search() try: best = self._autotune() @@ -1296,6 +1348,11 @@ def _cleanup_checkpoint(self) -> None: checkpoint_file.unlink() self.log(f"Checkpoint cleaned up: {checkpoint_file}") + # Clean up subprocess recovery artifacts + from .subprocess_runner import cleanup_subprocess_artifacts + + cleanup_subprocess_artifacts(checkpoint_dir_str) + @staticmethod def _serialize_numpy_rng_state( state: tuple[str, Any, int, int, 
float], diff --git a/helion/autotuner/subprocess_runner.py b/helion/autotuner/subprocess_runner.py new file mode 100644 index 000000000..e8ad2b1c5 --- /dev/null +++ b/helion/autotuner/subprocess_runner.py @@ -0,0 +1,60 @@ +"""File I/O helpers for autotuner crash recovery. + +The crash recovery protocol works with an external retry loop +(scripts/autotune_with_crash_recovery.sh). Before benchmarking each +config, the autotuner writes its string representation to a pending +file. If the process crashes (e.g. CUDA illegal memory access), the +pending file survives and the external retry loop records it as a bad +config. On re-run, the autotuner loads the checkpoint + bad configs +and skips the poison config. +""" + +from __future__ import annotations + +import os +from pathlib import Path + +_PENDING_FILENAME = "_pending_config.txt" +_BAD_CONFIGS_FILENAME = "_bad_configs.txt" + + +def write_pending(checkpoint_dir: str, config_str: str) -> None: + """Write the config being benchmarked to the pending file.""" + pending_path = Path(checkpoint_dir) / _PENDING_FILENAME + pending_path.write_text(config_str) + + +def clear_pending(checkpoint_dir: str) -> None: + """Remove the pending file after benchmark completes.""" + pending_path = Path(checkpoint_dir) / _PENDING_FILENAME + if pending_path.exists(): + pending_path.unlink() + + +def load_bad_configs(bad_configs_path: str) -> set[str]: + """Load bad config strings from file, one per line.""" + path = Path(bad_configs_path) + if not path.exists(): + return set() + lines = path.read_text().splitlines() + return {line.strip() for line in lines if line.strip()} + + +def _append_bad_config(bad_configs_path: str, config_str: str) -> None: + """Append a bad config string to the bad configs file.""" + with open(bad_configs_path, "a") as f: + f.write(config_str + "\n") + f.flush() + os.fsync(f.fileno()) + + +def cleanup_subprocess_artifacts(checkpoint_dir: str) -> None: + """Remove crash-recovery files in the checkpoint 
#!/usr/bin/env bash
# Autotuner crash recovery wrapper.
#
# Runs a command (typically a Python script that calls helion autotuning)
# in a retry loop. When the process crashes due to an unrecoverable CUDA
# error (illegal memory access, misaligned address, etc.), the autotuner
# leaves a "_pending_config.txt" breadcrumb in the checkpoint directory.
# This script detects that file, records the poison config in
# "_bad_configs.txt", and re-runs the command. On re-run the autotuner
# loads its checkpoint and skips the bad config.
#
# Progress detection:
#   Each crash should block a different config (since blocked configs are
#   skipped on re-run). If the same config crashes twice, the autotuner
#   is stuck and we give up.
#
# Requirements:
#   - HELION_AUTOTUNE_CHECKPOINT_DIR must be set
#
# Usage:
#   HELION_AUTOTUNE_CHECKPOINT_DIR=/tmp/ckpt \
#     scripts/autotune_with_crash_recovery.sh -- COMMAND [ARGS...]
#
# Examples:
#   HELION_AUTOTUNE_CHECKPOINT_DIR=/tmp/autotune_ckpt \
#     scripts/autotune_with_crash_recovery.sh -- python train.py

# Deliberately no `set -e`: the wrapped command is expected to fail and we
# inspect its exit code ourselves.
set -uo pipefail

# --- Argument parsing ---
usage() {
    cat >&2 <<'EOF'
Usage: HELION_AUTOTUNE_CHECKPOINT_DIR=/path/to/dir \
       autotune_with_crash_recovery.sh -- COMMAND [ARGS...]
EOF
    exit "${1:-1}"
}

while [[ $# -gt 0 ]]; do
    case "$1" in
        -h|--help)
            usage 0
            ;;
        --)
            shift
            break
            ;;
        *)
            echo "Error: unknown option '$1'" >&2
            usage 1
            ;;
    esac
done

if [[ $# -eq 0 ]]; then
    echo "Error: no command specified after --" >&2
    usage 1
fi

if [[ -z "${HELION_AUTOTUNE_CHECKPOINT_DIR:-}" ]]; then
    echo "Error: HELION_AUTOTUNE_CHECKPOINT_DIR must be set." >&2
    exit 1
fi

# --- Setup ---
checkpoint_dir="$HELION_AUTOTUNE_CHECKPOINT_DIR"
mkdir -p "$checkpoint_dir"

pending_file="$checkpoint_dir/_pending_config.txt"
bad_configs_file="$checkpoint_dir/_bad_configs.txt"

# --- Retry loop ---
attempt=0
last_config=""

while true; do
    attempt=$((attempt + 1))

    # Run the user command and capture its exit status manually.
    "$@"
    exit_code=$?

    if [[ $exit_code -eq 0 ]]; then
        exit 0
    fi

    # A surviving pending file means the autotuner died mid-benchmark.
    if [[ -f "$pending_file" ]]; then
        config=$(cat "$pending_file")
        rm -f "$pending_file"
        echo "$config" >> "$bad_configs_file"

        echo "[crash-recovery] Process crashed (exit code $exit_code, attempt $attempt)." >&2
        echo "[crash-recovery] Blocked config: $config" >&2

        # If the same config crashed again, the bad config is not being
        # skipped — the autotuner is stuck.
        if [[ "$config" == "$last_config" ]]; then
            echo "[crash-recovery] Same config crashed twice — the autotuner appears stuck." >&2
            echo "[crash-recovery] All bad configs have been recorded. You can re-run this script and it will resume from the latest checkpoint, skipping all previously recorded bad configs." >&2
            exit 1
        fi
        last_config="$config"

        echo "[crash-recovery] Restarting from checkpoint..." >&2
    else
        # No pending file — this is not a recoverable CUDA crash.
        # Propagate the original exit code.
        exit "$exit_code"
    fi
done
+ exit "$exit_code" + fi +done diff --git a/test/data/autotune_crash_helper.py b/test/data/autotune_crash_helper.py new file mode 100644 index 000000000..2a29083e1 --- /dev/null +++ b/test/data/autotune_crash_helper.py @@ -0,0 +1,67 @@ +"""Helper script for bash crash recovery tests. + +Run via: + HELION_AUTOTUNE_CHECKPOINT_DIR=DIR \ + scripts/autotune_with_crash_recovery.sh -- python test/data/autotune_crash_helper.py + +On first run (when _CRASH_ON_FIRST_BENCHMARK is set and no counter file +exists): patches do_bench to trigger a real CUDA illegal memory access, +which exercises the real _pending_config context manager and +TritonUnrecoverableRuntimeError code path. On subsequent runs: autotuning +resumes from checkpoint normally, skipping the bad config. + +Without _CRASH_ON_FIRST_BENCHMARK: runs autotuning normally (used to test +that the bash script passes through a successful run). +""" + +from __future__ import annotations + +import os +from pathlib import Path + +import torch + +checkpoint_dir = os.environ["HELION_AUTOTUNE_CHECKPOINT_DIR"] +crash_on_first = os.environ.get("_CRASH_ON_FIRST_BENCHMARK", "") +counter_file = Path(checkpoint_dir) / "_benchmark_counter" + +if crash_on_first and not counter_file.exists(): + import triton + import triton.language as tl + + import helion.autotuner.base_search as _bs + + @triton.jit + def _ima_kernel(ptr): + """Triton kernel that triggers illegal memory access.""" + bad_ptr = ptr + (1 << 40) + tl.store(bad_ptr, tl.full([], 42.0, dtype=tl.float32)) + + _original_do_bench = _bs.do_bench + + def _ima_do_bench(*args, **kwargs): # type: ignore[no-untyped-def] + counter_file.write_text("done") + # Restore original so this only fires once + _bs.do_bench = _original_do_bench + # Trigger real CUDA illegal memory access + x = torch.zeros(1, device="cuda") + _ima_kernel[(1,)](x) + torch.cuda.synchronize() + # Should not reach here — IMA raises an exception + return _original_do_bench(*args, **kwargs) + + _bs.do_bench = 
_ima_do_bench + +# Import and run real autotuning +from helion._testing import import_path # noqa: E402 + +datadir = Path(__file__).parent +basic_kernels = import_path(datadir / "basic_kernels.py") + +args = (torch.randn([8, 32], device="cuda"), torch.randn([8, 32], device="cuda")) +bound = basic_kernels.add.bind(args) +bound.settings.autotune_checkpoint_dir = checkpoint_dir +bound.settings.autotune_effort = "quick" +config = bound.autotune(args, force=True) +result = bound(*args) +torch.testing.assert_close(result, args[0] + args[1]) diff --git a/test/test_autotuner_subprocess.py b/test/test_autotuner_subprocess.py new file mode 100644 index 000000000..ec04491c3 --- /dev/null +++ b/test/test_autotuner_subprocess.py @@ -0,0 +1,230 @@ +from __future__ import annotations + +import os +from pathlib import Path +import subprocess +import tempfile + +import pytest + +import helion +from helion.autotuner.logger import match_unrecoverable_runtime_error +from helion.autotuner.subprocess_runner import _append_bad_config +from helion.autotuner.subprocess_runner import cleanup_subprocess_artifacts +from helion.autotuner.subprocess_runner import clear_pending +from helion.autotuner.subprocess_runner import load_bad_configs +from helion.autotuner.subprocess_runner import write_pending + + +class TestErrorStringMatching: + """Test match_unrecoverable_runtime_error with bare payload substrings.""" + + @pytest.mark.parametrize( + ("msg", "expected"), + [ + ("illegal memory access", True), + ("an illegal memory access was encountered", True), + ("misaligned address", True), + ("unspecified launch failure", True), + ("illegal instruction", True), + ("ILLEGAL MEMORY ACCESS", True), # case insensitive + ("Misaligned Address", True), # case insensitive + ("out of memory", False), + ("CUDA error: out of memory", False), + ("segfault", False), + ("", False), + ], + ) + def test_match(self, msg: str, expected: bool) -> None: + err = RuntimeError(msg) + assert 
class TestErrorStringMatching:
    """Test match_unrecoverable_runtime_error with bare payload substrings."""

    @pytest.mark.parametrize(
        ("msg", "expected"),
        [
            ("illegal memory access", True),
            ("an illegal memory access was encountered", True),
            ("misaligned address", True),
            ("unspecified launch failure", True),
            ("illegal instruction", True),
            ("ILLEGAL MEMORY ACCESS", True),  # case insensitive
            ("Misaligned Address", True),  # case insensitive
            ("out of memory", False),
            ("CUDA error: out of memory", False),
            ("segfault", False),
            ("", False),
        ],
    )
    def test_match(self, msg: str, expected: bool) -> None:
        err = RuntimeError(msg)
        assert match_unrecoverable_runtime_error(err) == expected


class TestPendingFileIO:
    """Test pending file write/clear lifecycle."""

    def test_write_and_clear(self, tmp_path: Path) -> None:
        config_str = "Config(block_sizes=[32], num_warps=4)"

        write_pending(str(tmp_path), config_str)
        breadcrumb = tmp_path / "_pending_config.txt"
        assert breadcrumb.exists()
        assert breadcrumb.read_text() == config_str

        clear_pending(str(tmp_path))
        assert not breadcrumb.exists()

    def test_clear_nonexistent(self, tmp_path: Path) -> None:
        # Clearing when no pending file exists must not raise.
        clear_pending(str(tmp_path))


class TestBadConfigFileIO:
    """Test bad-configs file read/write helpers."""

    def test_load_empty(self, tmp_path: Path) -> None:
        # A missing file loads as an empty set.
        assert load_bad_configs(str(tmp_path / "test.bad_configs")) == set()

    def test_append_and_load(self, tmp_path: Path) -> None:
        bad_path = str(tmp_path / "test.bad_configs")
        first = "Config(block_sizes=[32], num_warps=4)"
        second = "Config(block_sizes=[64], num_warps=8)"

        _append_bad_config(bad_path, first)
        _append_bad_config(bad_path, second)

        loaded = load_bad_configs(bad_path)
        assert loaded == {first, second}

    def test_config_str_deterministic(self) -> None:
        """Config.__str__() produces sorted, deterministic output."""
        # Same settings in different construction order must stringify equally,
        # otherwise the bad-config matching by string would miss duplicates.
        c1 = helion.Config(block_sizes=[32], num_warps=4, num_stages=2)
        c2 = helion.Config(num_stages=2, num_warps=4, block_sizes=[32])
        assert str(c1) == str(c2)


class TestCleanupArtifacts:
    """Test cleanup_subprocess_artifacts removes crash-recovery files."""

    def test_cleanup(self, tmp_path: Path) -> None:
        for name in ("_pending_config.txt", "_bad_configs.txt"):
            (tmp_path / name).write_text("test")

        cleanup_subprocess_artifacts(str(tmp_path))

        assert not (tmp_path / "_pending_config.txt").exists()
        assert not (tmp_path / "_bad_configs.txt").exists()


SCRIPT = str(
    Path(__file__).parent.parent / "scripts" / "autotune_with_crash_recovery.sh"
)
HELPER = str(Path(__file__).parent / "data" / "autotune_crash_helper.py")
class TestBashCrashRecoveryScript:
    """Tests for scripts/autotune_with_crash_recovery.sh.

    These invoke the bash script via subprocess.run(). The crash recovery
    test uses test/data/autotune_crash_helper.py which monkey-patches
    base_search.do_bench to crash on the first call, exercising the real
    pending-file and checkpoint code paths.
    """

    def _run_script(
        self,
        tmp_path: Path,
        cmd: list[str],
        extra_env: dict[str, str] | None = None,
    ) -> subprocess.CompletedProcess[str]:
        """Helper to run the bash script with HELION_AUTOTUNE_CHECKPOINT_DIR set."""
        env = {**os.environ, "HELION_AUTOTUNE_CHECKPOINT_DIR": str(tmp_path)}
        if extra_env:
            env.update(extra_env)
        return subprocess.run(
            [SCRIPT, "--"] + cmd, capture_output=True, text=True, env=env
        )

    def test_normal_exit(self, tmp_path: Path) -> None:
        """Successful command passes through exit 0."""
        r = self._run_script(tmp_path, ["python", "-c", "pass"])
        assert r.returncode == 0

    def test_no_pending_propagates_error(self, tmp_path: Path) -> None:
        """Non-CUDA crash (no pending file) propagates exit code."""
        r = self._run_script(
            tmp_path, ["python", "-c", "import sys; sys.exit(42)"]
        )
        assert r.returncode == 42

    def test_crash_with_pending_recovery(self, tmp_path: Path) -> None:
        """Pending file detected by bash script, bad config recorded, re-run succeeds."""
        counter = tmp_path / "_run_counter"
        # First run: write pending via real write_pending() + exit(1)
        # Second run: succeed
        cmd = (
            "import sys, os; "
            "from pathlib import Path; "
            "from helion.autotuner.subprocess_runner import write_pending; "
            f"counter = Path('{counter}'); "
            "run = int(counter.read_text()) if counter.exists() else 0; "
            "counter.write_text(str(run + 1)); "
            f"write_pending('{tmp_path}', 'Config(bad=True)') if run == 0 else None; "
            "sys.exit(1) if run == 0 else None"
        )
        r = self._run_script(tmp_path, ["python", "-c", cmd])
        assert r.returncode == 0
        # Bad config was recorded by bash script
        bad = (tmp_path / "_bad_configs.txt").read_text()
        assert "Config(bad=True)" in bad
        # Script ran twice
        assert counter.read_text() == "2"

    def test_same_config_gives_up(self, tmp_path: Path) -> None:
        """Script gives up when the same config crashes twice."""
        cmd = (
            "from helion.autotuner.subprocess_runner import write_pending; "
            f"write_pending('{tmp_path}', 'Config(always_bad=True)'); "
            "import os; os._exit(1)"
        )
        r = self._run_script(tmp_path, ["python", "-c", cmd])
        assert r.returncode != 0
        assert "appears stuck" in r.stderr
        bad_lines = (tmp_path / "_bad_configs.txt").read_text().strip().splitlines()
        assert len(bad_lines) == 2

    def test_different_configs_keep_retrying(self, tmp_path: Path) -> None:
        """Crashes on different configs keep retrying (not stuck)."""
        counter = tmp_path / "_run_counter"
        # Runs 1-5: write DIFFERENT pending config each time + crash
        # Run 6: succeed
        cmd = (
            "import sys, os; "
            "from pathlib import Path; "
            "from helion.autotuner.subprocess_runner import write_pending; "
            f"counter = Path('{counter}'); "
            "run = int(counter.read_text()) if counter.exists() else 0; "
            "counter.write_text(str(run + 1)); "
            f"write_pending('{tmp_path}', f'Config(bad={{run}})') if run < 5 else None; "
            "sys.exit(1) if run < 5 else None"
        )
        r = self._run_script(tmp_path, ["python", "-c", cmd])
        assert r.returncode == 0
        assert counter.read_text() == "6"

    def test_real_autotune_through_bash(self) -> None:
        """End-to-end: real autotuning succeeds through the bash script."""
        with tempfile.TemporaryDirectory() as tmpdir:
            r = self._run_script(
                Path(tmpdir),
                ["python", HELPER],
                extra_env={
                    "HELION_AUTOTUNE_MAX_GENERATIONS": "1",
                    "HELION_AUTOTUNER": "PatternSearch",
                },
            )
            assert r.returncode == 0, f"stderr: {r.stderr}"

    def test_real_crash_recovery_through_bash(self) -> None:
        """End-to-end: first run crashes during real benchmarking via
        monkey-patch, bash script detects pending file, records bad config,
        re-runs. Second run resumes from checkpoint and succeeds."""
        with tempfile.TemporaryDirectory() as tmpdir:
            r = self._run_script(
                Path(tmpdir),
                ["python", HELPER],
                extra_env={
                    "_CRASH_ON_FIRST_BENCHMARK": "1",
                    "HELION_AUTOTUNE_MAX_GENERATIONS": "1",
                    "HELION_AUTOTUNER": "PatternSearch",
                },
            )
            assert r.returncode == 0, f"stderr: {r.stderr}"
            # Verify crash recovery happened (bad_configs.txt is cleaned
            # up by _cleanup_checkpoint on success, so check stderr)
            assert "[crash-recovery]" in r.stderr
            assert "Blocked config:" in r.stderr