diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py index 990404e1c..e7186096b 100644 --- a/helion/autotuner/base_search.py +++ b/helion/autotuner/base_search.py @@ -2,6 +2,7 @@ import abc import collections +from collections.abc import Iterator import contextlib import dataclasses import datetime @@ -435,6 +436,7 @@ def __init__(self, kernel: _AutotunableKernel, args: Sequence[object]) -> None: self._precompile_tmpdir: tempfile.TemporaryDirectory[str] | None = None self._precompile_args_path: str | None = None self._precompile_result_counter = count() + self._bad_config_strs: set[str] = set() def _prepare(self) -> None: """Some initialization deferred until autotuning actually runs. @@ -531,9 +533,50 @@ def _try_load_checkpoint(self) -> bool: # load_state_dict validates required keys and raises CheckpointError for issues self.load_state_dict(state) + # Load bad configs (from subprocess crash recovery) + self._load_bad_configs() + self.log(f"Resumed at generation {self._current_generation}") return True + def _load_bad_configs(self) -> None: + """Load bad configs from _bad_configs.txt file.""" + from .subprocess_runner import load_bad_configs + + checkpoint_dir_str = self.settings.autotune_checkpoint_dir + if checkpoint_dir_str is not None: + bad_configs_path = os.path.join(checkpoint_dir_str, "_bad_configs.txt") + self._bad_config_strs |= load_bad_configs(bad_configs_path) + + if self._bad_config_strs: + self.log( + f"Loaded {len(self._bad_config_strs)} bad config(s) to skip", + ) + + @contextlib.contextmanager + def _pending_config(self, config: Config) -> Iterator[None]: + """Context manager that writes the pending-config breadcrumb on entry + and removes it on exit. + + If the body raises TritonUnrecoverableRuntimeError the pending file + is intentionally *not* cleared so the external crash-recovery script + can detect it. 
+ """ + from .subprocess_runner import clear_pending, write_pending + + checkpoint_dir_str = self.settings.autotune_checkpoint_dir + if checkpoint_dir_str is None: + yield + return + write_pending(checkpoint_dir_str, str(config)) + try: + yield + except exc.TritonUnrecoverableRuntimeError: + # Let the pending file survive for the bash crash-recovery script + raise + else: + clear_pending(checkpoint_dir_str) + def _compute_baseline( self, ) -> tuple[object, Sequence[int], Sequence[object] | None]: @@ -752,6 +795,12 @@ def benchmark_function(self, config: Config, fn: CompiledConfig) -> float: Returns: The performance of the configuration in ms. """ + # Skip configs that previously crashed the subprocess + config_str = str(config) + if config_str in self._bad_config_strs: + self.log.warning(f"Skipping known-bad config: {config}") + return inf + self._autotune_metrics.num_configs_tested += 1 self.counters["benchmark"] += 1 self.log.debug(lambda: f"Running benchmark for {config!r}") @@ -1089,7 +1138,8 @@ def _benchmark( ) ) # benchmark one-by-one to avoid noisy results - perf = self.benchmark_function(config, fn) + with self._pending_config(config): + perf = self.benchmark_function(config, fn) status = "ok" if math.isfinite(perf) else "error" # Log completion after benchmarking self.log.record_autotune_entry( @@ -1193,6 +1243,8 @@ def autotune(self, *, skip_cache: bool = False) -> Config: exit_stack.callback(self.cleanup) if not self._try_load_checkpoint(): + # Load bad configs even on fresh starts (subprocess recovery) + self._load_bad_configs() self._init_search() try: best = self._autotune() @@ -1296,6 +1348,11 @@ def _cleanup_checkpoint(self) -> None: checkpoint_file.unlink() self.log(f"Checkpoint cleaned up: {checkpoint_file}") + # Clean up subprocess recovery artifacts + from .subprocess_runner import cleanup_subprocess_artifacts + + cleanup_subprocess_artifacts(checkpoint_dir_str) + @staticmethod def _serialize_numpy_rng_state( state: tuple[str, Any, int, int, 
float], diff --git a/helion/autotuner/subprocess_runner.py b/helion/autotuner/subprocess_runner.py new file mode 100644 index 000000000..e8ad2b1c5 --- /dev/null +++ b/helion/autotuner/subprocess_runner.py @@ -0,0 +1,60 @@ +"""File I/O helpers for autotuner crash recovery. + +The crash recovery protocol works with an external retry loop +(scripts/autotune_with_crash_recovery.sh). Before benchmarking each +config, the autotuner writes its string representation to a pending +file. If the process crashes (e.g. CUDA illegal memory access), the +pending file survives and the external retry loop records it as a bad +config. On re-run, the autotuner loads the checkpoint + bad configs +and skips the poison config. +""" + +from __future__ import annotations + +import os +from pathlib import Path + +_PENDING_FILENAME = "_pending_config.txt" +_BAD_CONFIGS_FILENAME = "_bad_configs.txt" + + +def write_pending(checkpoint_dir: str, config_str: str) -> None: + """Write the config being benchmarked to the pending file.""" + pending_path = Path(checkpoint_dir) / _PENDING_FILENAME + pending_path.write_text(config_str) + + +def clear_pending(checkpoint_dir: str) -> None: + """Remove the pending file after benchmark completes.""" + pending_path = Path(checkpoint_dir) / _PENDING_FILENAME + if pending_path.exists(): + pending_path.unlink() + + +def load_bad_configs(bad_configs_path: str) -> set[str]: + """Load bad config strings from file, one per line.""" + path = Path(bad_configs_path) + if not path.exists(): + return set() + lines = path.read_text().splitlines() + return {line.strip() for line in lines if line.strip()} + + +def _append_bad_config(bad_configs_path: str, config_str: str) -> None: + """Append a bad config string to the bad configs file.""" + with open(bad_configs_path, "a") as f: + f.write(config_str + "\n") + f.flush() + os.fsync(f.fileno()) + + +def cleanup_subprocess_artifacts(checkpoint_dir: str) -> None: + """Remove crash-recovery files in the checkpoint 
#!/usr/bin/env bash
# Autotuner crash recovery wrapper.
#
# Runs a command (typically a Python script that calls helion autotuning)
# in a retry loop. When the process crashes due to an unrecoverable CUDA
# error (illegal memory access, misaligned address, etc.), the autotuner
# leaves a "_pending_config.txt" breadcrumb in the checkpoint directory.
# This script detects that file, records the poison config in
# "_bad_configs.txt", and re-runs the command. On re-run the autotuner
# loads its checkpoint and skips the bad config.
#
# Progress detection:
#   Each crash should block a different config (since blocked configs are
#   skipped on re-run). If the same config crashes twice, the autotuner
#   is stuck and we give up.
#
# Requirements:
#   - HELION_AUTOTUNE_CHECKPOINT_DIR must be set
#
# Usage:
#   HELION_AUTOTUNE_CHECKPOINT_DIR=/tmp/ckpt \
#     scripts/autotune_with_crash_recovery.sh -- COMMAND [ARGS...]
#
# Examples:
#   HELION_AUTOTUNE_CHECKPOINT_DIR=/tmp/autotune_ckpt \
#     scripts/autotune_with_crash_recovery.sh -- python train.py

# Deliberately no `set -e`: the wrapped command is expected to fail and we
# inspect its exit code ourselves.
set -uo pipefail

# --- Argument parsing ---
usage() {
    cat >&2 <<'EOF'
Usage: HELION_AUTOTUNE_CHECKPOINT_DIR=/path/to/dir \
       autotune_with_crash_recovery.sh -- COMMAND [ARGS...]
EOF
    exit "${1:-1}"
}

while [[ $# -gt 0 ]]; do
    case "$1" in
        -h|--help)
            usage 0
            ;;
        --)
            shift
            break
            ;;
        *)
            echo "Error: unknown option '$1'" >&2
            usage 1
            ;;
    esac
done

if [[ $# -eq 0 ]]; then
    echo "Error: no command specified after --" >&2
    usage 1
fi

if [[ -z "${HELION_AUTOTUNE_CHECKPOINT_DIR:-}" ]]; then
    echo "Error: HELION_AUTOTUNE_CHECKPOINT_DIR must be set." >&2
    exit 1
fi

# --- Setup ---
checkpoint_dir="$HELION_AUTOTUNE_CHECKPOINT_DIR"
mkdir -p "$checkpoint_dir"

pending_file="$checkpoint_dir/_pending_config.txt"
bad_configs_file="$checkpoint_dir/_bad_configs.txt"

# --- Retry loop ---
attempt=0
last_config=""

while true; do
    attempt=$((attempt + 1))

    # Run the user command and capture its exit status manually.
    "$@"
    exit_code=$?

    if [[ $exit_code -eq 0 ]]; then
        exit 0
    fi

    # A surviving pending file means the autotuner died mid-benchmark.
    if [[ -f "$pending_file" ]]; then
        config=$(cat "$pending_file")
        rm -f "$pending_file"
        echo "$config" >> "$bad_configs_file"

        echo "[crash-recovery] Process crashed (exit code $exit_code, attempt $attempt)." >&2
        echo "[crash-recovery] Blocked config: $config" >&2

        # If the same config crashed again, the bad config is not being
        # skipped — the autotuner is stuck.
        if [[ "$config" == "$last_config" ]]; then
            echo "[crash-recovery] Same config crashed twice — the autotuner appears stuck." >&2
            echo "[crash-recovery] All bad configs have been recorded. You can re-run this script and it will resume from the latest checkpoint, skipping all previously recorded bad configs." >&2
            exit 1
        fi
        last_config="$config"

        echo "[crash-recovery] Restarting from checkpoint..." >&2
    else
        # No pending file — this is not a recoverable CUDA crash.
        # Propagate the original exit code.
        exit "$exit_code"
    fi
done
+ exit "$exit_code" + fi +done diff --git a/test/data/autotune_crash_helper.py b/test/data/autotune_crash_helper.py new file mode 100644 index 000000000..2a29083e1 --- /dev/null +++ b/test/data/autotune_crash_helper.py @@ -0,0 +1,67 @@ +"""Helper script for bash crash recovery tests. + +Run via: + HELION_AUTOTUNE_CHECKPOINT_DIR=DIR \ + scripts/autotune_with_crash_recovery.sh -- python test/data/autotune_crash_helper.py + +On first run (when _CRASH_ON_FIRST_BENCHMARK is set and no counter file +exists): patches do_bench to trigger a real CUDA illegal memory access, +which exercises the real _pending_config context manager and +TritonUnrecoverableRuntimeError code path. On subsequent runs: autotuning +resumes from checkpoint normally, skipping the bad config. + +Without _CRASH_ON_FIRST_BENCHMARK: runs autotuning normally (used to test +that the bash script passes through a successful run). +""" + +from __future__ import annotations + +import os +from pathlib import Path + +import torch + +checkpoint_dir = os.environ["HELION_AUTOTUNE_CHECKPOINT_DIR"] +crash_on_first = os.environ.get("_CRASH_ON_FIRST_BENCHMARK", "") +counter_file = Path(checkpoint_dir) / "_benchmark_counter" + +if crash_on_first and not counter_file.exists(): + import triton + import triton.language as tl + + import helion.autotuner.base_search as _bs + + @triton.jit + def _ima_kernel(ptr): + """Triton kernel that triggers illegal memory access.""" + bad_ptr = ptr + (1 << 40) + tl.store(bad_ptr, tl.full([], 42.0, dtype=tl.float32)) + + _original_do_bench = _bs.do_bench + + def _ima_do_bench(*args, **kwargs): # type: ignore[no-untyped-def] + counter_file.write_text("done") + # Restore original so this only fires once + _bs.do_bench = _original_do_bench + # Trigger real CUDA illegal memory access + x = torch.zeros(1, device="cuda") + _ima_kernel[(1,)](x) + torch.cuda.synchronize() + # Should not reach here — IMA raises an exception + return _original_do_bench(*args, **kwargs) + + _bs.do_bench = 
_ima_do_bench + +# Import and run real autotuning +from helion._testing import import_path # noqa: E402 + +datadir = Path(__file__).parent +basic_kernels = import_path(datadir / "basic_kernels.py") + +args = (torch.randn([8, 32], device="cuda"), torch.randn([8, 32], device="cuda")) +bound = basic_kernels.add.bind(args) +bound.settings.autotune_checkpoint_dir = checkpoint_dir +bound.settings.autotune_effort = "quick" +config = bound.autotune(args, force=True) +result = bound(*args) +torch.testing.assert_close(result, args[0] + args[1]) diff --git a/test/test_autotuner_subprocess.py b/test/test_autotuner_subprocess.py new file mode 100644 index 000000000..ec04491c3 --- /dev/null +++ b/test/test_autotuner_subprocess.py @@ -0,0 +1,230 @@ +from __future__ import annotations + +import os +from pathlib import Path +import subprocess +import tempfile + +import pytest + +import helion +from helion.autotuner.logger import match_unrecoverable_runtime_error +from helion.autotuner.subprocess_runner import _append_bad_config +from helion.autotuner.subprocess_runner import cleanup_subprocess_artifacts +from helion.autotuner.subprocess_runner import clear_pending +from helion.autotuner.subprocess_runner import load_bad_configs +from helion.autotuner.subprocess_runner import write_pending + + +class TestErrorStringMatching: + """Test match_unrecoverable_runtime_error with bare payload substrings.""" + + @pytest.mark.parametrize( + ("msg", "expected"), + [ + ("illegal memory access", True), + ("an illegal memory access was encountered", True), + ("misaligned address", True), + ("unspecified launch failure", True), + ("illegal instruction", True), + ("ILLEGAL MEMORY ACCESS", True), # case insensitive + ("Misaligned Address", True), # case insensitive + ("out of memory", False), + ("CUDA error: out of memory", False), + ("segfault", False), + ("", False), + ], + ) + def test_match(self, msg: str, expected: bool) -> None: + err = RuntimeError(msg) + assert 
class TestErrorStringMatching:
    """Test match_unrecoverable_runtime_error with bare payload substrings."""

    @pytest.mark.parametrize(
        ("msg", "expected"),
        [
            ("illegal memory access", True),
            ("an illegal memory access was encountered", True),
            ("misaligned address", True),
            ("unspecified launch failure", True),
            ("illegal instruction", True),
            ("ILLEGAL MEMORY ACCESS", True),  # case insensitive
            ("Misaligned Address", True),  # case insensitive
            ("out of memory", False),
            ("CUDA error: out of memory", False),
            ("segfault", False),
            ("", False),
        ],
    )
    def test_match(self, msg: str, expected: bool) -> None:
        err = RuntimeError(msg)
        assert match_unrecoverable_runtime_error(err) == expected


class TestPendingFileIO:
    """Test pending file write/clear lifecycle."""

    def test_write_and_clear(self, tmp_path: Path) -> None:
        config_str = "Config(block_sizes=[32], num_warps=4)"

        write_pending(str(tmp_path), config_str)
        breadcrumb = tmp_path / "_pending_config.txt"
        assert breadcrumb.exists()
        assert breadcrumb.read_text() == config_str

        clear_pending(str(tmp_path))
        assert not breadcrumb.exists()

    def test_clear_nonexistent(self, tmp_path: Path) -> None:
        # Clearing when no pending file exists must not raise.
        clear_pending(str(tmp_path))


class TestBadConfigFileIO:
    """Test bad-configs file read/write helpers."""

    def test_load_empty(self, tmp_path: Path) -> None:
        # A missing file loads as an empty set.
        assert load_bad_configs(str(tmp_path / "test.bad_configs")) == set()

    def test_append_and_load(self, tmp_path: Path) -> None:
        bad_path = str(tmp_path / "test.bad_configs")
        first = "Config(block_sizes=[32], num_warps=4)"
        second = "Config(block_sizes=[64], num_warps=8)"

        _append_bad_config(bad_path, first)
        _append_bad_config(bad_path, second)

        loaded = load_bad_configs(bad_path)
        assert loaded == {first, second}

    def test_config_str_deterministic(self) -> None:
        """Config.__str__() produces sorted, deterministic output."""
        # Same settings in different construction order must stringify equally,
        # otherwise the bad-config matching by string would miss duplicates.
        c1 = helion.Config(block_sizes=[32], num_warps=4, num_stages=2)
        c2 = helion.Config(num_stages=2, num_warps=4, block_sizes=[32])
        assert str(c1) == str(c2)


class TestCleanupArtifacts:
    """Test cleanup_subprocess_artifacts removes crash-recovery files."""

    def test_cleanup(self, tmp_path: Path) -> None:
        for name in ("_pending_config.txt", "_bad_configs.txt"):
            (tmp_path / name).write_text("test")

        cleanup_subprocess_artifacts(str(tmp_path))

        assert not (tmp_path / "_pending_config.txt").exists()
        assert not (tmp_path / "_bad_configs.txt").exists()


SCRIPT = str(
    Path(__file__).parent.parent / "scripts" / "autotune_with_crash_recovery.sh"
)
HELPER = str(Path(__file__).parent / "data" / "autotune_crash_helper.py")
class TestBashCrashRecoveryScript:
    """Tests for scripts/autotune_with_crash_recovery.sh.

    These invoke the bash script via subprocess.run(). The crash recovery
    test uses test/data/autotune_crash_helper.py which monkey-patches
    base_search.do_bench to crash on the first call, exercising the real
    pending-file and checkpoint code paths.
    """

    def _run_script(
        self,
        tmp_path: Path,
        cmd: list[str],
        extra_env: dict[str, str] | None = None,
    ) -> subprocess.CompletedProcess[str]:
        """Helper to run the bash script with HELION_AUTOTUNE_CHECKPOINT_DIR set."""
        env = {**os.environ, "HELION_AUTOTUNE_CHECKPOINT_DIR": str(tmp_path)}
        if extra_env:
            env.update(extra_env)
        return subprocess.run(
            [SCRIPT, "--"] + cmd, capture_output=True, text=True, env=env
        )

    def test_normal_exit(self, tmp_path: Path) -> None:
        """Successful command passes through exit 0."""
        r = self._run_script(tmp_path, ["python", "-c", "pass"])
        assert r.returncode == 0

    def test_no_pending_propagates_error(self, tmp_path: Path) -> None:
        """Non-CUDA crash (no pending file) propagates exit code."""
        r = self._run_script(
            tmp_path, ["python", "-c", "import sys; sys.exit(42)"]
        )
        assert r.returncode == 42

    def test_crash_with_pending_recovery(self, tmp_path: Path) -> None:
        """Pending file detected by bash script, bad config recorded, re-run succeeds."""
        counter = tmp_path / "_run_counter"
        # First run: write pending via real write_pending() + exit(1)
        # Second run: succeed
        cmd = (
            "import sys, os; "
            "from pathlib import Path; "
            "from helion.autotuner.subprocess_runner import write_pending; "
            f"counter = Path('{counter}'); "
            "run = int(counter.read_text()) if counter.exists() else 0; "
            "counter.write_text(str(run + 1)); "
            f"write_pending('{tmp_path}', 'Config(bad=True)') if run == 0 else None; "
            "sys.exit(1) if run == 0 else None"
        )
        r = self._run_script(tmp_path, ["python", "-c", cmd])
        assert r.returncode == 0
        # Bad config was recorded by bash script
        bad = (tmp_path / "_bad_configs.txt").read_text()
        assert "Config(bad=True)" in bad
        # Script ran twice
        assert counter.read_text() == "2"

    def test_same_config_gives_up(self, tmp_path: Path) -> None:
        """Script gives up when the same config crashes twice."""
        cmd = (
            "from helion.autotuner.subprocess_runner import write_pending; "
            f"write_pending('{tmp_path}', 'Config(always_bad=True)'); "
            "import os; os._exit(1)"
        )
        r = self._run_script(tmp_path, ["python", "-c", cmd])
        assert r.returncode != 0
        assert "appears stuck" in r.stderr
        bad_lines = (tmp_path / "_bad_configs.txt").read_text().strip().splitlines()
        assert len(bad_lines) == 2

    def test_different_configs_keep_retrying(self, tmp_path: Path) -> None:
        """Crashes on different configs keep retrying (not stuck)."""
        counter = tmp_path / "_run_counter"
        # Runs 1-5: write DIFFERENT pending config each time + crash
        # Run 6: succeed
        cmd = (
            "import sys, os; "
            "from pathlib import Path; "
            "from helion.autotuner.subprocess_runner import write_pending; "
            f"counter = Path('{counter}'); "
            "run = int(counter.read_text()) if counter.exists() else 0; "
            "counter.write_text(str(run + 1)); "
            f"write_pending('{tmp_path}', f'Config(bad={{run}})') if run < 5 else None; "
            "sys.exit(1) if run < 5 else None"
        )
        r = self._run_script(tmp_path, ["python", "-c", cmd])
        assert r.returncode == 0
        assert counter.read_text() == "6"

    def test_real_autotune_through_bash(self) -> None:
        """End-to-end: real autotuning succeeds through the bash script."""
        with tempfile.TemporaryDirectory() as tmpdir:
            r = self._run_script(
                Path(tmpdir),
                ["python", HELPER],
                extra_env={
                    "HELION_AUTOTUNE_MAX_GENERATIONS": "1",
                    "HELION_AUTOTUNER": "PatternSearch",
                },
            )
            assert r.returncode == 0, f"stderr: {r.stderr}"

    def test_real_crash_recovery_through_bash(self) -> None:
        """End-to-end: first run crashes during real benchmarking via
        monkey-patch, bash script detects pending file, records bad config,
        re-runs. Second run resumes from checkpoint and succeeds."""
        with tempfile.TemporaryDirectory() as tmpdir:
            r = self._run_script(
                Path(tmpdir),
                ["python", HELPER],
                extra_env={
                    "_CRASH_ON_FIRST_BENCHMARK": "1",
                    "HELION_AUTOTUNE_MAX_GENERATIONS": "1",
                    "HELION_AUTOTUNER": "PatternSearch",
                },
            )
            assert r.returncode == 0, f"stderr: {r.stderr}"
            # Verify crash recovery happened (bad_configs.txt is cleaned
            # up by _cleanup_checkpoint on success, so check stderr)
            assert "[crash-recovery]" in r.stderr
            assert "Blocked config:" in r.stderr