From 7dac951f2b816a784a5f69bc96525dc3f5d62e26 Mon Sep 17 00:00:00 2001
From: Jongsok Choi <jongsok.choi@gmail.com>
Date: Sat, 11 Apr 2026 20:22:36 -0700
Subject: [PATCH] [Autotuner] Add LLM-seeded hybrid search

stack-info: PR: https://github.com/pytorch/helion/pull/2004, branch: choijon5/stack/4
---
 helion/autotuner/__init__.py          |   4 +
 helion/autotuner/base_search.py       |  32 +-
 helion/autotuner/block_id_sequence.py |  11 +
 helion/autotuner/config_generation.py |  13 +-
 helion/autotuner/config_spec.py       |  30 +-
 helion/autotuner/llm_seeded_lfbo.py   | 429 ++++++++++++++++++++++++++
 test/test_autotuner.py                | 235 ++++++++++++++
 test/test_best_available.py           |  21 ++
 8 files changed, 761 insertions(+), 14 deletions(-)
 create mode 100644 helion/autotuner/llm_seeded_lfbo.py

diff --git a/helion/autotuner/__init__.py b/helion/autotuner/__init__.py
index 19d9268e7..e08933e6e 100644
--- a/helion/autotuner/__init__.py
+++ b/helion/autotuner/__init__.py
@@ -22,6 +22,8 @@
 from .external import autotune as autotune
 from .finite_search import FiniteSearch as FiniteSearch
 from .llm_search import LLMGuidedSearch as LLMGuidedSearch
+from .llm_seeded_lfbo import LLMSeededLFBOTreeSearch as LLMSeededLFBOTreeSearch
+from .llm_seeded_lfbo import LLMSeededSearch as LLMSeededSearch
 from .local_cache import LocalAutotuneCache as LocalAutotuneCache
 from .local_cache import StrictLocalAutotuneCache as StrictLocalAutotuneCache
 from .pattern_search import InitialPopulationStrategy as InitialPopulationStrategy
@@ -38,6 +40,8 @@
     "LFBOPatternSearch": LFBOPatternSearch,
     "LFBOTreeSearch": LFBOTreeSearch,
     "LLMGuidedSearch": LLMGuidedSearch,
+    "LLMSeededSearch": LLMSeededSearch,
+    "LLMSeededLFBOTreeSearch": LLMSeededLFBOTreeSearch,
     "DifferentialEvolutionSearch": DifferentialEvolutionSearch,
     "FiniteSearch": FiniteSearch,
     "PatternSearch": PatternSearch,
diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py
index fdb052ec0..09a593323 100644
--- a/helion/autotuner/base_search.py
+++ b/helion/autotuner/base_search.py
@@ -698,6 +698,7 @@ def __init__(
         super().__init__(kernel, args)
         self.finishing_rounds = finishing_rounds
         self.population: list[PopulationMember] = []
+        self._best_available_seed_configs: list[Config] = []
         self.config_gen: ConfigGeneration = self.config_spec.create_config_generation(
             overrides=self.settings.autotune_config_overrides or None,
             advanced_controls_files=self.settings.autotune_search_acf or None,
@@ -856,15 +857,20 @@ def _find_similar_cached_configs(self, max_configs: int) -> list[SavedBestConfig
 
     def _generate_best_available_population_flat(self) -> list[FlatConfig]:
         """
-        Generate initial population using default config plus cached configs.
+        Generate initial population using default config, explicit seed configs,
+        and cached configs.
 
         Always starts with the default configuration, then adds up to
         MAX_BEST_AVAILABLE_CONFIGS matching cached configs from previous runs.
-        No random configs are added.  Duplicate configs are discarded.
+        Explicit seed configs provided by the caller are added ahead of cached
+        configs and are not suppressed by cache-skip settings. No random configs
+        are added. Duplicate configs are discarded.
 
         Returns:
             A list of unique FlatConfig values for the initial population.
-            Minimum size is 1 (just default), maximum is 1 + autotune_best_available_max_configs setting.
+            Minimum size is 1 (just default), plus any valid unique explicit
+            seed configs and up to autotune_best_available_max_configs cached
+            configs.
         """
         # Always start with the default config
         default_flat = self.config_gen.default_flat()
@@ -873,6 +879,16 @@ def _generate_best_available_population_flat(self) -> list[FlatConfig]:
         result: list[FlatConfig] = [default_flat]
         self.log("Starting with default config")
 
+        for config in self._best_available_seed_configs:
+            try:
+                flat = self.config_gen.flatten(config)
+                transferred_config = self.config_gen.unflatten(flat)
+                if transferred_config not in seen:
+                    seen.add(transferred_config)
+                    result.append(flat)
+            except (ValueError, TypeError, KeyError, AssertionError) as e:
+                self.log(f"Failed to transfer explicit seed config: {e}")
+
         max_configs = self.settings.autotune_best_available_max_configs
         cached_entries = self._find_similar_cached_configs(max_configs)
 
@@ -905,12 +921,16 @@ def _generate_best_available_population_flat(self) -> list[FlatConfig]:
         if duplicates > 0:
             self.log.debug(f"Discarded {duplicates} duplicate config(s)")
 
-        self.log(
-            f"Initial population: 1 default + {len(result) - 1} unique cached = {len(result)} total"
-        )
+        self.log(f"Initial population: {len(result)} total")
 
         return result
 
+    def set_best_available_seed_configs(self, configs: Sequence[Config]) -> None:
+        """Replace the explicit seeds for the best-available initial population."""
+        # Snapshot the sequence so later caller-side mutation cannot alter the
+        # seeds read by _generate_best_available_population_flat().
+        self._best_available_seed_configs = list(configs)
+
     def parallel_benchmark_population(
         self, members: list[PopulationMember], *, desc: str = "Benchmarking"
     ) -> list[PopulationMember]:
diff --git a/helion/autotuner/block_id_sequence.py b/helion/autotuner/block_id_sequence.py
index 8da053f7d..623617a54 100644
--- a/helion/autotuner/block_id_sequence.py
+++ b/helion/autotuner/block_id_sequence.py
@@ -46,6 +46,17 @@ def _flat_config(
     ) -> object:
         return fn(self._fragment(base))
 
+    def _encode_flat_value(self, base: ConfigSpec, value: object) -> object:
+        """Encode a normalized Config value into its flat-slot representation.
+
+        Most specs store the same value in Config and FlatConfig, so the
+        default implementation is the identity. ReductionLoopSpec is the
+        only override today: it normalizes persistent reductions to None in
+        Config, but FlatConfig stores that choice as an integer sentinel.
+        """
+        del base
+        return value
+
 
 _BlockIdItemT = TypeVar("_BlockIdItemT", bound=_BlockIdItem)
 
diff --git a/helion/autotuner/config_generation.py b/helion/autotuner/config_generation.py
index b74191999..b12a923cd 100644
--- a/helion/autotuner/config_generation.py
+++ b/helion/autotuner/config_generation.py
@@ -9,6 +9,7 @@
 from typing import cast
 
 from .._compat import warps_to_threads
+from .block_id_sequence import BlockIdSequence
 from .config_fragment import Category
 from .config_fragment import ConfigSpecFragment
 from .config_fragment import PowerOfTwoFragment
@@ -117,14 +118,22 @@ def _apply_overrides(self, config: Config) -> Config:
     def flatten(self, config: Config) -> FlatConfig:
         """Inverse of unflatten: convert a Config to a FlatConfig."""
         result = self.default_flat()
+        flat_fields = self.config_spec._flat_fields()
         for key, (indices, is_sequence) in self._key_to_flat_indices.items():
             if key not in config.config:
                 continue
             value = config.config[key]
             if is_sequence:
                 assert isinstance(value, list)
-                for idx, v in zip(indices, value, strict=True):
-                    result[idx] = v
+                field = flat_fields[key]
+                assert isinstance(field, BlockIdSequence)
+                # Sequence specs can normalize values in Config differently
+                # from how they are stored in FlatConfig. Only
+                # ReductionLoopSpec overrides this today, but keep the dispatch
+                # on the spec so flatten() remains the generic inverse of
+                # unflatten().
+                for idx, spec, v in zip(indices, field, value, strict=True):
+                    result[idx] = spec._encode_flat_value(self.config_spec, v)
             else:
                 assert len(indices) == 1
                 result[indices[0]] = value
diff --git a/helion/autotuner/config_spec.py b/helion/autotuner/config_spec.py
index 526b6211c..26808c8e8 100644
--- a/helion/autotuner/config_spec.py
+++ b/helion/autotuner/config_spec.py
@@ -1027,9 +1027,10 @@ def __init__(
         super().__init__([block_id])
         self.size_hint = size_hint
 
-    def _flat_config(
-        self, base: ConfigSpec, fn: Callable[[ConfigSpecFragment], object]
-    ) -> int | None:
+    def _flat_fragment(self, base: ConfigSpec) -> BlockSizeFragment:
+        # Shared by both directions:
+        # - unflatten: flat integer -> Config value via _flat_config()
+        # - flatten: Config value -> flat integer via _encode_flat_value()
         low = 8  # TODO(jansel): is smaller needed?
         high = next_power_of_2(max(low, self.size_hint))
         default = min(high, 4096)
@@ -1038,16 +1039,33 @@ def _flat_config(
         if base.max_reduction_threads is not None:
             if self.size_hint > base.max_reduction_threads:
                 default = min(default, base.max_reduction_threads)
-        value = fn(BlockSizeFragment(low, high, default))
+        return BlockSizeFragment(low, high, default)
+
+    def _flat_config(
+        self, base: ConfigSpec, fn: Callable[[ConfigSpecFragment], object]
+    ) -> int | None:
+        fragment = self._flat_fragment(base)
+        value = fn(fragment)
         assert isinstance(value, int)
-        if not (low <= value <= high):
+        if not (fragment.low <= value <= fragment.high):
             raise InvalidConfig(
-                f"Invalid value for reduction loop {low} <= {value} <= {high}"
+                "Invalid value for reduction loop "
+                f"{fragment.low} <= {value} <= {fragment.high}"
             )
         if value >= self.size_hint:
             return None  # max size becomes persistent reduction
         return value
 
+    def _encode_flat_value(self, base: ConfigSpec, value: object) -> object:
+        # None means "persistent reduction" in the normalized Config, and
+        # _flat_config() only maps a flat integer back to None when it is
+        # >= size_hint. The fragment default can be smaller than size_hint
+        # (it is capped at 4096 and by max_reduction_threads), so encode None
+        # as the fragment's upper bound to keep the round trip exact.
+        if value is None:
+            return self._flat_fragment(base).high
+        return value
+
     def _normalize(self, name: str, value: object) -> int | None:
         if value is None:
             return None
diff --git a/helion/autotuner/llm_seeded_lfbo.py b/helion/autotuner/llm_seeded_lfbo.py
new file mode 100644
index 000000000..08a4c0e9a
--- /dev/null
+++ b/helion/autotuner/llm_seeded_lfbo.py
@@ -0,0 +1,429 @@
+"""Seed a second-stage autotuner with configs from an LLM search pass."""
+
+from __future__ import annotations
+
+import inspect
+import math
+import os
+import time
+from typing import TYPE_CHECKING
+from typing import Any
+from typing import cast
+
+from .base_search import BaseSearch
+from .effort_profile import PATTERN_SEARCH_DEFAULTS
+from .effort_profile import QUICK_LLM_SEARCH_DEFAULTS
+from .llm.transport import DEFAULT_REQUEST_TIMEOUT_S
+from .llm_search import LLMGuidedSearch
+from .llm_search import guided_search_kwargs_from_config
+from .pattern_search import InitialPopulationStrategy
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from ..runtime.config import Config
+    from ..runtime.settings import Settings
+    from .base_search import _AutotunableKernel
+    from .effort_profile import AutotuneEffortProfile
+
+
+_DISALLOWED_SECOND_STAGE_ALGORITHMS = {
+    "LLMGuidedSearch",
+    "LLMSeededSearch",
+    "LLMSeededLFBOTreeSearch",
+}
+
+
+def _parse_env_bool(value: str) -> bool:
+    """Parse an env-var bool: empty, 0, false, no, and off (any case) are False."""
+    return value.strip().lower() not in {"", "0", "false", "no", "off"}
+
+
+def _resolve_second_stage_algorithm(name: str) -> type[BaseSearch]:
+    """Return the registered non-LLM stage-2 search class; raise ValueError otherwise."""
+    from . import search_algorithms  # deferred: the package imports this module
+    search_cls = search_algorithms.get(name)
+    if search_cls is None:
+        valid = search_algorithms.keys() - _DISALLOWED_SECOND_STAGE_ALGORITHMS
+        raise ValueError(
+            f"Unknown hybrid second-stage algorithm: {name}. "
+            f"Valid options are: {', '.join(sorted(valid))}"
+        )
+    if name in _DISALLOWED_SECOND_STAGE_ALGORITHMS:
+        raise ValueError(
+            f"Invalid hybrid second-stage algorithm: {name}. "
+            "The second stage must be a non-LLM search algorithm."
+        )
+    return search_cls
+
+
+def _supports_init_parameter(search_cls: type[BaseSearch], name: str) -> bool:
+    """Return whether ``search_cls.__init__`` declares a parameter named ``name``."""
+    return name in inspect.signature(search_cls.__init__).parameters
+
+
+class LLMSeededSearch(BaseSearch):
+    """
+    Generic hybrid autotuner that seeds a second-stage search with LLM proposals.
+
+    The algorithm runs in two stages:
+    1. Run ``LLMGuidedSearch`` for ``llm_max_rounds`` rounds and capture its best
+       config in memory.
+    2. Run the configured second-stage search algorithm. If the algorithm
+       supports ``initial_population_strategy``, it is switched to
+       ``FROM_BEST_AVAILABLE`` so it can start from the LLM seed config.
+
+    Setting ``llm_max_rounds=0`` disables the seed stage and runs only the
+    second-stage search.
+    """
+
+    default_second_stage_algorithm = "LFBOTreeSearch"
+    allow_second_stage_env_override = True
+    hybrid_stage_breakdown: dict[str, object] | None
+
+    def __init__(
+        self,
+        kernel: _AutotunableKernel,
+        args: Sequence[object],
+        *,
+        second_stage_algorithm: str | None = None,
+        second_stage_kwargs: dict[str, object] | None = None,
+        best_available_pad_random: bool = False,
+        llm_provider: str | None = None,
+        llm_model: str = QUICK_LLM_SEARCH_DEFAULTS.model,
+        llm_configs_per_round: int = QUICK_LLM_SEARCH_DEFAULTS.configs_per_round,
+        llm_max_rounds: int = QUICK_LLM_SEARCH_DEFAULTS.max_rounds,
+        llm_initial_random_configs: int = QUICK_LLM_SEARCH_DEFAULTS.initial_random_configs,
+        llm_api_base: str | None = None,
+        llm_api_key: str | None = None,
+        llm_max_output_tokens: int | None = None,
+        llm_request_timeout_s: float = DEFAULT_REQUEST_TIMEOUT_S,
+        llm_compile_timeout_s: int | None = QUICK_LLM_SEARCH_DEFAULTS.compile_timeout_s,
+    ) -> None:
+        super().__init__(kernel, args)
+        if llm_max_rounds < 0:
+            raise ValueError("LLMSeededSearch llm_max_rounds must be >= 0")
+        self.second_stage_algorithm = (
+            second_stage_algorithm or type(self).default_second_stage_algorithm
+        )
+        _resolve_second_stage_algorithm(self.second_stage_algorithm)
+        self.second_stage_kwargs = dict(second_stage_kwargs or {})
+        self.best_available_pad_random = best_available_pad_random
+
+        self.llm_provider = llm_provider
+        self.llm_model = llm_model
+        self.llm_configs_per_round = llm_configs_per_round
+        self.llm_max_rounds = llm_max_rounds
+        self.llm_initial_random_configs = llm_initial_random_configs
+        self.llm_api_base = llm_api_base
+        self.llm_api_key = llm_api_key
+        self.llm_max_output_tokens = llm_max_output_tokens
+        self.llm_request_timeout_s = llm_request_timeout_s
+        self.llm_compile_timeout_s = llm_compile_timeout_s
+
+        self.hybrid_stage_breakdown = None
+
+    @classmethod
+    def _get_default_second_stage_algorithm(cls) -> str:
+        """Pick the stage-2 algorithm name, honoring the env override if allowed."""
+        # Subclasses pinned to one algorithm clear allow_second_stage_env_override
+        # so HELION_HYBRID_SECOND_STAGE_ALGORITHM is never consulted for them.
+        if not cls.allow_second_stage_env_override:
+            return cls.default_second_stage_algorithm
+        return os.environ.get(
+            "HELION_HYBRID_SECOND_STAGE_ALGORITHM", cls.default_second_stage_algorithm
+        )
+
+    @classmethod
+    def get_kwargs_from_profile(
+        cls, profile: AutotuneEffortProfile, settings: Settings
+    ) -> dict[str, object]:
+        """Combine shared LLM defaults with the chosen second-stage profile."""
+        second_stage_algorithm = cls._get_default_second_stage_algorithm()
+        second_stage_cls = _resolve_second_stage_algorithm(second_stage_algorithm)
+
+        # The hybrid uses a quick LLM seed stage by default, even under full effort.
+        guided_kwargs = guided_search_kwargs_from_config(
+            QUICK_LLM_SEARCH_DEFAULTS, settings
+        )
+        llm_kwargs: dict[str, object] = {
+            f"llm_{k}": v for k, v in guided_kwargs.items()
+        }
+
+        kwargs = {
+            **super().get_kwargs_from_profile(profile, settings),
+            "second_stage_algorithm": second_stage_algorithm,
+            "second_stage_kwargs": second_stage_cls.get_kwargs_from_profile(
+                profile, settings
+            ),
+            **llm_kwargs,
+            "best_available_pad_random": False,
+        }
+
+        if (value := os.environ.get("HELION_HYBRID_LLM_MAX_ROUNDS")) is not None:
+            kwargs["llm_max_rounds"] = int(value)
+        if (
+            value := os.environ.get("HELION_HYBRID_BEST_AVAILABLE_PAD_RANDOM")
+        ) is not None:
+            kwargs["best_available_pad_random"] = _parse_env_bool(value)
+        return kwargs
+
+    def _make_llm_search(self) -> LLMGuidedSearch:
+        """Construct the stage-1 guided search from llm_* settings."""
+        return LLMGuidedSearch(
+            self.kernel,
+            self.args,
+            finishing_rounds=0,
+            provider=self.llm_provider,
+            model=self.llm_model,
+            configs_per_round=self.llm_configs_per_round,
+            max_rounds=self.llm_max_rounds,
+            initial_random_configs=self.llm_initial_random_configs,
+            api_base=self.llm_api_base,
+            api_key=self.llm_api_key,
+            max_output_tokens=self.llm_max_output_tokens,
+            request_timeout_s=self.llm_request_timeout_s,
+            compile_timeout_s=self.llm_compile_timeout_s,
+        )
+
+    def _make_second_stage_search(self, *, seeded: bool) -> BaseSearch:
+        """Build the stage-2 search, requesting best-available seeding if possible."""
+        second_stage_cls = _resolve_second_stage_algorithm(self.second_stage_algorithm)
+        kwargs = dict(self.second_stage_kwargs)
+
+        def accepts(param: str) -> bool:
+            return _supports_init_parameter(second_stage_cls, param)
+
+        # Only searches that take initial_population_strategy can be told to
+        # start from the seed; for any other search we just log a warning.
+        if seeded and not accepts("initial_population_strategy"):
+            self.log(
+                f"Second-stage algorithm {self.second_stage_algorithm} "
+                "does not support FROM_BEST_AVAILABLE initialization; "
+                "the LLM seed may not influence the next stage."
+            )
+        elif seeded:
+            kwargs["initial_population_strategy"] = (
+                InitialPopulationStrategy.FROM_BEST_AVAILABLE
+            )
+            if accepts("best_available_pad_random"):
+                kwargs["best_available_pad_random"] = self.best_available_pad_random
+
+        return cast(
+            "BaseSearch",
+            cast("Any", second_stage_cls)(self.kernel, self.args, **kwargs),
+        )
+
+    def _inject_seed_into_second_stage(
+        self,
+        second_stage_search: BaseSearch,
+        llm_seed_config: Config,
+    ) -> None:
+        """Give stage 2 the best LLM config when it exposes the seed setter."""
+        hook = getattr(second_stage_search, "set_best_available_seed_configs", None)
+        # Searches lacking the setter are left untouched.
+        if hook is not None:
+            hook([llm_seed_config])
+
+    def _finalize_stage_metrics(
+        self,
+        llm_search: LLMGuidedSearch | None,
+        llm_seed_config: Config | None,
+        llm_wall_time: float,
+        second_stage_search: BaseSearch,
+        second_stage_wall_time: float,
+    ) -> None:
+        """Merge per-stage timing and autotune metrics into the hybrid summary."""
+
+        def _finite_perf(search: BaseSearch | None) -> float | None:
+            if search is None or not math.isfinite(search.best_perf_so_far):
+                return None
+            return search.best_perf_so_far
+
+        llm_metrics = llm_search._autotune_metrics if llm_search else None
+        second_stage_metrics = second_stage_search._autotune_metrics
+        second_stage_tested = second_stage_metrics.num_configs_tested
+
+        self.hybrid_stage_breakdown = {
+            "used_llm_seed": llm_search is not None,
+            "llm_seed_perf_ms": _finite_perf(llm_search),
+            "llm_seed_time_s": llm_wall_time,
+            "llm_seed_configs_tested": (
+                llm_metrics.num_configs_tested if llm_metrics else 0
+            ),
+            "llm_seed_config": (
+                dict(llm_seed_config) if llm_seed_config is not None else None
+            ),
+            "second_stage_algorithm": self.second_stage_algorithm,
+            "second_stage_perf_ms": _finite_perf(second_stage_search),
+            "second_stage_time_s": second_stage_wall_time,
+            "second_stage_configs_tested": second_stage_tested,
+        }
+        if self.second_stage_algorithm == "LFBOTreeSearch":
+            self.hybrid_stage_breakdown.update(
+                {
+                    "lfbo_stage_perf_ms": _finite_perf(second_stage_search),
+                    "lfbo_stage_time_s": second_stage_wall_time,
+                    "lfbo_stage_configs_tested": second_stage_tested,
+                }
+            )
+
+        # Aggregate metrics from both stages
+        for field in (
+            "num_configs_tested",
+            "num_compile_failures",
+            "num_accuracy_failures",
+            "num_generations",
+        ):
+            setattr(
+                self._autotune_metrics,
+                field,
+                (getattr(llm_metrics, field) if llm_metrics else 0)
+                + getattr(second_stage_metrics, field),
+            )
+
+        candidate_best = [
+            stage.best_perf_so_far
+            for stage in (llm_search, second_stage_search)
+            if stage is not None and math.isfinite(stage.best_perf_so_far)
+        ]
+        self.best_perf_so_far = min(candidate_best) if candidate_best else math.inf
+
+    def _autotune(self) -> Config:
+        """Run the optional LLM seed stage, then the configured second stage."""
+        self.log(
+            f"Starting {type(self).__name__} with "
+            f"second_stage_algorithm={self.second_stage_algorithm}, "
+            f"llm_max_rounds={self.llm_max_rounds}, "
+            f"llm_configs_per_round={self.llm_configs_per_round}, "
+            f"best_available_pad_random={self.best_available_pad_random}"
+        )
+
+        # Stage 1: LLM seed search
+        llm_search: LLMGuidedSearch | None = None
+        llm_seed_config: Config | None = None
+        llm_wall_time = 0.0
+
+        if self.llm_max_rounds > 0:
+            self.log(
+                "Hybrid stage 1/2: "
+                f"LLMGuidedSearch for {self.llm_max_rounds} round(s) "
+                f"with {self.llm_configs_per_round} configs/round"
+            )
+            llm_search = self._make_llm_search()
+            llm_start = time.perf_counter()
+            llm_seed_config = llm_search.autotune(skip_cache=True)
+            llm_wall_time = time.perf_counter() - llm_start
+
+        # Stage 2: second-stage search (optionally seeded)
+        seeded = llm_seed_config is not None
+        self.log(
+            "Hybrid stage 2/2: "
+            + (
+                f"running {self.second_stage_algorithm} from best available seed"
+                if seeded
+                else f"running {self.second_stage_algorithm} without LLM seed"
+            )
+        )
+        second_stage_search = self._make_second_stage_search(seeded=seeded)
+        if llm_seed_config is not None:
+            self._inject_seed_into_second_stage(second_stage_search, llm_seed_config)
+        second_stage_start = time.perf_counter()
+        best_config = second_stage_search.autotune()
+        second_stage_wall_time = time.perf_counter() - second_stage_start
+
+        self._finalize_stage_metrics(
+            llm_search,
+            llm_seed_config,
+            llm_wall_time,
+            second_stage_search,
+            second_stage_wall_time,
+        )
+        return best_config
+
+
+class LLMSeededLFBOTreeSearch(LLMSeededSearch):
+    """Convenience wrapper for the common LLM-seeded LFBO tree search pipeline."""
+
+    allow_second_stage_env_override = False
+
+    @classmethod
+    def get_kwargs_from_profile(
+        cls, profile: AutotuneEffortProfile, settings: Settings
+    ) -> dict[str, object]:
+        """Return the hybrid profile kwargs minus ``second_stage_algorithm``."""
+        # __init__ here hard-codes LFBOTreeSearch and takes no such keyword.
+        inherited = super().get_kwargs_from_profile(profile, settings)
+        return {k: v for k, v in inherited.items() if k != "second_stage_algorithm"}
+
+    def __init__(
+        self,
+        kernel: _AutotunableKernel,
+        args: Sequence[object],
+        *,
+        second_stage_kwargs: dict[str, object] | None = None,
+        num_neighbors: int = 200,
+        frac_selected: float = 0.10,
+        radius: int = 2,
+        initial_population: int = PATTERN_SEARCH_DEFAULTS.initial_population,
+        copies: int = PATTERN_SEARCH_DEFAULTS.copies,
+        max_generations: int = PATTERN_SEARCH_DEFAULTS.max_generations,
+        min_improvement_delta: float = 0.001,
+        quantile: float = 0.1,
+        patience: int = 1,
+        similarity_penalty: float = 1.0,
+        initial_population_strategy: InitialPopulationStrategy | None = None,
+        best_available_pad_random: bool = False,
+        finishing_rounds: int = 0,
+        compile_timeout_lower_bound: float = PATTERN_SEARCH_DEFAULTS.compile_timeout_lower_bound,
+        compile_timeout_quantile: float = PATTERN_SEARCH_DEFAULTS.compile_timeout_quantile,
+        llm_provider: str | None = None,
+        llm_model: str = QUICK_LLM_SEARCH_DEFAULTS.model,
+        llm_configs_per_round: int = QUICK_LLM_SEARCH_DEFAULTS.configs_per_round,
+        llm_max_rounds: int = QUICK_LLM_SEARCH_DEFAULTS.max_rounds,
+        llm_initial_random_configs: int = QUICK_LLM_SEARCH_DEFAULTS.initial_random_configs,
+        llm_api_base: str | None = None,
+        llm_api_key: str | None = None,
+        llm_max_output_tokens: int | None = None,
+        llm_request_timeout_s: float = DEFAULT_REQUEST_TIMEOUT_S,
+        llm_compile_timeout_s: int | None = QUICK_LLM_SEARCH_DEFAULTS.compile_timeout_s,
+    ) -> None:
+        # Build LFBO second-stage kwargs from individual params or passthrough
+        computed_second_stage_kwargs: dict[str, object]
+        if second_stage_kwargs is not None:
+            computed_second_stage_kwargs = dict(second_stage_kwargs)
+        else:
+            computed_second_stage_kwargs = {
+                "num_neighbors": num_neighbors,
+                "frac_selected": frac_selected,
+                "radius": radius,
+                "initial_population": initial_population,
+                "copies": copies,
+                "max_generations": max_generations,
+                "min_improvement_delta": min_improvement_delta,
+                "quantile": quantile,
+                "patience": patience,
+                "similarity_penalty": similarity_penalty,
+                "initial_population_strategy": initial_population_strategy,
+                "finishing_rounds": finishing_rounds,
+                "compile_timeout_lower_bound": compile_timeout_lower_bound,
+                "compile_timeout_quantile": compile_timeout_quantile,
+            }
+
+        super().__init__(
+            kernel,
+            args,
+            second_stage_algorithm="LFBOTreeSearch",
+            second_stage_kwargs=computed_second_stage_kwargs,
+            best_available_pad_random=best_available_pad_random,
+            llm_provider=llm_provider,
+            llm_model=llm_model,
+            llm_configs_per_round=llm_configs_per_round,
+            llm_max_rounds=llm_max_rounds,
+            llm_initial_random_configs=llm_initial_random_configs,
+            llm_api_base=llm_api_base,
+            llm_api_key=llm_api_key,
+            llm_max_output_tokens=llm_max_output_tokens,
+            llm_request_timeout_s=llm_request_timeout_s,
+            llm_compile_timeout_s=llm_compile_timeout_s,
+        )
diff --git a/test/test_autotuner.py b/test/test_autotuner.py
index 8ece4d86a..9911c1f39 100644
--- a/test/test_autotuner.py
+++ b/test/test_autotuner.py
@@ -61,6 +61,7 @@
 from helion.autotuner.local_cache import StrictLocalAutotuneCache
 from helion.autotuner.logger import AutotuneLogEntry
 from helion.autotuner.logger import AutotuningLogger
+from helion.autotuner.metrics import AutotuneMetrics
 from helion.autotuner.random_search import RandomSearch
 import helion.language as hl
 from helion.language import loops
@@ -2977,5 +2978,239 @@ def fake_post_json_with_response(
                 case["request_assertions"](captured)
 
 
+class TestLLMSeededLFBOTreeSearch(TestCase):
+    """Tests for the two-stage LLM-seeded hybrid autotuner (LLM seed stage, then a second-stage search)."""
+
+    def test_profile_kwargs_and_env_overrides(self):
+        """Hybrid profile wiring forwards shared LLM settings and hybrid env overrides."""
+        from helion.autotuner import LLMSeededLFBOTreeSearch
+        from helion.autotuner import LLMSeededSearch
+
+        kwargs = LLMSeededLFBOTreeSearch.get_kwargs_from_profile(
+            get_effort_profile("full"), Settings()
+        )
+        self.assertEqual(kwargs["llm_model"], "gpt-5-2")
+        self.assertEqual(kwargs["llm_configs_per_round"], 15)
+        self.assertEqual(kwargs["llm_max_rounds"], 1)  # value before the env override below
+        self.assertEqual(kwargs["llm_initial_random_configs"], 10)
+        self.assertEqual(kwargs["llm_compile_timeout_s"], 15)
+        self.assertFalse(kwargs["best_available_pad_random"])
+
+        with patch.dict(
+            os.environ,
+            {"HELION_HYBRID_SECOND_STAGE_ALGORITHM": "PatternSearch"},
+            clear=False,  # keep the rest of the environment in place
+        ):
+            generic_kwargs = LLMSeededSearch.get_kwargs_from_profile(
+                get_effort_profile("full"), Settings()
+            )
+        self.assertEqual(generic_kwargs["second_stage_algorithm"], "PatternSearch")
+        self.assertIn("max_generations", generic_kwargs["second_stage_kwargs"])
+
+        kernel = SimpleNamespace(  # minimal stand-in: only settings and config_spec
+            settings=Settings(),
+            config_spec=SimpleNamespace(),
+        )
+        with patch.dict(
+            os.environ,
+            {
+                "HELION_HYBRID_LLM_MAX_ROUNDS": "2",
+                "HELION_LLM_PROVIDER": "openai",
+            },
+            clear=False,
+        ):
+            kwargs = LLMSeededLFBOTreeSearch.get_kwargs_from_profile(
+                get_effort_profile("full"), Settings()
+            )
+        self.assertEqual(kwargs["llm_max_rounds"], 2)
+        self.assertEqual(kwargs["llm_provider"], "openai")
+
+        search = LLMSeededLFBOTreeSearch(kernel, (), **kwargs)  # built after the env patch exited
+        self.assertEqual(search.llm_provider, "openai")
+
+    def test_selected_by_env(self):
+        """HELION_AUTOTUNER selects the hybrid autotuner and applies profile defaults."""
+        from helion.autotuner import LLMSeededLFBOTreeSearch
+
+        args = (
+            torch.randn([8, 32], device=DEVICE),
+            torch.randn([8, 32], device=DEVICE),
+        )
+
+        with patch.dict(os.environ, {"HELION_AUTOTUNER": "LLMSeededLFBOTreeSearch"}):
+
+            @helion.kernel(autotune_effort="full")
+            def add(a, b):
+                out = torch.empty_like(a)
+                for tile in hl.tile(out.size()):
+                    out[tile] = a[tile] + b[tile]
+                return out
+
+            bound = add.bind(args)
+            autotuner = bound.settings.autotuner_fn(bound, args)
+            self.assertIsInstance(autotuner.autotuner, LLMSeededLFBOTreeSearch)
+            self.assertEqual(autotuner.autotuner.llm_max_rounds, 1)
+            self.assertFalse(autotuner.autotuner.best_available_pad_random)
+
+    def test_handoff_runs_llm_then_lfbo(self):
+        """The hybrid flow runs LLM seeding first, injects that seed into LFBO, and sums both stages' metrics."""
+        from helion.autotuner import InitialPopulationStrategy
+        from helion.autotuner import LLMSeededLFBOTreeSearch
+        from helion.runtime.config import Config
+
+        llm_instances = []
+        lfbo_instances = []
+
+        class FakeBenchmarkProvider:  # stub: only records its constructor kwargs
+            def __init__(self, **kwargs) -> None:
+                self.kwargs = kwargs
+
+        class FakeLLMSearch:  # stand-in for LLMGuidedSearch (patched in below)
+            def __init__(self, kernel, args, **kwargs) -> None:
+                self.kernel = kernel
+                self.args = args
+                self.kwargs = kwargs
+                self.best_perf_so_far = 0.9
+                self._autotune_metrics = AutotuneMetrics(
+                    num_configs_tested=7,
+                    num_compile_failures=1,
+                    num_accuracy_failures=2,
+                    num_generations=3,
+                )
+                llm_instances.append(self)
+
+            def autotune(self, *, skip_cache=False):
+                self.skip_cache = skip_cache  # recorded so the test can assert on it
+                return Config(num_warps=4)
+
+        class FakeLFBOSearch:  # returned by the patched _resolve_second_stage_algorithm
+            def __init__(
+                self,
+                kernel,
+                args,
+                *,
+                initial_population_strategy=None,
+                best_available_pad_random=True,
+                **kwargs,
+            ) -> None:
+                self.kernel = kernel
+                self.args = args
+                self.kwargs = {
+                    **kwargs,
+                    "initial_population_strategy": initial_population_strategy,
+                    "best_available_pad_random": best_available_pad_random,
+                }
+                self.best_perf_so_far = 0.5
+                self._autotune_metrics = AutotuneMetrics(
+                    num_configs_tested=11,
+                    num_compile_failures=3,
+                    num_accuracy_failures=5,
+                    num_generations=6,
+                )
+                self.seed_configs = None
+                lfbo_instances.append(self)
+
+            def set_best_available_seed_configs(self, configs):
+                self.seed_configs = list(configs)
+
+            def autotune(self):
+                return Config(num_warps=8)
+
+        kernel = SimpleNamespace(
+            settings=Settings(),
+            config_spec=SimpleNamespace(),
+            env=SimpleNamespace(device=DEVICE, process_group_name=None),
+        )
+        args = (torch.randn([8], device=DEVICE),)
+        search = LLMSeededLFBOTreeSearch(kernel, args, llm_max_rounds=2)
+        search._benchmark_provider_cls = FakeBenchmarkProvider
+        search._prepare()  # expected to instantiate the provider class set above
+        self.assertIsInstance(search.benchmark_provider, FakeBenchmarkProvider)
+
+        with (
+            patch("helion.autotuner.llm_seeded_lfbo.LLMGuidedSearch", FakeLLMSearch),
+            patch(
+                "helion.autotuner.llm_seeded_lfbo._resolve_second_stage_algorithm",
+                return_value=FakeLFBOSearch,
+            ),
+        ):
+            best = search._autotune()
+
+        self.assertEqual(best["num_warps"], 8)  # the second-stage result wins
+        self.assertEqual(llm_instances[0].kwargs["max_rounds"], 2)  # llm_max_rounds forwarded
+        self.assertTrue(llm_instances[0].skip_cache)
+        self.assertEqual(
+            lfbo_instances[0].kwargs["initial_population_strategy"],
+            InitialPopulationStrategy.FROM_BEST_AVAILABLE,
+        )
+        self.assertEqual(lfbo_instances[0].seed_configs, [Config(num_warps=4)])  # LLM stage output
+        self.assertEqual(search._autotune_metrics.num_configs_tested, 18)  # 7 + 11
+        self.assertEqual(search._autotune_metrics.num_compile_failures, 4)  # 1 + 3
+        self.assertEqual(search._autotune_metrics.num_accuracy_failures, 7)  # 2 + 5
+        self.assertEqual(search._autotune_metrics.num_generations, 9)  # 3 + 6
+        self.assertEqual(search.hybrid_stage_breakdown["llm_seed_configs_tested"], 7)
+        self.assertEqual(search.hybrid_stage_breakdown["lfbo_stage_configs_tested"], 11)
+
+    def test_zero_llm_rounds_falls_back_to_lfbo_strategy(self):
+        """Disabling LLM rounds skips stage 1 and leaves the second-stage strategy unchanged."""
+        from helion.autotuner import InitialPopulationStrategy
+        from helion.autotuner import LLMSeededLFBOTreeSearch
+        from helion.runtime.config import Config
+
+        lfbo_instances = []
+
+        class FakeBenchmarkProvider:  # stub: only records its constructor kwargs
+            def __init__(self, **kwargs) -> None:
+                self.kwargs = kwargs
+
+        class FailIfLLMConstructed:  # patched in for LLMGuidedSearch; any construction fails the test
+            def __init__(self, *args, **kwargs) -> None:
+                raise AssertionError("LLM seed stage should be skipped")
+
+        class FakeLFBOSearch:  # returned by the patched _resolve_second_stage_algorithm
+            def __init__(self, kernel, args, **kwargs) -> None:
+                self.kwargs = kwargs
+                self.best_perf_so_far = 0.4
+                self._autotune_metrics = AutotuneMetrics(num_configs_tested=3)
+                lfbo_instances.append(self)
+
+            def autotune(self):
+                return Config(num_warps=16)
+
+        kernel = SimpleNamespace(
+            settings=Settings(),
+            config_spec=SimpleNamespace(),
+            env=SimpleNamespace(device=DEVICE, process_group_name=None),
+        )
+        args = (torch.randn([8], device=DEVICE),)
+        search = LLMSeededLFBOTreeSearch(
+            kernel,
+            args,
+            llm_max_rounds=0,  # zero rounds: stage 1 must not run
+            initial_population_strategy=InitialPopulationStrategy.FROM_RANDOM,
+        )
+        search._benchmark_provider_cls = FakeBenchmarkProvider
+        search._prepare()
+
+        with (
+            patch(
+                "helion.autotuner.llm_seeded_lfbo.LLMGuidedSearch",
+                FailIfLLMConstructed,
+            ),
+            patch(
+                "helion.autotuner.llm_seeded_lfbo._resolve_second_stage_algorithm",
+                return_value=FakeLFBOSearch,
+            ),
+        ):
+            best = search._autotune()
+
+        self.assertEqual(best["num_warps"], 16)
+        self.assertEqual(
+            lfbo_instances[0].kwargs["initial_population_strategy"],
+            InitialPopulationStrategy.FROM_RANDOM,  # the caller's strategy is passed through
+        )
+        self.assertFalse(search.hybrid_stage_breakdown["used_llm_seed"])
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/test/test_best_available.py b/test/test_best_available.py
index 956fb86db..0261d3c53 100644
--- a/test/test_best_available.py
+++ b/test/test_best_available.py
@@ -303,6 +303,27 @@ def test_flatten_multiple_reduction_loops(self):
         re_flat = config_gen.flatten(restored)
         self.assertEqual(re_flat, flat)
 
+    def test_flatten_persistent_reduction_loop_roundtrip(self):
+        """Persistent reductions normalize to None in the config but must flatten back to the flat sentinel."""
+        config_spec = ConfigSpec(backend=TritonBackend())
+        config_spec.block_sizes.append(
+            BlockSizeSpec(block_id=0, size_hint=64, min_size=16, max_size=256)
+        )
+        config_spec.reduction_loops.append(ReductionLoopSpec(block_id=1, size_hint=128))
+
+        config_gen = ConfigGeneration(config_spec)
+        default_flat = config_gen.default_flat()
+        rl_indices, rl_is_seq = config_gen._key_to_flat_indices["reduction_loops"]
+        self.assertTrue(rl_is_seq)
+        self.assertEqual(len(rl_indices), 1)  # one reduction loop was registered above
+        self.assertEqual(default_flat[rl_indices[0]], 128)  # flat default equals size_hint
+
+        config = config_gen.unflatten(default_flat)
+        self.assertEqual(config.config["reduction_loops"], [None])  # unflatten yields None
+
+        roundtripped = config_gen.flatten(config)
+        self.assertEqual(roundtripped, default_flat)  # flatten restores 128 from None
+
 
 class TestCacheMatching(unittest.TestCase):
     """Tests for cache file matching in warm start."""