From 7dac951f2b816a784a5f69bc96525dc3f5d62e26 Mon Sep 17 00:00:00 2001 From: Jongsok Choi Date: Sat, 11 Apr 2026 20:22:36 -0700 Subject: [PATCH] [Autotuner] Add LLM-seeded hybrid search stack-info: PR: https://github.com/pytorch/helion/pull/2004, branch: choijon5/stack/4 --- helion/autotuner/__init__.py | 4 + helion/autotuner/base_search.py | 32 +- helion/autotuner/block_id_sequence.py | 11 + helion/autotuner/config_generation.py | 13 +- helion/autotuner/config_spec.py | 30 +- helion/autotuner/llm_seeded_lfbo.py | 429 ++++++++++++++++++++++++++ test/test_autotuner.py | 235 ++++++++++++++ test/test_best_available.py | 21 ++ 8 files changed, 761 insertions(+), 14 deletions(-) create mode 100644 helion/autotuner/llm_seeded_lfbo.py diff --git a/helion/autotuner/__init__.py b/helion/autotuner/__init__.py index 19d9268e7..e08933e6e 100644 --- a/helion/autotuner/__init__.py +++ b/helion/autotuner/__init__.py @@ -22,6 +22,8 @@ from .external import autotune as autotune from .finite_search import FiniteSearch as FiniteSearch from .llm_search import LLMGuidedSearch as LLMGuidedSearch +from .llm_seeded_lfbo import LLMSeededLFBOTreeSearch as LLMSeededLFBOTreeSearch +from .llm_seeded_lfbo import LLMSeededSearch as LLMSeededSearch from .local_cache import LocalAutotuneCache as LocalAutotuneCache from .local_cache import StrictLocalAutotuneCache as StrictLocalAutotuneCache from .pattern_search import InitialPopulationStrategy as InitialPopulationStrategy @@ -38,6 +40,8 @@ "LFBOPatternSearch": LFBOPatternSearch, "LFBOTreeSearch": LFBOTreeSearch, "LLMGuidedSearch": LLMGuidedSearch, + "LLMSeededSearch": LLMSeededSearch, + "LLMSeededLFBOTreeSearch": LLMSeededLFBOTreeSearch, "DifferentialEvolutionSearch": DifferentialEvolutionSearch, "FiniteSearch": FiniteSearch, "PatternSearch": PatternSearch, diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py index fdb052ec0..09a593323 100644 --- a/helion/autotuner/base_search.py +++ b/helion/autotuner/base_search.py @@ 
-698,6 +698,7 @@ def __init__( super().__init__(kernel, args) self.finishing_rounds = finishing_rounds self.population: list[PopulationMember] = [] + self._best_available_seed_configs: list[Config] = [] self.config_gen: ConfigGeneration = self.config_spec.create_config_generation( overrides=self.settings.autotune_config_overrides or None, advanced_controls_files=self.settings.autotune_search_acf or None, @@ -856,15 +857,20 @@ def _find_similar_cached_configs(self, max_configs: int) -> list[SavedBestConfig def _generate_best_available_population_flat(self) -> list[FlatConfig]: """ - Generate initial population using default config plus cached configs. + Generate initial population using default config, explicit seed configs, + and cached configs. Always starts with the default configuration, then adds up to MAX_BEST_AVAILABLE_CONFIGS matching cached configs from previous runs. - No random configs are added. Duplicate configs are discarded. + Explicit seed configs provided by the caller are added ahead of cached + configs and are not suppressed by cache-skip settings. No random configs + are added. Duplicate configs are discarded. Returns: A list of unique FlatConfig values for the initial population. - Minimum size is 1 (just default), maximum is 1 + autotune_best_available_max_configs setting. + Minimum size is 1 (just default), plus any valid unique explicit + seed configs and up to autotune_best_available_max_configs cached + configs. 
""" # Always start with the default config default_flat = self.config_gen.default_flat() @@ -873,6 +879,16 @@ def _generate_best_available_population_flat(self) -> list[FlatConfig]: result: list[FlatConfig] = [default_flat] self.log("Starting with default config") + for config in self._best_available_seed_configs: + try: + flat = self.config_gen.flatten(config) + transferred_config = self.config_gen.unflatten(flat) + if transferred_config not in seen: + seen.add(transferred_config) + result.append(flat) + except (ValueError, TypeError, KeyError, AssertionError) as e: + self.log(f"Failed to transfer explicit seed config: {e}") + max_configs = self.settings.autotune_best_available_max_configs cached_entries = self._find_similar_cached_configs(max_configs) @@ -905,12 +921,16 @@ def _generate_best_available_population_flat(self) -> list[FlatConfig]: if duplicates > 0: self.log.debug(f"Discarded {duplicates} duplicate config(s)") - self.log( - f"Initial population: 1 default + {len(result) - 1} unique cached = {len(result)} total" - ) + self.log(f"Initial population: {len(result)} total") return result + def set_best_available_seed_configs( + self, + configs: Sequence[Config], + ) -> None: + self._best_available_seed_configs = list(configs) + def parallel_benchmark_population( self, members: list[PopulationMember], *, desc: str = "Benchmarking" ) -> list[PopulationMember]: diff --git a/helion/autotuner/block_id_sequence.py b/helion/autotuner/block_id_sequence.py index 8da053f7d..623617a54 100644 --- a/helion/autotuner/block_id_sequence.py +++ b/helion/autotuner/block_id_sequence.py @@ -46,6 +46,17 @@ def _flat_config( ) -> object: return fn(self._fragment(base)) + def _encode_flat_value(self, base: ConfigSpec, value: object) -> object: + """Encode a normalized Config value into its flat-slot representation. + + Most specs store the same value in Config and FlatConfig, so the + default implementation is the identity. 
ReductionLoopSpec is the + only override today: it normalizes persistent reductions to None in + Config, but FlatConfig stores that choice as an integer sentinel. + """ + del base + return value + _BlockIdItemT = TypeVar("_BlockIdItemT", bound=_BlockIdItem) diff --git a/helion/autotuner/config_generation.py b/helion/autotuner/config_generation.py index b74191999..b12a923cd 100644 --- a/helion/autotuner/config_generation.py +++ b/helion/autotuner/config_generation.py @@ -9,6 +9,7 @@ from typing import cast from .._compat import warps_to_threads +from .block_id_sequence import BlockIdSequence from .config_fragment import Category from .config_fragment import ConfigSpecFragment from .config_fragment import PowerOfTwoFragment @@ -117,14 +118,22 @@ def _apply_overrides(self, config: Config) -> Config: def flatten(self, config: Config) -> FlatConfig: """Inverse of unflatten: convert a Config to a FlatConfig.""" result = self.default_flat() + flat_fields = self.config_spec._flat_fields() for key, (indices, is_sequence) in self._key_to_flat_indices.items(): if key not in config.config: continue value = config.config[key] if is_sequence: assert isinstance(value, list) - for idx, v in zip(indices, value, strict=True): - result[idx] = v + field = flat_fields[key] + assert isinstance(field, BlockIdSequence) + # Sequence specs can normalize values in Config differently + # from how they are stored in FlatConfig. Only + # ReductionLoopSpec overrides this today, but keep the dispatch + # on the spec so flatten() remains the generic inverse of + # unflatten(). 
+ for idx, spec, v in zip(indices, field, value, strict=True): + result[idx] = spec._encode_flat_value(self.config_spec, v) else: assert len(indices) == 1 result[indices[0]] = value diff --git a/helion/autotuner/config_spec.py b/helion/autotuner/config_spec.py index 526b6211c..26808c8e8 100644 --- a/helion/autotuner/config_spec.py +++ b/helion/autotuner/config_spec.py @@ -1027,9 +1027,10 @@ def __init__( super().__init__([block_id]) self.size_hint = size_hint - def _flat_config( - self, base: ConfigSpec, fn: Callable[[ConfigSpecFragment], object] - ) -> int | None: + def _flat_fragment(self, base: ConfigSpec) -> BlockSizeFragment: + # Shared by both directions: + # - unflatten: flat integer -> Config value via _flat_config() + # - flatten: Config value -> flat integer via _encode_flat_value() low = 8 # TODO(jansel): is smaller needed? high = next_power_of_2(max(low, self.size_hint)) default = min(high, 4096) @@ -1038,16 +1039,33 @@ def _flat_config( if base.max_reduction_threads is not None: if self.size_hint > base.max_reduction_threads: default = min(default, base.max_reduction_threads) - value = fn(BlockSizeFragment(low, high, default)) + return BlockSizeFragment(low, high, default) + + def _flat_config( + self, base: ConfigSpec, fn: Callable[[ConfigSpecFragment], object] + ) -> int | None: + fragment = self._flat_fragment(base) + value = fn(fragment) assert isinstance(value, int) - if not (low <= value <= high): + if not (fragment.low <= value <= fragment.high): raise InvalidConfig( - f"Invalid value for reduction loop {low} <= {value} <= {high}" + "Invalid value for reduction loop " + f"{fragment.low} <= {value} <= {fragment.high}" ) if value >= self.size_hint: return None # max size becomes persistent reduction return value + def _encode_flat_value(self, base: ConfigSpec, value: object) -> object: + # None means "persistent reduction" in the normalized Config. 
In the + # flat search space, encode it as the fragment's upper bound: that is + # always >= size_hint, so _flat_config() maps it back to None. The + # fragment default can be a smaller, looped size for large reductions. + # This is the one non-identity Config <-> FlatConfig mapping today. + if value is None: + return self._flat_fragment(base).high + return value + def _normalize(self, name: str, value: object) -> int | None: if value is None: return None diff --git a/helion/autotuner/llm_seeded_lfbo.py b/helion/autotuner/llm_seeded_lfbo.py new file mode 100644 index 000000000..08a4c0e9a --- /dev/null +++ b/helion/autotuner/llm_seeded_lfbo.py @@ -0,0 +1,429 @@ +"""Seed a second-stage autotuner with configs from an LLM search pass.""" + +from __future__ import annotations + +import inspect +import math +import os +import time +from typing import TYPE_CHECKING +from typing import Any +from typing import cast + +from .base_search import BaseSearch +from .effort_profile import PATTERN_SEARCH_DEFAULTS +from .effort_profile import QUICK_LLM_SEARCH_DEFAULTS +from .llm.transport import DEFAULT_REQUEST_TIMEOUT_S +from .llm_search import LLMGuidedSearch +from .llm_search import guided_search_kwargs_from_config +from .pattern_search import InitialPopulationStrategy + +if TYPE_CHECKING: + from collections.abc import Sequence + + from ..runtime.config import Config + from ..runtime.settings import Settings + from .base_search import _AutotunableKernel + from .effort_profile import AutotuneEffortProfile + + +_DISALLOWED_SECOND_STAGE_ALGORITHMS = { + "LLMGuidedSearch", + "LLMSeededSearch", + "LLMSeededLFBOTreeSearch", +} + + +def _parse_env_bool(value: str) -> bool: + """Parse the small env-var bool dialect used by the hybrid overrides.""" + return value.strip().lower() not in {"", "0", "false"} + + +def _resolve_second_stage_algorithm(name: str) -> type[BaseSearch]: + """Resolve and validate the non-LLM search used in stage 2.""" + from . 
import search_algorithms + + search_cls = search_algorithms.get(name) + if search_cls is None: + raise ValueError( + f"Unknown hybrid second-stage algorithm: {name}. " + f"Valid options are: {', '.join(search_algorithms.keys())}" + ) + if name in _DISALLOWED_SECOND_STAGE_ALGORITHMS: + raise ValueError( + f"Invalid hybrid second-stage algorithm: {name}. " + "The second stage must be a non-LLM search algorithm." + ) + return search_cls + + +def _supports_init_parameter(search_cls: type[BaseSearch], name: str) -> bool: + """Check whether a second-stage search accepts a particular kwarg.""" + return name in inspect.signature(search_cls.__init__).parameters + + +class LLMSeededSearch(BaseSearch): + """ + Generic hybrid autotuner that seeds a second-stage search with LLM proposals. + + The algorithm runs in two stages: + 1. Run ``LLMGuidedSearch`` for ``llm_max_rounds`` rounds and capture its best + config in memory. + 2. Run the configured second-stage search algorithm. If the algorithm + supports ``initial_population_strategy``, it is switched to + ``FROM_BEST_AVAILABLE`` so it can start from the LLM seed config. + + Setting ``llm_max_rounds=0`` disables the seed stage and runs only the + second-stage search. 
+ """ + + default_second_stage_algorithm = "LFBOTreeSearch" + allow_second_stage_env_override = True + hybrid_stage_breakdown: dict[str, object] | None + + def __init__( + self, + kernel: _AutotunableKernel, + args: Sequence[object], + *, + second_stage_algorithm: str | None = None, + second_stage_kwargs: dict[str, object] | None = None, + best_available_pad_random: bool = False, + llm_provider: str | None = None, + llm_model: str = QUICK_LLM_SEARCH_DEFAULTS.model, + llm_configs_per_round: int = QUICK_LLM_SEARCH_DEFAULTS.configs_per_round, + llm_max_rounds: int = QUICK_LLM_SEARCH_DEFAULTS.max_rounds, + llm_initial_random_configs: int = QUICK_LLM_SEARCH_DEFAULTS.initial_random_configs, + llm_api_base: str | None = None, + llm_api_key: str | None = None, + llm_max_output_tokens: int | None = None, + llm_request_timeout_s: float = DEFAULT_REQUEST_TIMEOUT_S, + llm_compile_timeout_s: int | None = QUICK_LLM_SEARCH_DEFAULTS.compile_timeout_s, + ) -> None: + super().__init__(kernel, args) + if llm_max_rounds < 0: + raise ValueError("LLMSeededSearch llm_max_rounds must be >= 0") + self.second_stage_algorithm = ( + second_stage_algorithm or type(self).default_second_stage_algorithm + ) + _resolve_second_stage_algorithm(self.second_stage_algorithm) + self.second_stage_kwargs = dict(second_stage_kwargs or {}) + self.best_available_pad_random = best_available_pad_random + + self.llm_provider = llm_provider + self.llm_model = llm_model + self.llm_configs_per_round = llm_configs_per_round + self.llm_max_rounds = llm_max_rounds + self.llm_initial_random_configs = llm_initial_random_configs + self.llm_api_base = llm_api_base + self.llm_api_key = llm_api_key + self.llm_max_output_tokens = llm_max_output_tokens + self.llm_request_timeout_s = llm_request_timeout_s + self.llm_compile_timeout_s = llm_compile_timeout_s + + self.hybrid_stage_breakdown = None + + @classmethod + def _get_default_second_stage_algorithm(cls) -> str: + """Read the default stage-2 algorithm, optionally from 
env.""" + if ( + cls.allow_second_stage_env_override + and (value := os.environ.get("HELION_HYBRID_SECOND_STAGE_ALGORITHM")) + is not None + ): + return value + return cls.default_second_stage_algorithm + + @classmethod + def get_kwargs_from_profile( + cls, profile: AutotuneEffortProfile, settings: Settings + ) -> dict[str, object]: + """Combine shared LLM defaults with the chosen second-stage profile.""" + second_stage_algorithm = cls._get_default_second_stage_algorithm() + second_stage_cls = _resolve_second_stage_algorithm(second_stage_algorithm) + + # The hybrid uses a quick LLM seed stage by default, even under full effort. + guided_kwargs = guided_search_kwargs_from_config( + QUICK_LLM_SEARCH_DEFAULTS, settings + ) + llm_kwargs: dict[str, object] = { + f"llm_{k}": v for k, v in guided_kwargs.items() + } + + kwargs = { + **super().get_kwargs_from_profile(profile, settings), + "second_stage_algorithm": second_stage_algorithm, + "second_stage_kwargs": second_stage_cls.get_kwargs_from_profile( + profile, settings + ), + **llm_kwargs, + "best_available_pad_random": False, + } + + if (value := os.environ.get("HELION_HYBRID_LLM_MAX_ROUNDS")) is not None: + kwargs["llm_max_rounds"] = int(value) + if ( + value := os.environ.get("HELION_HYBRID_BEST_AVAILABLE_PAD_RANDOM") + ) is not None: + kwargs["best_available_pad_random"] = _parse_env_bool(value) + return kwargs + + def _make_llm_search(self) -> LLMGuidedSearch: + """Construct the stage-1 guided search from llm_* settings.""" + return LLMGuidedSearch( + self.kernel, + self.args, + finishing_rounds=0, + provider=self.llm_provider, + model=self.llm_model, + configs_per_round=self.llm_configs_per_round, + max_rounds=self.llm_max_rounds, + initial_random_configs=self.llm_initial_random_configs, + api_base=self.llm_api_base, + api_key=self.llm_api_key, + max_output_tokens=self.llm_max_output_tokens, + request_timeout_s=self.llm_request_timeout_s, + compile_timeout_s=self.llm_compile_timeout_s, + ) + + def 
_make_second_stage_search(self, *, seeded: bool) -> BaseSearch: + """Construct stage 2 and enable best-available seeding when supported.""" + second_stage_cls = _resolve_second_stage_algorithm(self.second_stage_algorithm) + kwargs = dict(self.second_stage_kwargs) + + if seeded: + if _supports_init_parameter( + second_stage_cls, "initial_population_strategy" + ): + kwargs["initial_population_strategy"] = ( + InitialPopulationStrategy.FROM_BEST_AVAILABLE + ) + if _supports_init_parameter( + second_stage_cls, "best_available_pad_random" + ): + kwargs["best_available_pad_random"] = self.best_available_pad_random + else: + self.log( + f"Second-stage algorithm {self.second_stage_algorithm} " + "does not support FROM_BEST_AVAILABLE initialization; " + "the LLM seed may not influence the next stage." + ) + + return cast( + "BaseSearch", + cast("Any", second_stage_cls)(self.kernel, self.args, **kwargs), + ) + + def _inject_seed_into_second_stage( + self, + second_stage_search: BaseSearch, + llm_seed_config: Config, + ) -> None: + """Pass the best LLM config into searches that expose the seed hook.""" + setter = getattr(second_stage_search, "set_best_available_seed_configs", None) + if setter is None: + return + setter([llm_seed_config]) + + def _finalize_stage_metrics( + self, + llm_search: LLMGuidedSearch | None, + llm_seed_config: Config | None, + llm_wall_time: float, + second_stage_search: BaseSearch, + second_stage_wall_time: float, + ) -> None: + """Merge per-stage timing and autotune metrics into the hybrid summary.""" + + def _finite_perf(search: BaseSearch | None) -> float | None: + if search is None or not math.isfinite(search.best_perf_so_far): + return None + return search.best_perf_so_far + + llm_metrics = llm_search._autotune_metrics if llm_search else None + second_stage_metrics = second_stage_search._autotune_metrics + second_stage_tested = second_stage_metrics.num_configs_tested + + self.hybrid_stage_breakdown = { + "used_llm_seed": llm_search is not None, 
+ "llm_seed_perf_ms": _finite_perf(llm_search), + "llm_seed_time_s": llm_wall_time, + "llm_seed_configs_tested": ( + llm_metrics.num_configs_tested if llm_metrics else 0 + ), + "llm_seed_config": ( + dict(llm_seed_config) if llm_seed_config is not None else None + ), + "second_stage_algorithm": self.second_stage_algorithm, + "second_stage_perf_ms": _finite_perf(second_stage_search), + "second_stage_time_s": second_stage_wall_time, + "second_stage_configs_tested": second_stage_tested, + } + if self.second_stage_algorithm == "LFBOTreeSearch": + self.hybrid_stage_breakdown.update( + { + "lfbo_stage_perf_ms": _finite_perf(second_stage_search), + "lfbo_stage_time_s": second_stage_wall_time, + "lfbo_stage_configs_tested": second_stage_tested, + } + ) + + # Aggregate metrics from both stages + for field in ( + "num_configs_tested", + "num_compile_failures", + "num_accuracy_failures", + "num_generations", + ): + setattr( + self._autotune_metrics, + field, + (getattr(llm_metrics, field) if llm_metrics else 0) + + getattr(second_stage_metrics, field), + ) + + candidate_best = [ + stage.best_perf_so_far + for stage in (llm_search, second_stage_search) + if stage is not None and math.isfinite(stage.best_perf_so_far) + ] + self.best_perf_so_far = min(candidate_best) if candidate_best else math.inf + + def _autotune(self) -> Config: + """Run the optional LLM seed stage, then the configured second stage.""" + self.log( + f"Starting {type(self).__name__} with " + f"second_stage_algorithm={self.second_stage_algorithm}, " + f"llm_max_rounds={self.llm_max_rounds}, " + f"llm_configs_per_round={self.llm_configs_per_round}, " + f"best_available_pad_random={self.best_available_pad_random}" + ) + + # Stage 1: LLM seed search + llm_search: LLMGuidedSearch | None = None + llm_seed_config: Config | None = None + llm_wall_time = 0.0 + + if self.llm_max_rounds > 0: + self.log( + "Hybrid stage 1/2: " + f"LLMGuidedSearch for {self.llm_max_rounds} round(s) " + f"with {self.llm_configs_per_round} 
configs/round" + ) + llm_search = self._make_llm_search() + llm_start = time.perf_counter() + llm_seed_config = llm_search.autotune(skip_cache=True) + llm_wall_time = time.perf_counter() - llm_start + + # Stage 2: second-stage search (optionally seeded) + seeded = llm_seed_config is not None + self.log( + "Hybrid stage 2/2: " + + ( + f"running {self.second_stage_algorithm} from best available seed" + if seeded + else f"running {self.second_stage_algorithm} without LLM seed" + ) + ) + second_stage_search = self._make_second_stage_search(seeded=seeded) + if llm_seed_config is not None: + self._inject_seed_into_second_stage(second_stage_search, llm_seed_config) + second_stage_start = time.perf_counter() + best_config = second_stage_search.autotune() + second_stage_wall_time = time.perf_counter() - second_stage_start + + self._finalize_stage_metrics( + llm_search, + llm_seed_config, + llm_wall_time, + second_stage_search, + second_stage_wall_time, + ) + return best_config + + +class LLMSeededLFBOTreeSearch(LLMSeededSearch): + """Convenience wrapper for the common LLM-seeded LFBO tree search pipeline.""" + + allow_second_stage_env_override = False + + @classmethod + def get_kwargs_from_profile( + cls, profile: AutotuneEffortProfile, settings: Settings + ) -> dict[str, object]: + """Drop the explicit stage-2 algorithm knob from the LFBO convenience API.""" + kwargs = super().get_kwargs_from_profile(profile, settings) + kwargs.pop("second_stage_algorithm", None) + return kwargs + + def __init__( + self, + kernel: _AutotunableKernel, + args: Sequence[object], + *, + second_stage_kwargs: dict[str, object] | None = None, + num_neighbors: int = 200, + frac_selected: float = 0.10, + radius: int = 2, + initial_population: int = PATTERN_SEARCH_DEFAULTS.initial_population, + copies: int = PATTERN_SEARCH_DEFAULTS.copies, + max_generations: int = PATTERN_SEARCH_DEFAULTS.max_generations, + min_improvement_delta: float = 0.001, + quantile: float = 0.1, + patience: int = 1, + 
similarity_penalty: float = 1.0, + initial_population_strategy: InitialPopulationStrategy | None = None, + best_available_pad_random: bool = False, + finishing_rounds: int = 0, + compile_timeout_lower_bound: float = PATTERN_SEARCH_DEFAULTS.compile_timeout_lower_bound, + compile_timeout_quantile: float = PATTERN_SEARCH_DEFAULTS.compile_timeout_quantile, + llm_provider: str | None = None, + llm_model: str = QUICK_LLM_SEARCH_DEFAULTS.model, + llm_configs_per_round: int = QUICK_LLM_SEARCH_DEFAULTS.configs_per_round, + llm_max_rounds: int = QUICK_LLM_SEARCH_DEFAULTS.max_rounds, + llm_initial_random_configs: int = QUICK_LLM_SEARCH_DEFAULTS.initial_random_configs, + llm_api_base: str | None = None, + llm_api_key: str | None = None, + llm_max_output_tokens: int | None = None, + llm_request_timeout_s: float = DEFAULT_REQUEST_TIMEOUT_S, + llm_compile_timeout_s: int | None = QUICK_LLM_SEARCH_DEFAULTS.compile_timeout_s, + ) -> None: + # Build LFBO second-stage kwargs from individual params or passthrough + computed_second_stage_kwargs: dict[str, object] + if second_stage_kwargs is not None: + computed_second_stage_kwargs = dict(second_stage_kwargs) + else: + computed_second_stage_kwargs = { + "num_neighbors": num_neighbors, + "frac_selected": frac_selected, + "radius": radius, + "initial_population": initial_population, + "copies": copies, + "max_generations": max_generations, + "min_improvement_delta": min_improvement_delta, + "quantile": quantile, + "patience": patience, + "similarity_penalty": similarity_penalty, + "initial_population_strategy": initial_population_strategy, + "finishing_rounds": finishing_rounds, + "compile_timeout_lower_bound": compile_timeout_lower_bound, + "compile_timeout_quantile": compile_timeout_quantile, + } + + super().__init__( + kernel, + args, + second_stage_algorithm="LFBOTreeSearch", + second_stage_kwargs=computed_second_stage_kwargs, + best_available_pad_random=best_available_pad_random, + llm_provider=llm_provider, + llm_model=llm_model, + 
llm_configs_per_round=llm_configs_per_round, + llm_max_rounds=llm_max_rounds, + llm_initial_random_configs=llm_initial_random_configs, + llm_api_base=llm_api_base, + llm_api_key=llm_api_key, + llm_max_output_tokens=llm_max_output_tokens, + llm_request_timeout_s=llm_request_timeout_s, + llm_compile_timeout_s=llm_compile_timeout_s, + ) diff --git a/test/test_autotuner.py b/test/test_autotuner.py index 8ece4d86a..9911c1f39 100644 --- a/test/test_autotuner.py +++ b/test/test_autotuner.py @@ -61,6 +61,7 @@ from helion.autotuner.local_cache import StrictLocalAutotuneCache from helion.autotuner.logger import AutotuneLogEntry from helion.autotuner.logger import AutotuningLogger +from helion.autotuner.metrics import AutotuneMetrics from helion.autotuner.random_search import RandomSearch import helion.language as hl from helion.language import loops @@ -2977,5 +2978,239 @@ def fake_post_json_with_response( case["request_assertions"](captured) +class TestLLMSeededLFBOTreeSearch(TestCase): + """Tests for the two-stage LLM-seeded hybrid autotuner.""" + + def test_profile_kwargs_and_env_overrides(self): + """Hybrid profile wiring forwards shared LLM settings and hybrid env overrides.""" + from helion.autotuner import LLMSeededLFBOTreeSearch + from helion.autotuner import LLMSeededSearch + + kwargs = LLMSeededLFBOTreeSearch.get_kwargs_from_profile( + get_effort_profile("full"), Settings() + ) + self.assertEqual(kwargs["llm_model"], "gpt-5-2") + self.assertEqual(kwargs["llm_configs_per_round"], 15) + self.assertEqual(kwargs["llm_max_rounds"], 1) + self.assertEqual(kwargs["llm_initial_random_configs"], 10) + self.assertEqual(kwargs["llm_compile_timeout_s"], 15) + self.assertFalse(kwargs["best_available_pad_random"]) + + with patch.dict( + os.environ, + {"HELION_HYBRID_SECOND_STAGE_ALGORITHM": "PatternSearch"}, + clear=False, + ): + generic_kwargs = LLMSeededSearch.get_kwargs_from_profile( + get_effort_profile("full"), Settings() + ) + 
self.assertEqual(generic_kwargs["second_stage_algorithm"], "PatternSearch") + self.assertIn("max_generations", generic_kwargs["second_stage_kwargs"]) + + kernel = SimpleNamespace( + settings=Settings(), + config_spec=SimpleNamespace(), + ) + with patch.dict( + os.environ, + { + "HELION_HYBRID_LLM_MAX_ROUNDS": "2", + "HELION_LLM_PROVIDER": "openai", + }, + clear=False, + ): + kwargs = LLMSeededLFBOTreeSearch.get_kwargs_from_profile( + get_effort_profile("full"), Settings() + ) + self.assertEqual(kwargs["llm_max_rounds"], 2) + self.assertEqual(kwargs["llm_provider"], "openai") + + search = LLMSeededLFBOTreeSearch(kernel, (), **kwargs) + self.assertEqual(search.llm_provider, "openai") + + def test_selected_by_env(self): + """HELION_AUTOTUNER selects the hybrid autotuner and applies profile defaults.""" + from helion.autotuner import LLMSeededLFBOTreeSearch + + args = ( + torch.randn([8, 32], device=DEVICE), + torch.randn([8, 32], device=DEVICE), + ) + + with patch.dict(os.environ, {"HELION_AUTOTUNER": "LLMSeededLFBOTreeSearch"}): + + @helion.kernel(autotune_effort="full") + def add(a, b): + out = torch.empty_like(a) + for tile in hl.tile(out.size()): + out[tile] = a[tile] + b[tile] + return out + + bound = add.bind(args) + autotuner = bound.settings.autotuner_fn(bound, args) + self.assertIsInstance(autotuner.autotuner, LLMSeededLFBOTreeSearch) + self.assertEqual(autotuner.autotuner.llm_max_rounds, 1) + self.assertFalse(autotuner.autotuner.best_available_pad_random) + + def test_handoff_runs_llm_then_lfbo(self): + """The hybrid flow runs LLM seeding first, then injects that seed into LFBO.""" + from helion.autotuner import InitialPopulationStrategy + from helion.autotuner import LLMSeededLFBOTreeSearch + from helion.runtime.config import Config + + llm_instances = [] + lfbo_instances = [] + + class FakeBenchmarkProvider: + def __init__(self, **kwargs) -> None: + self.kwargs = kwargs + + class FakeLLMSearch: + def __init__(self, kernel, args, **kwargs) -> None: + 
self.kernel = kernel + self.args = args + self.kwargs = kwargs + self.best_perf_so_far = 0.9 + self._autotune_metrics = AutotuneMetrics( + num_configs_tested=7, + num_compile_failures=1, + num_accuracy_failures=2, + num_generations=3, + ) + llm_instances.append(self) + + def autotune(self, *, skip_cache=False): + self.skip_cache = skip_cache + return Config(num_warps=4) + + class FakeLFBOSearch: + def __init__( + self, + kernel, + args, + *, + initial_population_strategy=None, + best_available_pad_random=True, + **kwargs, + ) -> None: + self.kernel = kernel + self.args = args + self.kwargs = { + **kwargs, + "initial_population_strategy": initial_population_strategy, + "best_available_pad_random": best_available_pad_random, + } + self.best_perf_so_far = 0.5 + self._autotune_metrics = AutotuneMetrics( + num_configs_tested=11, + num_compile_failures=3, + num_accuracy_failures=5, + num_generations=6, + ) + self.seed_configs = None + lfbo_instances.append(self) + + def set_best_available_seed_configs(self, configs): + self.seed_configs = list(configs) + + def autotune(self): + return Config(num_warps=8) + + kernel = SimpleNamespace( + settings=Settings(), + config_spec=SimpleNamespace(), + env=SimpleNamespace(device=DEVICE, process_group_name=None), + ) + args = (torch.randn([8], device=DEVICE),) + search = LLMSeededLFBOTreeSearch(kernel, args, llm_max_rounds=2) + search._benchmark_provider_cls = FakeBenchmarkProvider + search._prepare() + self.assertIsInstance(search.benchmark_provider, FakeBenchmarkProvider) + + with ( + patch("helion.autotuner.llm_seeded_lfbo.LLMGuidedSearch", FakeLLMSearch), + patch( + "helion.autotuner.llm_seeded_lfbo._resolve_second_stage_algorithm", + return_value=FakeLFBOSearch, + ), + ): + best = search._autotune() + + self.assertEqual(best["num_warps"], 8) + self.assertEqual(llm_instances[0].kwargs["max_rounds"], 2) + self.assertTrue(llm_instances[0].skip_cache) + self.assertEqual( + lfbo_instances[0].kwargs["initial_population_strategy"], + 
InitialPopulationStrategy.FROM_BEST_AVAILABLE, + ) + self.assertEqual(lfbo_instances[0].seed_configs, [Config(num_warps=4)]) + self.assertEqual(search._autotune_metrics.num_configs_tested, 18) + self.assertEqual(search._autotune_metrics.num_compile_failures, 4) + self.assertEqual(search._autotune_metrics.num_accuracy_failures, 7) + self.assertEqual(search._autotune_metrics.num_generations, 9) + self.assertEqual(search.hybrid_stage_breakdown["llm_seed_configs_tested"], 7) + self.assertEqual(search.hybrid_stage_breakdown["lfbo_stage_configs_tested"], 11) + + def test_zero_llm_rounds_falls_back_to_lfbo_strategy(self): + """Disabling LLM rounds skips stage 1 and leaves the second-stage strategy unchanged.""" + from helion.autotuner import InitialPopulationStrategy + from helion.autotuner import LLMSeededLFBOTreeSearch + from helion.runtime.config import Config + + lfbo_instances = [] + + class FakeBenchmarkProvider: + def __init__(self, **kwargs) -> None: + self.kwargs = kwargs + + class FailIfLLMConstructed: + def __init__(self, *args, **kwargs) -> None: + raise AssertionError("LLM seed stage should be skipped") + + class FakeLFBOSearch: + def __init__(self, kernel, args, **kwargs) -> None: + self.kwargs = kwargs + self.best_perf_so_far = 0.4 + self._autotune_metrics = AutotuneMetrics(num_configs_tested=3) + lfbo_instances.append(self) + + def autotune(self): + return Config(num_warps=16) + + kernel = SimpleNamespace( + settings=Settings(), + config_spec=SimpleNamespace(), + env=SimpleNamespace(device=DEVICE, process_group_name=None), + ) + args = (torch.randn([8], device=DEVICE),) + search = LLMSeededLFBOTreeSearch( + kernel, + args, + llm_max_rounds=0, + initial_population_strategy=InitialPopulationStrategy.FROM_RANDOM, + ) + search._benchmark_provider_cls = FakeBenchmarkProvider + search._prepare() + + with ( + patch( + "helion.autotuner.llm_seeded_lfbo.LLMGuidedSearch", + FailIfLLMConstructed, + ), + patch( + 
"helion.autotuner.llm_seeded_lfbo._resolve_second_stage_algorithm", + return_value=FakeLFBOSearch, + ), + ): + best = search._autotune() + + self.assertEqual(best["num_warps"], 16) + self.assertEqual( + lfbo_instances[0].kwargs["initial_population_strategy"], + InitialPopulationStrategy.FROM_RANDOM, + ) + self.assertFalse(search.hybrid_stage_breakdown["used_llm_seed"]) + + if __name__ == "__main__": unittest.main() diff --git a/test/test_best_available.py b/test/test_best_available.py index 956fb86db..0261d3c53 100644 --- a/test/test_best_available.py +++ b/test/test_best_available.py @@ -303,6 +303,27 @@ def test_flatten_multiple_reduction_loops(self): re_flat = config_gen.flatten(restored) self.assertEqual(re_flat, flat) + def test_flatten_persistent_reduction_loop_roundtrip(self): + """Persistent reductions normalize to None but must round-trip to the flat sentinel.""" + config_spec = ConfigSpec(backend=TritonBackend()) + config_spec.block_sizes.append( + BlockSizeSpec(block_id=0, size_hint=64, min_size=16, max_size=256) + ) + config_spec.reduction_loops.append(ReductionLoopSpec(block_id=1, size_hint=128)) + + config_gen = ConfigGeneration(config_spec) + default_flat = config_gen.default_flat() + rl_indices, rl_is_seq = config_gen._key_to_flat_indices["reduction_loops"] + self.assertTrue(rl_is_seq) + self.assertEqual(len(rl_indices), 1) + self.assertEqual(default_flat[rl_indices[0]], 128) + + config = config_gen.unflatten(default_flat) + self.assertEqual(config.config["reduction_loops"], [None]) + + roundtripped = config_gen.flatten(config) + self.assertEqual(roundtripped, default_flat) + class TestCacheMatching(unittest.TestCase): """Tests for cache file matching in warm start."""