
Commit 3cecf1e

properly codegen hl.triton_kernel

stack-info: PR: #1797, branch: shunting314/stack/18

Parent: 67e2803

5 files changed: 3 additions & 16 deletions


examples/distributed/allreduce_bias_rmsnorm.py

Lines changed: 1 addition & 0 deletions
@@ -28,6 +28,7 @@
     config=helion.Config(
         block_sizes=[8],
         num_warps=8,
+        reduction_loops=[1024],
     ),
     static_shapes=True,
 )
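
For reference, a minimal sketch of where the new entry sits in a pinned Helion config (only the Config arguments come from the diff; the kernel name, signature, and body below are hypothetical):

import helion

@helion.kernel(
    config=helion.Config(
        block_sizes=[8],
        num_warps=8,
        reduction_loops=[1024],  # entry added by this commit
    ),
    static_shapes=True,
)
def allreduce_bias_rmsnorm_kernel(x, bias, weight):  # hypothetical signature
    ...  # kernel body elided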

helion/_compiler/roll_reduction.py

Lines changed: 1 addition & 0 deletions
@@ -399,6 +399,7 @@ def process(self, graph: torch.fx.Graph) -> torch.fx.Graph:
         if (
             not all((n in self.available) for n in node.all_input_nodes)
             or node.op == "output"
+            or (node.is_impure() and self.inner_count > 0)
         ):
             self.start_new_graph()
         new_node = self.outer_graph.create_node(
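
The new third clause makes the reduction roller split the graph whenever a side-effecting node appears while a rolled reduction is already open (inner_count > 0), so impure ops are never pulled inside the rolled loop. A paraphrase of the updated guard as a standalone predicate (the method name is hypothetical; the condition itself is from the diff):

import torch

def must_start_new_graph(self, node: torch.fx.Node) -> bool:
    return (
        # an input of this node is not yet materialized in the outer graph
        not all(n in self.available for n in node.all_input_nodes)
        # the output node always terminates the current graph
        or node.op == "output"
        # new: impure nodes may not join an already-open rolled reduction
        or (node.is_impure() and self.inner_count > 0)
    )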

helion/_utils.py

Lines changed: 0 additions & 8 deletions
@@ -225,11 +225,3 @@ def all_gather_object(obj: T) -> list[T]:
     object_list = [None] * dist.get_world_size()
     dist.all_gather_object(object_list, obj)
     return object_list  # pyrefly: ignore
-
-
-def autotune_for_distributed_kernel() -> bool:
-    """
-    Remove this once these issues regarding distributed kernels are fixed:
-    - https://github.com/pytorch/helion/issues/1642
-    """
-    return os.getenv("HELION_AUTOTUNE_FOR_DISTRIBUTED_KERNEL") == "1"
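
The surviving all_gather_object helper is unchanged by this commit; a hedged usage sketch (assumes torch.distributed is already initialized in the calling process):

import torch.distributed as dist
from helion._utils import all_gather_object

# each rank contributes one object and gets back the list from all ranks,
# e.g. [{'rank': 0}, {'rank': 1}, ...] on every rank
gathered = all_gather_object({"rank": dist.get_rank()})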

helion/autotuner/config_spec.py

Lines changed: 0 additions & 5 deletions
@@ -31,7 +31,6 @@
 from .config_fragment import PowerOfTwoFragment
 from .config_fragment import assert_integer_power_of_two
 import helion
-from helion._utils import autotune_for_distributed_kernel

 if TYPE_CHECKING:
     from collections.abc import Callable
@@ -931,10 +930,6 @@ def _flat_config(
         default = min(default, base.max_reduction_threads)
         value = fn(BlockSizeFragment(low, high, default))
         assert isinstance(value, int)
-        if autotune_for_distributed_kernel():
-            # workaround https://github.com/pytorch/helion/issues/1642
-            return None
-
         if not (low <= value <= high):
             raise InvalidConfig(
                 f"Invalid value for reduction loop {low} <= {value} <= {high}"

test/test_distributed.py

Lines changed: 1 addition & 3 deletions
@@ -94,7 +94,6 @@ def setUpClass(cls) -> None:
                 "HELION_DIST_CHECK_CONFIG_CONSISTANCY": "1",
                 "HELION_CAP_AUTOTUNE_NUM_NEIGHBORS": "50",
                 "HELION_CAP_REBENCHMARK_REPEAT": "50",
-                "HELION_AUTOTUNE_FOR_DISTRIBUTED_KERNEL": "1",
             },
         )
     )
@@ -264,8 +263,7 @@ def test_allreduce_bias_rmsnorm(self, kernel_name, autotuner):
         kernel = getattr(mod, kernel_name).fn
         if autotuner == "fixed":
             fixed_config = helion.Config(
-                block_sizes=[8],
-                num_warps=8,
+                block_sizes=[8], num_warps=8, reduction_loops=[1024]
             )

             kernel = helion.kernel(
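
A sketch of the test's "fixed" path, assuming helion.kernel accepts the function positionally alongside a config keyword (as the truncated call above suggests; the kernel function below is a stand-in for getattr(mod, kernel_name).fn):

import helion

fixed_config = helion.Config(block_sizes=[8], num_warps=8, reduction_loops=[1024])

def kernel_fn(x):  # hypothetical placeholder kernel
    ...

kernel = helion.kernel(kernel_fn, config=fixed_config)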
