1 change: 1 addition & 0 deletions examples/distributed/allreduce_bias_rmsnorm.py
@@ -28,6 +28,7 @@
     config=helion.Config(
         block_sizes=[8],
         num_warps=8,
+        reduction_loops=[1024],
     ),
     static_shapes=True,
 )
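
Aside (not part of the diff): my reading of Helion's `reduction_loops` option is that it takes one entry per reduction dimension, where `None` keeps the reduction persistent and an integer lowers it to a loop over chunks of that size. A minimal sketch, assuming those semantics:

# Illustration only; values mirror the example config above.
import helion

# Persistent reduction: the whole reduction dimension is processed at once.
persistent = helion.Config(block_sizes=[8], num_warps=8, reduction_loops=[None])

# Looped reduction: walk the reduction dimension in 1024-element chunks,
# which is what this PR's example config now pins explicitly.
looped = helion.Config(block_sizes=[8], num_warps=8, reduction_loops=[1024])
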
1 change: 1 addition & 0 deletions helion/_compiler/roll_reduction.py
@@ -399,6 +399,7 @@ def process(self, graph: torch.fx.Graph) -> torch.fx.Graph:
         if (
             not all((n in self.available) for n in node.all_input_nodes)
             or node.op == "output"
+            or (node.is_impure() and self.inner_count > 0)

Contributor: Do we have a test for the issue this is fixing?
Contributor Author: test_distributed.py has a test covering this case.

         ):
             self.start_new_graph()
             new_node = self.outer_graph.create_node(
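
For readers outside the codebase, here is a small self-contained sketch of the `Node.is_impure()` check the new condition relies on, using plain torch.fx rather than Helion internals; the traced function is hypothetical:

# Standalone illustration of torch.fx's impurity flag; not code from this PR.
import torch
import torch.fx


def f(x: torch.Tensor) -> torch.Tensor:
    torch._assert(x.numel() > 0, "x must be non-empty")  # side-effectful call
    return x.sum(-1)


graph = torch.fx.symbolic_trace(f).graph
for node in graph.nodes:
    # placeholder/output nodes and side-effectful calls such as torch._assert
    # report impure=True; per the fix above, such nodes are no longer rolled
    # into an inner reduction graph once one has been started.
    print(node.op, node.target, node.is_impure())
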
8 changes: 0 additions & 8 deletions helion/_utils.py
@@ -225,11 +225,3 @@ def all_gather_object(obj: T) -> list[T]:
     object_list = [None] * dist.get_world_size()
     dist.all_gather_object(object_list, obj)
     return object_list  # pyrefly: ignore
-
-
-def autotune_for_distributed_kernel() -> bool:
-    """
-    Remove this once these issues regarding distributed kernels are fixed:
-    - https://github.com/pytorch/helion/issues/1642
-    """
-    return os.getenv("HELION_AUTOTUNE_FOR_DISTRIBUTED_KERNEL") == "1"

Contributor: Curious, are the autotune_for_distributed_kernel-related changes meant to be in this PR?
Contributor Author: Yes, the env var was added as a workaround for this issue.

5 changes: 0 additions & 5 deletions helion/autotuner/config_spec.py
@@ -31,7 +31,6 @@
 from .config_fragment import PowerOfTwoFragment
 from .config_fragment import assert_integer_power_of_two
 import helion
-from helion._utils import autotune_for_distributed_kernel

 if TYPE_CHECKING:
     from collections.abc import Callable

@@ -931,10 +930,6 @@ def _flat_config(
         default = min(default, base.max_reduction_threads)
         value = fn(BlockSizeFragment(low, high, default))
         assert isinstance(value, int)
-        if autotune_for_distributed_kernel():
-            # workaround https://github.com/pytorch/helion/issues/1642
-            return None
-
         if not (low <= value <= high):
             raise InvalidConfig(
                 f"Invalid value for reduction loop {low} <= {value} <= {high}"
4 changes: 1 addition & 3 deletions test/test_distributed.py
@@ -94,7 +94,6 @@ def setUpClass(cls) -> None:
             "HELION_DIST_CHECK_CONFIG_CONSISTANCY": "1",
             "HELION_CAP_AUTOTUNE_NUM_NEIGHBORS": "50",
             "HELION_CAP_REBENCHMARK_REPEAT": "50",
-            "HELION_AUTOTUNE_FOR_DISTRIBUTED_KERNEL": "1",
         },
     )
 )
@@ -264,8 +263,7 @@ def test_allreduce_bias_rmsnorm(self, kernel_name, autotuner):
         kernel = getattr(mod, kernel_name).fn
         if autotuner == "fixed":
             fixed_config = helion.Config(
-                block_sizes=[8],
-                num_warps=8,
+                block_sizes=[8], num_warps=8, reduction_loops=[1024]

Contributor Author: @jansel this test covers the issue being fixed.

             )

         kernel = helion.kernel(

Contributor: I wonder whether we should add a single-GPU unit test that would fail without the fix in this PR. Since test/test_distributed.py only runs on the distributed CI job, a single-GPU unit test would ensure we don't accidentally lose this coverage.
Contributor Author: I couldn't come up with a reasonable single-GPU unit test, but if coverage is a concern, I can add a dedicated multi-GPU test case for this one (rather than keeping it as a parametrized test).
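
To close the loop, a sketch of how a fixed config like the one in this test is wired up. The Config/kernel arguments follow the diffs above; the kernel body is a hypothetical RMS-norm-style reduction, not the repo's actual kernel:

# Sketch only: the decorator wiring follows this PR; the body is hypothetical.
import torch
import helion
import helion.language as hl


@helion.kernel(
    config=helion.Config(block_sizes=[8], num_warps=8, reduction_loops=[1024]),
    static_shapes=True,
)
def rmsnorm_sketch(x: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    m, n = x.size()
    out = torch.empty_like(x)
    for tile_m in hl.tile(m):
        row = x[tile_m, :].to(torch.float32)
        # With reduction_loops=[1024], this row-wise mean is lowered to a
        # loop over 1024-element chunks instead of one persistent block.
        inv_rms = torch.rsqrt(torch.mean(row * row, dim=-1, keepdim=True) + eps)
        out[tile_m, :] = (row * inv_rms).to(x.dtype)
    return out
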