pytorch · norx1991 · Apr 9, 2026
diff --git a/helion/_compiler/backend.py b/helion/_compiler/backend.py
@@ -1436,8 +1436,8 @@ def _empty_allocated_vars(body: list[ast.stmt]) -> set[str]:
         output_indices: list[int] = []
         # Indices of output tensors that are also read by the kernel
         # (inplace-mutated params or body-created tensors the kernel reads).
-        # These must use VMEM BlockSpecs. Output-only tensors (written but
-        # never read) get HBM in_specs to avoid VMEM pressure.
+        # Output-only tensors (written but never read) are excluded from
+        # pallas_call inputs and returned by the launcher instead.
         inplace_indices: list[int] = []
         if sorted_args is not None:
             env = CompileEnvironment.current()
@@ -1455,9 +1455,9 @@ def _empty_allocated_vars(body: list[ast.stmt]) -> set[str]:
             #
             # Only tensors allocated with torch.empty/empty_like/new_empty can be
             # output-only — their initial values are undefined, so it's safe
-            # to use HBM BlockSpecs.  Tensors allocated with torch.zeros_like,
-            # torch.full, etc. have meaningful initial values that must be
-            # preserved via VMEM BlockSpecs.
+            # to skip passing them as pallas_call inputs.  Tensors allocated
+            # with torch.zeros_like, torch.full, etc. have meaningful initial
+            # values that must be preserved via input_output_aliases (donation).
             empty_vars = _empty_allocated_vars(host_fn.body)
             kernel_reads: set[str] = set()
             for stmt in host_fn.body:
@@ -1481,6 +1481,16 @@ def _empty_allocated_vars(body: list[ast.stmt]) -> set[str]:
                     output_indices.append(i)
                     inplace_indices.append(i)
 
+        # Identify output-only arg names for codegen to capture return values.
+        output_only_set = set(output_indices) - set(inplace_indices)
+        output_only_names: list[str] = []
+        if sorted_args is not None:
+            for i in output_indices:
+                if i in output_only_set:
+                    output_only_names.append(sorted_args[i].host_str())
+        # Store for codegen_function_call to use.
+        self._output_only_names = output_only_names
+
         launcher_args = [*args, f"_output_indices={output_indices}"]
         launcher_args.append(f"_inplace_indices={inplace_indices}")
 

diff --git a/helion/_compiler/device_function.py b/helion/_compiler/device_function.py
@@ -791,11 +791,39 @@ def codegen_function_call(self) -> ast.AST:
             has_barrier=env.has_barrier,
             sorted_args=arg_objects,
         )
-        # TODO(jansel): we should run CSE this statement
-        call_statement = statement_from_string(
-            f"_launcher({self.name}, {{call_grid_expr}}, {', '.join(call_args)})",
-            call_grid_expr=call_grid_expr,
+        # Check if the backend wants to capture return values for output-only tensors.
+        output_only_names = getattr(backend, "_output_only_names", [])
+        launcher_call = (
+            f"_launcher({self.name}, {{call_grid_expr}}, {', '.join(call_args)})"
         )
+        if output_only_names:
+            if len(output_only_names) == 1:
+                assign_target = output_only_names[0]
+            else:
+                assign_target = ", ".join(output_only_names)
+            # Rebind the output-only vars to the launcher's return value.  The
+            # `is not None` fallback keeps the current bindings for custom
+            # launchers that return None (e.g. fake-tensor inference mocks).
+            if len(output_only_names) == 1:
+                fallback = output_only_names[0]
+            else:
+                fallback = f"({', '.join(output_only_names)})"
+            # Emit two statements: the launcher call, then the conditional assign.
+            launcher_stmt = statement_from_string(
+                f"_launcher_result = {launcher_call}",
+                call_grid_expr=call_grid_expr,
+            )
+            assert isinstance(launcher_stmt, ExtendedAST)
+            launcher_stmt._is_kernel_call = True
+            self.codegen.host_statements.append(launcher_stmt)
+            call_statement = statement_from_string(
+                f"{assign_target} = _launcher_result if _launcher_result is not None else {fallback}",
+            )
+        else:
+            call_statement = statement_from_string(
+                launcher_call,
+                call_grid_expr=call_grid_expr,
+            )
         assert isinstance(call_statement, ExtendedAST)
         # Mark the kernel call we can find it in codegen_precompile_def
         call_statement._is_kernel_call = True