Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions examples/cpu/x86/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@
from mlir.dialects.transform import vector
from mlir.dialects.transform import tensor

from lighthouse.workload import benchmark
from lighthouse.workload import get_bench_wrapper_schedule
from lighthouse import dialects as lh_dialects
from lighthouse.workload import benchmark, get_bench_wrapper_schedule
from lighthouse.utils.numpy import numpy_to_mlir_type
from lighthouse.pipeline.helper import apply_registered_pass
import lighthouse.utils as lh_utils
Expand Down Expand Up @@ -358,6 +358,8 @@ def parse_cli():
args = parse_cli()

with ir.Context(), ir.Location.unknown():
lh_dialects.register_and_load()

match args.dtype:
case "f32":
in_dtype = np.float32
Expand Down
3 changes: 3 additions & 0 deletions examples/feed-forward-mpi/feed-forward-mpi.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
as_ctype,
)

from lighthouse import dialects as lh_dialects
from lighthouse.utils.memref import (
to_ctype as memref_to_ctype,
deallocate_memrefs_on_exit,
Expand Down Expand Up @@ -412,6 +413,8 @@ def schedule_modules(
R = MPI.COMM_WORLD.Get_rank()

with ir.Context(), ir.Location.unknown():
lh_dialects.register_and_load()

wload = DistFF(args, P, R)

# execute(wload, verbose=args.verbose)
Expand Down
3 changes: 3 additions & 0 deletions examples/workload/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from mlir.dialects import transform
from mlir.execution_engine import ExecutionEngine

from lighthouse import dialects as lh_dialects
from lighthouse.pipeline.helper import match
from lighthouse.pipeline.opt import PassBundles, apply_bundle

Expand Down Expand Up @@ -159,6 +160,8 @@ def schedule_modules(

if __name__ == "__main__":
with ir.Context(), ir.Location.unknown():
lh_dialects.register_and_load()

wload = ElementwiseSum(400, 400)

print(" Dump kernel ".center(60, "-"))
Expand Down
3 changes: 3 additions & 0 deletions examples/workload/example_mlir.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from mlir.dialects import func, linalg, arith, memref
from mlir.execution_engine import ExecutionEngine

from lighthouse import dialects as lh_dialects
from lighthouse.workload import execute, benchmark
import lighthouse.utils as lh_utils

Expand Down Expand Up @@ -195,6 +196,8 @@ def payload_module(self):

if __name__ == "__main__":
with ir.Context(), ir.Location.unknown():
lh_dialects.register_and_load()

wload = ElementwiseSumMLIRAlloc(400, 400)

print(" Dump kernel ".center(60, "-"))
Expand Down
3 changes: 3 additions & 0 deletions examples/xegpu/matmul.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from mlir import ir
from mlir.execution_engine import ExecutionEngine

from lighthouse import dialects as lh_dialects
from lighthouse.workload import benchmark, get_bench_wrapper_schedule
from lighthouse.utils.memref import to_ctype as memref_to_ctype
from lighthouse.utils.numpy import numpy_to_ctype
Expand Down Expand Up @@ -360,6 +361,8 @@ def parse_cli():
c_type = "f32"

with ir.Context(), ir.Location.unknown():
lh_dialects.register_and_load()

wload = XeGPUMatMul(
M=M,
N=N,
Expand Down
3 changes: 3 additions & 0 deletions examples/xegpu/mlp.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from mlir import ir
from mlir.execution_engine import ExecutionEngine

from lighthouse import dialects as lh_dialects
from lighthouse.workload import benchmark, get_bench_wrapper_schedule
from lighthouse.utils.memref import to_ctype as memref_to_ctype
from lighthouse.utils.numpy import numpy_to_ctype
Expand Down Expand Up @@ -375,6 +376,8 @@ def parse_cli():
identity_weights = args.check_result

with ir.Context(), ir.Location.unknown():
lh_dialects.register_and_load()

wload = XeGPUMLP(
batch_size=args.batch_size,
input_size=args.input_size,
Expand Down
4 changes: 4 additions & 0 deletions lighthouse/dialects/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
def register_and_load():
    """Load all Lighthouse dialect extensions (currently only `transform_ext`).

    The import is deliberately local so the extension module is only pulled
    in when dialect loading is actually requested.
    """
    from . import transform_ext as _transform_ext

    _transform_ext.register_and_load()
129 changes: 129 additions & 0 deletions lighthouse/dialects/transform_ext.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
from mlir import ir
from mlir.dialects import ext, transform, func, arith, scf, memref
from mlir.dialects.transform import DiagnosedSilenceableFailure

from lighthouse.utils.mlir import func_cif


def register_and_load(context=None):
    """Load the Lighthouse transform extension dialect.

    NOTE(review): `context` is accepted but never used — the dialect is
    loaded via `TransformExtensionDialect.load()` with no arguments, i.e.
    into the implicit/default context. Confirm whether `context` should be
    forwarded to `load`.
    """
    TransformExtensionDialect.load()


class TransformExtensionDialect(ext.Dialect, name="transform_ext"):
    """Dialect container for Lighthouse's custom transform extension ops."""

    @classmethod
    def load(cls, *args, **kwargs):
        """Load the dialect, then attach interface models to every operation
        class that provides an ``attach_interface_impls`` hook."""
        super().load(*args, **kwargs)
        for operation_cls in cls.operations:
            attach = getattr(operation_cls, "attach_interface_impls", None)
            if attach is not None:
                attach()


class WrapInBenchingFuncOp(
    TransformExtensionDialect.Operation, name="wrap_in_benching_func"
):
    """Create a function that calls `target` function in a benchmarking loop.

    The new function has the same arguments as `target` plus three additional ones:
    - A memref to store the timing results (one element per iteration).
    - The number of timed iterations.
    - The number of warmup iterations.
    """

    # Handle to the payload function op(s) to wrap.
    target: ext.Operand[transform.AnyOpType]
    # Handle to the generated benchmark function(s).
    bench_func: ext.Result[transform.AnyOpType[()]]

    @classmethod
    def attach_interface_impls(cls, context=None):
        # Register the Python-defined interface models for this op so the
        # transform interpreter can apply it and query its memory effects.
        cls.TransformOpInterfaceModel.attach(cls.OPERATION_NAME, context=context)
        cls.MemoryEffectsOpInterfaceModel.attach(cls.OPERATION_NAME, context=context)

    @staticmethod
    def wrap_in_benching_func(target: func.FuncOp, bench_name: str):
        """Create a function that calls `target` in a benchmarking loop.

        Each call to `target` is timed separately, and the times (in seconds)
        are stored in a memref that is passed as an additional argument to the
        benchmark function. It also takes two additional arguments for the
        number of runs and warmup iterations.
        """

        # define rtclock function
        # NOTE(review): this declares `rtclock` unconditionally on every call;
        # if invoked more than once at the same insertion point (multiple
        # targets), duplicate declarations may be emitted — confirm symbol
        # uniqueness is handled upstream.
        f64_t = ir.F64Type.get()
        func.FuncOp("rtclock", ((), (f64_t,)), visibility="private")
        # emit benchmark function
        time_memref_t = ir.MemRefType.get((ir.ShapedType.get_dynamic_size(),), f64_t)
        index_t = ir.IndexType.get()
        # Signature = target's inputs + (times memref, num runs, num warmup).
        args = target.type.inputs + [time_memref_t, index_t, index_t]

        @func_cif(*args, name=bench_name)
        def bench(*args):
            zero = arith.constant(index_t, 0)
            one = arith.constant(index_t, 1)
            # Forward only the original target arguments to the callee.
            func_args = list(args[: len(target.type.inputs)])
            times_memref, num_times, num_warmup = args[-3:]
            # Untimed warmup loop.
            for i in scf.for_(zero, num_warmup, one):
                # FIXME(upstream): func.call needs to wrap _overridden_ CallOp.
                func.CallOp(target, func_args)
                scf.yield_(())
            # TODO: get `num_times` from the `times_memref`.
            # Timed loop: one wall-clock sample per call, stored at index i.
            for i in scf.for_(zero, num_times, one):
                tic = func.call((f64_t,), "rtclock", ())
                func.CallOp(target, func_args)
                toc = func.call((f64_t,), "rtclock", ())
                time = arith.subf(toc, tic)
                memref.store(time, times_memref, [i])
                scf.yield_(())

        return bench.func_op

    class TransformOpInterfaceModel(transform.TransformOpInterface):
        """Defines how the transform interpreter applies this op."""

        @staticmethod
        def apply(
            op: "WrapInBenchingFuncOp",
            _rewriter: transform.TransformRewriter,
            results: transform.TransformResults,
            state: transform.TransformState,
        ) -> DiagnosedSilenceableFailure:
            targets = state.get_payload_ops(op.target)
            # An explicit bench name is only valid for a single target,
            # since all generated functions would share that one symbol.
            if bench_name_attr := op.attributes.get("bench_name"):
                bench_name = bench_name_attr.value
                if len(targets) != 1:
                    return DiagnosedSilenceableFailure.SilenceableFailure
            else:
                bench_name = None

            bench_funcs = []
            for target in targets:
                # Only `func.func` payload ops can be wrapped.
                if not isinstance(target, func.FuncOp):
                    return DiagnosedSilenceableFailure.SilenceableFailure

                # Insert the benchmark function immediately before the target.
                with ir.InsertionPoint(target), target.location:
                    bench_func = WrapInBenchingFuncOp.wrap_in_benching_func(
                        target, bench_name or f"bench_{target.name.value}"
                    )
                bench_funcs.append(bench_func)

            results.set_ops(op.bench_func, bench_funcs)

            return DiagnosedSilenceableFailure.Success

        @staticmethod
        def allow_repeated_handle_operands(_op: "WrapInBenchingFuncOp") -> bool:
            # Each payload function should be wrapped at most once.
            return False

    class MemoryEffectsOpInterfaceModel(ir.MemoryEffectsOpInterface):
        """Declares the op's effects on its handles and the payload IR."""

        @staticmethod
        def get_effects(op: "WrapInBenchingFuncOp", effects):
            transform.only_reads_handle(op.op_operands, effects)
            transform.produces_handle(op.results, effects)
            transform.modifies_payload(effects)


def wrap_in_benching_func(
    target: ir.Value[transform.AnyOpType], bench_name: str | None = None
) -> ir.Value[transform.AnyOpType]:
    """Build a `wrap_in_benching_func` op and return its result handle.

    When `bench_name` is given, it is recorded as a string attribute on the
    op; otherwise the benchmark function name is derived from the target.
    """
    wrap_op = WrapInBenchingFuncOp(target=target)
    if bench_name is None:
        return wrap_op.bench_func
    wrap_op.attributes["bench_name"] = ir.StringAttr.get(bench_name)
    return wrap_op.bench_func
54 changes: 0 additions & 54 deletions lighthouse/schedule/pattern_schedule.py

This file was deleted.

2 changes: 0 additions & 2 deletions lighthouse/workload/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,11 @@
from .runner import (
execute,
benchmark,
bench_wrapper_pattern,
get_bench_wrapper_schedule,
)

__all__ = [
"Workload",
"bench_wrapper_pattern",
"benchmark",
"execute",
"get_bench_wrapper_schedule",
Expand Down
Loading
Loading