I have been experimenting with different schedules and am getting the following error. I'm not sure how to fix it; can anyone help?
python examples/xegpu/softmax.py --dump-schedule --dump-kernel=initial | /home/jovyan/llvm-project/build_llvm_upstream_python/bin/mlir-opt --pass-pipeline="builtin.module(transform-interpreter)"
LLVM ERROR: Loading a dialect (dlti) while in a multi-threaded execution context (maybe the PassManager): this can indicate a missing `dependentDialects` in a pass for example.
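From the error text, my guess is that one of the passes invoked through transform.apply_registered_pass loads the dlti dialect lazily while the pass manager is already running multi-threaded, which MLIR forbids. Passing --mlir-disable-threading to mlir-opt should sidestep the multi-threaded context, but I assume the real fix is for the offending pass to declare dlti as a dependent dialect. Here is a minimal C++ sketch of what I understand such a declaration to look like (the pass name and structure are hypothetical, not from this project):

// Minimal sketch (assumed pass name, not from this repo): a pass that
// creates dlti attributes/ops must declare the dialect so the context
// loads it eagerly, before the PassManager fans out across threads.
#include "mlir/Dialect/DLTI/DLTI.h"
#include "mlir/IR/BuiltinOps.h"
#include "mlir/IR/DialectRegistry.h"
#include "mlir/Pass/Pass.h"

namespace {
struct MyGpuLoweringPass
    : public mlir::PassWrapper<MyGpuLoweringPass,
                               mlir::OperationPass<mlir::ModuleOp>> {
  MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MyGpuLoweringPass)

  // Declaring dlti here is what the error message says is missing: the
  // dialect gets loaded up front instead of lazily on a worker thread.
  void getDependentDialects(mlir::DialectRegistry &registry) const override {
    registry.insert<mlir::DLTIDialect>();
  }

  void runOnOperation() override {
    // ... actual lowering logic ...
  }
};
} // namespace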
Here are my payload and transform schedule.
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0, d1) -> (d0)>
module {
  func.func @payload(%arg0: memref<1024x64xf32>, %arg1: memref<1024x64xf32>) attributes {llvm.emit_c_interface} {
    %0 = bufferization.to_tensor %arg0 restrict writable : memref<1024x64xf32> to tensor<1024x64xf32>
    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x64xf32> to tensor<1024x64xf32>
    %cst = arith.constant 0xFF800000 : f32
    %2 = tensor.empty() : tensor<1024xf32>
    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<1024xf32>) -> tensor<1024xf32>
    %4 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%1 : tensor<1024x64xf32>) outs(%3 : tensor<1024xf32>) {
    ^bb0(%in: f32, %out: f32):
      %12 = arith.maximumf %in, %out : f32
      linalg.yield %12 : f32
    } -> tensor<1024xf32>
    %5 = tensor.empty() : tensor<1024x64xf32>
    %6 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%1, %4 : tensor<1024x64xf32>, tensor<1024xf32>) outs(%5 : tensor<1024x64xf32>) {
    ^bb0(%in: f32, %in_1: f32, %out: f32):
      %12 = arith.subf %in, %in_1 : f32
      linalg.yield %12 : f32
    } -> tensor<1024x64xf32>
    %7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel"]} ins(%6 : tensor<1024x64xf32>) outs(%5 : tensor<1024x64xf32>) {
    ^bb0(%in: f32, %out: f32):
      %12 = math.exp %in : f32
      linalg.yield %12 : f32
    } -> tensor<1024x64xf32>
    %8 = tensor.empty() : tensor<1024xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %9 = linalg.fill ins(%cst_0 : f32) outs(%8 : tensor<1024xf32>) -> tensor<1024xf32>
    %10 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%7 : tensor<1024x64xf32>) outs(%9 : tensor<1024xf32>) {
    ^bb0(%in: f32, %out: f32):
      %12 = arith.addf %in, %out : f32
      linalg.yield %12 : f32
    } -> tensor<1024xf32>
    %11 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel"]} ins(%7, %10 : tensor<1024x64xf32>, tensor<1024xf32>) outs(%5 : tensor<1024x64xf32>) {
    ^bb0(%in: f32, %in_1: f32, %out: f32):
      %12 = arith.divf %in, %in_1 : f32
      linalg.yield %12 : f32
    } -> tensor<1024x64xf32>
    bufferization.materialize_in_destination %11 in restrict writable %arg0 : (tensor<1024x64xf32>, memref<1024x64xf32>) -> ()
    return
  }
  func.func @gpu_alloc_2d_f32(%arg0: i32, %arg1: i32) -> memref<?x?xf32> attributes {llvm.emit_c_interface} {
    %0 = arith.index_cast %arg0 : i32 to index
    %1 = arith.index_cast %arg1 : i32 to index
    %memref = gpu.alloc (%0, %1) : memref<?x?xf32>
    return %memref : memref<?x?xf32>
  }
  func.func @gpu_dealloc_2d_f32(%arg0: memref<?x?xf32>) attributes {llvm.emit_c_interface} {
    gpu.dealloc %arg0 : memref<?x?xf32>
    return
  }
  func.func @gpu_copy_2d_f32(%arg0: memref<?x?xf32>, %arg1: memref<?x?xf32>) attributes {llvm.emit_c_interface} {
    gpu.memcpy %arg1, %arg0 : memref<?x?xf32>, memref<?x?xf32>
    return
  }
}
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
    %0 = transform.structured.match ops{["linalg.generic"]} in %arg0 : (!transform.any_op) -> !transform.any_op
    %1:5 = transform.split_handle %0 : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op, !transform.any_op)
    %tiled_op, %forall_op = transform.structured.tile_using_forall %1#4 tile_sizes [64] : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused_op, %new_containing_op = transform.structured.fuse_into_containing_op %1#3 into %forall_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused_op_0, %new_containing_op_1 = transform.structured.fuse_into_containing_op %1#2 into %new_containing_op : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused_op_2, %new_containing_op_3 = transform.structured.fuse_into_containing_op %1#1 into %new_containing_op_1 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    %fused_op_4, %new_containing_op_5 = transform.structured.fuse_into_containing_op %1#0 into %new_containing_op_3 : (!transform.any_op, !transform.any_op) -> (!transform.any_op, !transform.any_op)
    %2 = transform.get_parent_op %new_containing_op_5 {deduplicate, op_name = "func.func"} : (!transform.any_op) -> !transform.any_op
    transform.apply_cse to %2 : !transform.any_op
    transform.apply_patterns to %2 {
      transform.apply_patterns.canonicalization
    } : !transform.any_op
    %3 = transform.apply_registered_pass "eliminate-empty-tensors" to %2 : (!transform.any_op) -> !transform.any_op
    %4 = transform.structured.vectorize_children_and_apply_patterns %3 {fold_type_extensions_into_contract} : (!transform.any_op) -> !transform.any_op
    %5 = transform.get_parent_op %4 {deduplicate, op_name = "builtin.module"} : (!transform.any_op) -> !transform.any_op
    %6 = transform.bufferization.one_shot_bufferize layout{IdentityLayoutMap} %5 {allow_return_allocs_from_loops = true, bufferize_function_boundaries = true} : (!transform.any_op) -> !transform.any_op
    %7 = transform.apply_registered_pass "fold-memref-alias-ops" to %6 : (!transform.any_op) -> !transform.any_op
    transform.apply_cse to %7 : !transform.any_op
    transform.apply_patterns to %7 {
      transform.apply_patterns.canonicalization
    } : !transform.any_op
    %8 = transform.structured.match ops{["scf.forall"]} in %7 : (!transform.any_op) -> !transform.any_op
    %9 = transform.split_handle %8 : (!transform.any_op) -> !transform.any_op
    %10 = transform.loop.forall_to_parallel %9 : (!transform.any_op) -> !transform.any_op
    %11 = transform.get_parent_op %10 : (!transform.any_op) -> !transform.any_op
    %12 = transform.apply_registered_pass "gpu-map-parallel-loops" to %11 : (!transform.any_op) -> !transform.any_op
    %13 = transform.apply_registered_pass "convert-parallel-loops-to-gpu" to %12 : (!transform.any_op) -> !transform.any_op
    %14 = transform.apply_registered_pass "lower-affine" to %13 : (!transform.any_op) -> !transform.any_op
    transform.apply_cse to %14 : !transform.any_op
    transform.apply_patterns to %14 {
      transform.apply_patterns.canonicalization
    } : !transform.any_op
    %15 = transform.structured.match ops{["gpu.launch"]} in %14 : (!transform.any_op) -> !transform.any_op
    %16 = transform.split_handle %15 : (!transform.any_op) -> !transform.any_op
    transform.xegpu.set_gpu_launch_threads %16 threads = [128, 1, 1] : !transform.any_op
    %17 = transform.apply_registered_pass "lower-affine" to %14 : (!transform.any_op) -> !transform.any_op
    transform.apply_patterns to %17 {
      transform.apply_patterns.canonicalization
    } : !transform.any_op
    %18 = transform.apply_registered_pass "gpu-launch-sink-index-computations" to %17 : (!transform.any_op) -> !transform.any_op
    %19 = transform.get_parent_op %18 {deduplicate, op_name = "builtin.module"} : (!transform.any_op) -> !transform.any_op
    %20 = transform.apply_registered_pass "gpu-kernel-outlining" to %19 : (!transform.any_op) -> !transform.any_op
    transform.yield
  }
}
Is there anything I am doing wrong?