From 2045089ac1623572aba2320d2660b5e031b0aba0 Mon Sep 17 00:00:00 2001 From: erweiw Date: Thu, 9 Apr 2026 20:27:47 -0700 Subject: [PATCH 1/9] Add multi-dtype support to vec-add example (bf16, f32, i8, i16) Extends the vec-add example to support bf16, f32 (via bf16-emulation), i8, and i16 data types, inspired by mlir-air's triton_vec_add test. Driver changes: - Add dtype detection from Linalg IR (_detect_element_type) - Add placeholder substitution (@DTYPE@, @PAD_VAL@, @VECTOR_SIZE@) in transform scripts, resolved before library injection based on the IR element type and NPU version. Backward-compatible: no-op when no placeholders are present. Transform library: - Add pad_and_promote_binary_{f32,i8,i16} sequences alongside the existing bf16 variant. Vec-add example: - Add --dtype and --bf16-emulation CLI arguments - Transform scripts now use @DTYPE@ and @VECTOR_SIZE@ placeholders, making them dtype-generic across both AIE2 and AIE2P. Tested on NPU2 (Strix/AIE2P): all 4 dtypes pass correctness checks across vector sizes 1024-32768. Co-Authored-By: Claude Opus 4.6 (1M context) --- amd_triton_npu/backend/driver.py | 68 +++++++++++- .../transform_library/elementwise.mlir | 94 ++++++++++++++++ examples/vec-add/transform_aie2.mlir | 10 +- examples/vec-add/transform_aie2p.mlir | 10 +- examples/vec-add/vec-add.py | 102 ++++++++++++++---- 5 files changed, 256 insertions(+), 28 deletions(-) diff --git a/amd_triton_npu/backend/driver.py b/amd_triton_npu/backend/driver.py index 7887cff..8c3634b 100644 --- a/amd_triton_npu/backend/driver.py +++ b/amd_triton_npu/backend/driver.py @@ -410,7 +410,57 @@ def _replace_include(m): return result -def _get_transform_ir_string(): +def _detect_element_type(ir_str): + """Detect the primary element type from the Linalg IR function signature. + + Scans memref types in the first func.func line for the element type. + Returns the MLIR type string (e.g., "bf16", "f32", "i8", "i16"). + Falls back to "bf16" if detection fails. 
+ """ + import re + + # Match memref<...xTYPE> in the function signature + match = re.search(r"memref<[^>]*x(\w+)>", ir_str) + if match: + return match.group(1) + return "bf16" + + +# Dtype-aware placeholder info: padding value and default vector size per NPU. +_DTYPE_PLACEHOLDER_INFO = { + "bf16": {"pad_val": "0.0 : bf16", "vector_size": {"npu1": 16, "npu2": 32}}, + "f32": {"pad_val": "0.0 : f32", "vector_size": {"npu1": 16, "npu2": 16}}, + "i8": {"pad_val": "0 : i8", "vector_size": {"npu1": 32, "npu2": 32}}, + "i16": {"pad_val": "0 : i16", "vector_size": {"npu1": 32, "npu2": 32}}, + "i32": {"pad_val": "0 : i32", "vector_size": {"npu1": 16, "npu2": 16}}, +} + + +def _substitute_dtype_placeholders(script, dtype, npu_version): + """Substitute dtype-aware placeholders in a transform script. + + Replaces @DTYPE@, @PAD_VAL@, and @VECTOR_SIZE@ with values derived + from the detected element type and target NPU version. + No-op if the script contains no placeholders (backward compatible). + """ + if ( + "@DTYPE@" not in script + and "@PAD_VAL@" not in script + and "@VECTOR_SIZE@" not in script + ): + return script + info = _DTYPE_PLACEHOLDER_INFO.get(dtype) + if info is None: + return script + script = script.replace("@DTYPE@", dtype) + script = script.replace("@PAD_VAL@", info["pad_val"]) + script = script.replace( + "@VECTOR_SIZE@", str(info["vector_size"].get(npu_version, 16)) + ) + return script + + +def _get_transform_ir_string(ir_str=None): """ Get the transform IR string for tiling operations. @@ -421,6 +471,12 @@ def _get_transform_ir_string(): If the script uses `transform.include`, the shared transform library (transform_library.mlir) is automatically injected. + If ir_str is provided, dtype-aware placeholders (@DTYPE@, @PAD_VAL@, + @VECTOR_SIZE@) are substituted before library injection. + + Args: + ir_str: Optional Linalg IR string for dtype detection. 
+ Returns: str: The transform IR string to use for tiling """ @@ -436,6 +492,14 @@ def _get_transform_ir_string(): with open(custom_script_path, "r") as f: print(f"Using custom tiling script from: {custom_script_path}") user_script = f.read() + if ir_str is not None: + dtype = _detect_element_type( + ir_str if isinstance(ir_str, str) else str(ir_str) + ) + npu_version = detect_npu_version() + user_script = _substitute_dtype_placeholders( + user_script, dtype, npu_version + ) return _inject_transform_library(user_script) # Default hardcoded transform IR string @@ -493,7 +557,7 @@ def _ttshared_to_air(mod, gridX, gridY, gridZ, actual_sizes=None): pm = air.passmanager.PassManager.parse(pipeline, context=air_context) pm.run(air_module.operation) # MLIR-AIR compilation step 2: tiling the launch body - transform_ir_string = _get_transform_ir_string() + transform_ir_string = _get_transform_ir_string(ir_str=mod) transform_ir = Module.parse(transform_ir_string, context=air_context) run_transform(transform_ir, air_module) # MLIR-AIR compilation step 3: converting to AIR diff --git a/amd_triton_npu/backend/transform_library/elementwise.mlir b/amd_triton_npu/backend/transform_library/elementwise.mlir index 26fda74..cd847cb 100644 --- a/amd_triton_npu/backend/transform_library/elementwise.mlir +++ b/amd_triton_npu/backend/transform_library/elementwise.mlir @@ -96,3 +96,97 @@ transform.named_sequence @pad_and_promote_binary_bf16( {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op transform.yield } + +// Binary variant for f32: 2 inputs + 1 output = 3 operands. +// Used with bf16-emulation (f32 data, bf16 compute on AIE cores). 
+transform.named_sequence @pad_and_promote_binary_f32( + %module: !transform.any_op {transform.readonly}) { + %op = transform.structured.match ops{["linalg.generic"]} in %module + : (!transform.any_op) -> !transform.any_op + %padded_op, %pad_op, %__ = transform.structured.pad %op { + padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], + padding_dimensions=[0, 1, 2], + nofold_flags=[1, 1, 1], + copy_back_op="linalg.copy" + } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %pad_dps = transform.structured.rewrite_in_destination_passing_style %pad_op + : (!transform.any_op) -> !transform.any_op + %padded_lhs = transform.get_producer_of_operand %padded_op[0] + : (!transform.any_op) -> (!transform.any_op) + %padded_lhs_buffer, %padded_lhs_new = + transform.structured.bufferize_to_allocation %padded_lhs + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + %padded_rhs = transform.get_producer_of_operand %padded_op[1] + : (!transform.any_op) -> (!transform.any_op) + %padded_rhs_buffer, %padded_rhs_new = + transform.structured.bufferize_to_allocation %padded_rhs + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + %padded_result = transform.get_producer_of_operand %padded_op[2] + : (!transform.any_op) -> (!transform.any_op) + %padded_result_buffer, %padded_result_new = + transform.structured.bufferize_to_allocation %padded_result + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + transform.yield +} + +// Binary variant for i8: 2 inputs + 1 output = 3 operands. 
+transform.named_sequence @pad_and_promote_binary_i8( + %module: !transform.any_op {transform.readonly}) { + %op = transform.structured.match ops{["linalg.generic"]} in %module + : (!transform.any_op) -> !transform.any_op + %padded_op, %pad_op, %__ = transform.structured.pad %op { + padding_values=[0 : i8, 0 : i8, 0 : i8], + padding_dimensions=[0, 1, 2], + nofold_flags=[1, 1, 1], + copy_back_op="linalg.copy" + } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %pad_dps = transform.structured.rewrite_in_destination_passing_style %pad_op + : (!transform.any_op) -> !transform.any_op + %padded_lhs = transform.get_producer_of_operand %padded_op[0] + : (!transform.any_op) -> (!transform.any_op) + %padded_lhs_buffer, %padded_lhs_new = + transform.structured.bufferize_to_allocation %padded_lhs + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + %padded_rhs = transform.get_producer_of_operand %padded_op[1] + : (!transform.any_op) -> (!transform.any_op) + %padded_rhs_buffer, %padded_rhs_new = + transform.structured.bufferize_to_allocation %padded_rhs + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + %padded_result = transform.get_producer_of_operand %padded_op[2] + : (!transform.any_op) -> (!transform.any_op) + %padded_result_buffer, %padded_result_new = + transform.structured.bufferize_to_allocation %padded_result + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + transform.yield +} + +// Binary variant for i16: 2 inputs + 1 output = 3 operands. 
+transform.named_sequence @pad_and_promote_binary_i16( + %module: !transform.any_op {transform.readonly}) { + %op = transform.structured.match ops{["linalg.generic"]} in %module + : (!transform.any_op) -> !transform.any_op + %padded_op, %pad_op, %__ = transform.structured.pad %op { + padding_values=[0 : i16, 0 : i16, 0 : i16], + padding_dimensions=[0, 1, 2], + nofold_flags=[1, 1, 1], + copy_back_op="linalg.copy" + } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %pad_dps = transform.structured.rewrite_in_destination_passing_style %pad_op + : (!transform.any_op) -> !transform.any_op + %padded_lhs = transform.get_producer_of_operand %padded_op[0] + : (!transform.any_op) -> (!transform.any_op) + %padded_lhs_buffer, %padded_lhs_new = + transform.structured.bufferize_to_allocation %padded_lhs + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + %padded_rhs = transform.get_producer_of_operand %padded_op[1] + : (!transform.any_op) -> (!transform.any_op) + %padded_rhs_buffer, %padded_rhs_new = + transform.structured.bufferize_to_allocation %padded_rhs + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + %padded_result = transform.get_producer_of_operand %padded_op[2] + : (!transform.any_op) -> (!transform.any_op) + %padded_result_buffer, %padded_result_new = + transform.structured.bufferize_to_allocation %padded_result + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + transform.yield +} diff --git a/examples/vec-add/transform_aie2.mlir b/examples/vec-add/transform_aie2.mlir index b192305..5fdcf4f 100644 --- a/examples/vec-add/transform_aie2.mlir +++ b/examples/vec-add/transform_aie2.mlir @@ -4,8 +4,10 @@ //////////////////////////////////////////////////////////////////////////////// // Transform Script for Vector Addition (AIE2) // Simple elementwise add: out = a + b -// Binary op (2 inputs + 1 output). No fusion needed. 
Vec tile = 16 (AIE2). -// No type casts needed (bf16 add is native). +// Binary op (2 inputs + 1 output). No fusion needed. +// No type casts needed (bf16/i8/i16 add is native; f32 uses bf16-emulation). +// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders substituted +// by the driver based on the IR element type and NPU version. // Uses shared library sequences from transform_library.mlir (auto-injected). //////////////////////////////////////////////////////////////////////////////// @@ -18,7 +20,7 @@ module attributes {transform.with_named_sequence} { (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @pad_and_promote_binary_bf16 failures(propagate) + transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () @@ -27,7 +29,7 @@ module attributes {transform.with_named_sequence} { transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @vectorize_generics_at_16 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) (%arg1) : (!transform.any_op) -> () %vh = transform.include @air_herd_mapping_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op diff --git a/examples/vec-add/transform_aie2p.mlir b/examples/vec-add/transform_aie2p.mlir index c9bae4f..9dad749 100644 --- a/examples/vec-add/transform_aie2p.mlir +++ b/examples/vec-add/transform_aie2p.mlir @@ -4,8 +4,10 @@ //////////////////////////////////////////////////////////////////////////////// // Transform Script for Vector Addition (AIE2P) // Simple elementwise add: out = a + b -// Binary op (2 inputs + 1 output). No fusion needed. Vec tile = 32 (AIE2P). -// No type casts needed (bf16 add is native). 
+// Binary op (2 inputs + 1 output). No fusion needed. +// No type casts needed (bf16/i8/i16 add is native; f32 uses bf16-emulation). +// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders substituted +// by the driver based on the IR element type and NPU version. // Uses shared library sequences from transform_library.mlir (auto-injected). //////////////////////////////////////////////////////////////////////////////// @@ -18,7 +20,7 @@ module attributes {transform.with_named_sequence} { (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @pad_and_promote_binary_bf16 failures(propagate) + transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () @@ -27,7 +29,7 @@ module attributes {transform.with_named_sequence} { transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @vectorize_generics_at_32 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) (%arg1) : (!transform.any_op) -> () %vh = transform.include @air_herd_mapping_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op diff --git a/examples/vec-add/vec-add.py b/examples/vec-add/vec-add.py index c5452dd..fafb087 100644 --- a/examples/vec-add/vec-add.py +++ b/examples/vec-add/vec-add.py @@ -1,17 +1,51 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT -# this is a benchmark for adding vectors with maximum block size -# to check the performance of tl.dot operation +# Vector addition benchmark supporting multiple data types. +# Supports bf16 (default), f32 (via bf16-emulation), i8, and i16. 
+import argparse import torch import triton import triton.language as tl -import sys, os +import sys +import os sys.path.append(os.path.abspath("..")) import benchmark +# Dtype configuration: torch type, whether it's a float, tolerances. +DTYPE_CONFIG = { + "bf16": { + "torch_dtype": torch.bfloat16, + "is_float": True, + "atol": 1e-2, + "rtol": 1e-2, + "bf16_emulation": False, + }, + "f32": { + "torch_dtype": torch.float32, + "is_float": True, + "atol": 1e-1, + "rtol": 5e-2, + "bf16_emulation": True, # f32 addf not native on AIE; requires bf16-emulation + }, + "i8": { + "torch_dtype": torch.int8, + "is_float": False, + "atol": 0, + "rtol": 0, + "bf16_emulation": False, + }, + "i16": { + "torch_dtype": torch.int16, + "is_float": False, + "atol": 0, + "rtol": 0, + "bf16_emulation": False, + }, +} + @triton.jit def vecadd( @@ -25,8 +59,6 @@ def vecadd( block_start = pid * BLOCK_SIZE_N offsets = block_start + tl.arange(0, BLOCK_SIZE_N) - # mask = offsets < n_elements #AMK - in triton example, do we need? 
- a_block = tl.load(A + offsets[:]) b_block = tl.load(B + offsets[:]) @@ -35,35 +67,69 @@ def vecadd( tl.store(C + offsets[:], c_block) -# @benchmark.measure() -def bench_vecadd(N, provider): +def bench_vecadd(N, provider, cfg): device = "cpu" - dtype_in = torch.bfloat16 - dtype_out = ( - torch.bfloat16 - ) # torch.float32 won't work due to unsupported `%33 = fpext <8 x bfloat> %32 to <8 x float>` - a = torch.randn(N, device=device, dtype=dtype_in) - b = torch.randn(N, device=device, dtype=dtype_in) - c = torch.empty(N, device=device, dtype=dtype_out) + torch_dtype = cfg["torch_dtype"] + + if cfg["is_float"]: + a = torch.randn(N, device=device, dtype=torch_dtype) + b = torch.randn(N, device=device, dtype=torch_dtype) + else: + # Clamp to half-max to avoid overflow on addition + iinfo = torch.iinfo(torch_dtype) + half_max = iinfo.max // 2 + a = torch.randint(0, half_max, (N,), device=device, dtype=torch_dtype) + b = torch.randint(0, half_max, (N,), device=device, dtype=torch_dtype) + + c = torch.empty(N, device=device, dtype=torch_dtype) + if provider == "torch" or provider == "test": c_ref = torch.add(a, b) if provider == "triton" or provider == "test": - # 2D launch kernel where each block gets its own program. 
grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE_N"]),) compiled_kernel = vecadd[grid]( a, b, c, N, - BLOCK_SIZE_N=1024, # TODO: small tile sizes currently face errors due to lock race condition at memtiles + BLOCK_SIZE_N=1024, ) with open("tt.shared.mlir", "w") as f: f.write(str(compiled_kernel.asm["ttsharedir"])) if provider == "test": - torch.testing.assert_close(c, c_ref, atol=1e-2, rtol=1e-2) + torch.testing.assert_close(c, c_ref, atol=cfg["atol"], rtol=cfg["rtol"]) if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Vector addition benchmark for AMD NPU" + ) + parser.add_argument( + "--dtype", + type=str, + choices=list(DTYPE_CONFIG.keys()), + default="bf16", + help="Element data type (default: bf16)", + ) + parser.add_argument( + "--bf16-emulation", + dest="bf16_emulation", + default=False, + action="store_true", + help="Use f32 data type with bf16 emulation on AIE cores", + ) + args = parser.parse_args() + + # --bf16-emulation is shorthand for --dtype f32 + if args.bf16_emulation: + args.dtype = "f32" + + cfg = DTYPE_CONFIG[args.dtype] + + # Enable bf16 emulation env var when needed + if cfg["bf16_emulation"]: + os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1" + benchmark.select_npu_backend() for N in [2**i for i in range(10, 16, 1)]: - bench_vecadd(N, "test") + bench_vecadd(N, "test", cfg) From a7c3f5ce4295d937bf8805a2d400fdd1488234f4 Mon Sep 17 00:00:00 2001 From: erweiw Date: Thu, 9 Apr 2026 20:35:29 -0700 Subject: [PATCH 2/9] Address Copilot review: guard placeholder substitution - Only call detect_npu_version() when @VECTOR_SIZE@ placeholder is actually present, avoiding failures in environments without xrt-smi - Raise ValueError with supported types when an unsupported element type is detected but placeholders are present - Fix _detect_element_type docstring to match actual behavior Co-Authored-By: Claude Opus 4.6 (1M context) --- amd_triton_npu/backend/driver.py | 23 ++++++++++++++++------- 1 file changed, 16 
insertions(+), 7 deletions(-) diff --git a/amd_triton_npu/backend/driver.py b/amd_triton_npu/backend/driver.py index 8c3634b..efe63c7 100644 --- a/amd_triton_npu/backend/driver.py +++ b/amd_triton_npu/backend/driver.py @@ -411,15 +411,16 @@ def _replace_include(m): def _detect_element_type(ir_str): - """Detect the primary element type from the Linalg IR function signature. + """Detect the primary element type from the provided Linalg IR string. - Scans memref types in the first func.func line for the element type. - Returns the MLIR type string (e.g., "bf16", "f32", "i8", "i16"). + Searches the IR text for the first ``memref<...xTYPE>`` occurrence and + returns the captured MLIR element type string (for example, ``"bf16"``, + ``"f32"``, ``"i8"``, or ``"i16"``). Falls back to "bf16" if detection fails. """ import re - # Match memref<...xTYPE> in the function signature + # Match the first memref<...xTYPE> occurrence in the provided IR text. match = re.search(r"memref<[^>]*x(\w+)>", ir_str) if match: return match.group(1) @@ -451,7 +452,12 @@ def _substitute_dtype_placeholders(script, dtype, npu_version): return script info = _DTYPE_PLACEHOLDER_INFO.get(dtype) if info is None: - return script + raise ValueError( + f"Unsupported element type '{dtype}' for transform script placeholder " + f"substitution. Supported types: {list(_DTYPE_PLACEHOLDER_INFO.keys())}. " + f"The script contains @DTYPE@/@PAD_VAL@/@VECTOR_SIZE@ placeholders that " + f"require a supported element type." 
+ ) script = script.replace("@DTYPE@", dtype) script = script.replace("@PAD_VAL@", info["pad_val"]) script = script.replace( @@ -492,11 +498,14 @@ def _get_transform_ir_string(ir_str=None): with open(custom_script_path, "r") as f: print(f"Using custom tiling script from: {custom_script_path}") user_script = f.read() - if ir_str is not None: + _PLACEHOLDERS = ("@DTYPE@", "@PAD_VAL@", "@VECTOR_SIZE@") + if ir_str is not None and any(p in user_script for p in _PLACEHOLDERS): dtype = _detect_element_type( ir_str if isinstance(ir_str, str) else str(ir_str) ) - npu_version = detect_npu_version() + npu_version = ( + detect_npu_version() if "@VECTOR_SIZE@" in user_script else None + ) user_script = _substitute_dtype_placeholders( user_script, dtype, npu_version ) From eaa7210b8e892e02904e1d18e6b1a3906471e11f Mon Sep 17 00:00:00 2001 From: erweiw Date: Thu, 9 Apr 2026 20:36:59 -0700 Subject: [PATCH 3/9] Update vec-add datatypes in examples dashboard Update generate_readme.py registry to reflect multi-dtype support (bf16, f32, i8, i16) and regenerate examples/README.md. Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/README.md | 59 +++++++++++++++++++++++++++++++++++++ examples/generate_readme.py | 2 +- 2 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 examples/README.md diff --git a/examples/README.md b/examples/README.md new file mode 100644 index 0000000..351bfae --- /dev/null +++ b/examples/README.md @@ -0,0 +1,59 @@ + + +# Triton-XDNA Examples + +These examples demonstrate how to write [Triton](https://github.com/triton-lang/triton) kernels that compile and run on AMD XDNA™ NPUs via the [MLIR-AIR](https://github.com/Xilinx/mlir-air) compilation flow. 
+ +## Operator Dashboard + +| Category | Operation | Datatype(s) | AIE2 | AIE2P | Example | +|:---------|:----------|:------------|:----:|:-----:|:--------| +| Matrix | [Matrix Multiplication (BF16)](matmul_bf16_m64_n64_k64/) | bf16 | ✅ | ✅ | [matmul_bf16_m64_n64_k64/](matmul_bf16_m64_n64_k64/) | +| Matrix | [Padded Matrix Multiplication (F32, A Transposed)](matmul_f32_m64_n32_k16_padded_atransposed/) | f32 (bf16 emulation) | — | ✅ | [matmul_f32_m64_n32_k16_padded_atransposed/](matmul_f32_m64_n32_k16_padded_atransposed/) | +| Matrix | [Matrix Multiplication (INT8)](matmul_i8_m64_n64_k64/) | i8 | — | ✅ | [matmul_i8_m64_n64_k64/](matmul_i8_m64_n64_k64/) | +| Matrix | [Matrix Multiplication (INT8, Large Tile)](matmul_i8_m128_n64_k64/) | i8 | — | ✅ | [matmul_i8_m128_n64_k64/](matmul_i8_m128_n64_k64/) | +| Matrix | [Matrix Multiplication (Autotune)](autotune-matmul/) | bf16 | ✅ | — | [autotune-matmul/](autotune-matmul/) | +| Element-wise | [ReLU](relu/) | bf16 | ✅ | ✅ | [relu/](relu/) | +| Element-wise | [Sigmoid](sigmoid/) | bf16 | ✅ | ✅ | [sigmoid/](sigmoid/) | +| Element-wise | [SiLU](silu/) | bf16 | ✅ | ✅ | [silu/](silu/) | +| Element-wise | [GELU](gelu/) | bf16 | — | ✅ | [gelu/](gelu/) | +| Element-wise | [Leaky ReLU](leaky_relu/) | bf16 | ✅ | ✅ | [leaky_relu/](leaky_relu/) | +| Element-wise | [SwiGLU](swiglu/) | bf16 | ✅ | ✅ | [swiglu/](swiglu/) | +| Element-wise | [AXPY](axpy/) | bf16 | ✅ | ✅ | [axpy/](axpy/) | +| Element-wise | [Vector Add](vec-add/) | bf16, f32, i8, i16 | ✅ | ✅ | [vec-add/](vec-add/) | +| Normalization | [RMS Normalization](rms_norm/) | bf16 | — | ✅ | [rms_norm/](rms_norm/) | +| Normalization | [Weighted RMS Normalization](weighted_rms_norm/) | bf16 | ✅ | ✅ | [weighted_rms_norm/](weighted_rms_norm/) | +| Normalization | [Softmax](test_softmax/) | bf16 | ✅ | ✅ | [test_softmax/](test_softmax/) | +| Normalization | [Layer Normalization](test_layernorm/) | f32 | ✅ | ✅ | [test_layernorm/](test_layernorm/) | +| Pooling | [Average Pool](average_pool/) 
| bf16 | ✅ | ✅ | [average_pool/](average_pool/) | +| Special | [2D Block Load](load_2d_block/) | f32 | — | — | [load_2d_block/](load_2d_block/) | +| Special | [Multi-Driver](multi_drivers/) | bf16 | ✅ | ✅ | [multi_drivers/](multi_drivers/) | + +### Legend + +- ✅ Transform file available (device target supported) +- — Not yet available + +**AIE2** = AMD Ryzen™ AI (Phoenix, NPU1)    **AIE2P** = AMD Ryzen™ AI (Strix, NPU2) + +## Running Examples + +Make sure XRT is sourced and a virtual environment with `triton-xdna` is active (see top-level [README](../README.md)): + +```bash +source /opt/xilinx/xrt/setup.sh + +# Run an example on AIE2 (NPU1): +cd matmul_bf16_m64_n64_k64 +AIR_TRANSFORM_TILING_SCRIPT=transform_aie2.mlir python matmul_bf16_m64_n64_k64.py + +# Run on AIE2P (NPU2): +AIR_TRANSFORM_TILING_SCRIPT=transform_aie2p.mlir python matmul_bf16_m64_n64_k64.py +``` + +## Running All Tests + +```bash +python scripts/run_tests.py --device aie2 --verbose +python scripts/run_tests.py --device aie2p --verbose +``` diff --git a/examples/generate_readme.py b/examples/generate_readme.py index a8a06ae..b479cf1 100644 --- a/examples/generate_readme.py +++ b/examples/generate_readme.py @@ -102,7 +102,7 @@ "category": "Element-wise", "name": "Vector Add", "path": "vec-add", - "datatypes": "bf16", + "datatypes": "bf16, f32, i8, i16", }, { "category": "Normalization", From a18807179018df2f325ea8c3a2e5c50c84afa950 Mon Sep 17 00:00:00 2001 From: erweiw Date: Thu, 9 Apr 2026 20:39:15 -0700 Subject: [PATCH 4/9] Remove auto-generated README.md (generated by CI) examples/README.md is auto-generated by generate_readme.py in CI and should not be committed. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/README.md | 59 ---------------------------------------------- 1 file changed, 59 deletions(-) delete mode 100644 examples/README.md diff --git a/examples/README.md b/examples/README.md deleted file mode 100644 index 351bfae..0000000 --- a/examples/README.md +++ /dev/null @@ -1,59 +0,0 @@ - - -# Triton-XDNA Examples - -These examples demonstrate how to write [Triton](https://github.com/triton-lang/triton) kernels that compile and run on AMD XDNA™ NPUs via the [MLIR-AIR](https://github.com/Xilinx/mlir-air) compilation flow. - -## Operator Dashboard - -| Category | Operation | Datatype(s) | AIE2 | AIE2P | Example | -|:---------|:----------|:------------|:----:|:-----:|:--------| -| Matrix | [Matrix Multiplication (BF16)](matmul_bf16_m64_n64_k64/) | bf16 | ✅ | ✅ | [matmul_bf16_m64_n64_k64/](matmul_bf16_m64_n64_k64/) | -| Matrix | [Padded Matrix Multiplication (F32, A Transposed)](matmul_f32_m64_n32_k16_padded_atransposed/) | f32 (bf16 emulation) | — | ✅ | [matmul_f32_m64_n32_k16_padded_atransposed/](matmul_f32_m64_n32_k16_padded_atransposed/) | -| Matrix | [Matrix Multiplication (INT8)](matmul_i8_m64_n64_k64/) | i8 | — | ✅ | [matmul_i8_m64_n64_k64/](matmul_i8_m64_n64_k64/) | -| Matrix | [Matrix Multiplication (INT8, Large Tile)](matmul_i8_m128_n64_k64/) | i8 | — | ✅ | [matmul_i8_m128_n64_k64/](matmul_i8_m128_n64_k64/) | -| Matrix | [Matrix Multiplication (Autotune)](autotune-matmul/) | bf16 | ✅ | — | [autotune-matmul/](autotune-matmul/) | -| Element-wise | [ReLU](relu/) | bf16 | ✅ | ✅ | [relu/](relu/) | -| Element-wise | [Sigmoid](sigmoid/) | bf16 | ✅ | ✅ | [sigmoid/](sigmoid/) | -| Element-wise | [SiLU](silu/) | bf16 | ✅ | ✅ | [silu/](silu/) | -| Element-wise | [GELU](gelu/) | bf16 | — | ✅ | [gelu/](gelu/) | -| Element-wise | [Leaky ReLU](leaky_relu/) | bf16 | ✅ | ✅ | [leaky_relu/](leaky_relu/) | -| Element-wise | [SwiGLU](swiglu/) | bf16 | ✅ | ✅ | [swiglu/](swiglu/) | -| Element-wise | [AXPY](axpy/) | bf16 | ✅ 
| ✅ | [axpy/](axpy/) | -| Element-wise | [Vector Add](vec-add/) | bf16, f32, i8, i16 | ✅ | ✅ | [vec-add/](vec-add/) | -| Normalization | [RMS Normalization](rms_norm/) | bf16 | — | ✅ | [rms_norm/](rms_norm/) | -| Normalization | [Weighted RMS Normalization](weighted_rms_norm/) | bf16 | ✅ | ✅ | [weighted_rms_norm/](weighted_rms_norm/) | -| Normalization | [Softmax](test_softmax/) | bf16 | ✅ | ✅ | [test_softmax/](test_softmax/) | -| Normalization | [Layer Normalization](test_layernorm/) | f32 | ✅ | ✅ | [test_layernorm/](test_layernorm/) | -| Pooling | [Average Pool](average_pool/) | bf16 | ✅ | ✅ | [average_pool/](average_pool/) | -| Special | [2D Block Load](load_2d_block/) | f32 | — | — | [load_2d_block/](load_2d_block/) | -| Special | [Multi-Driver](multi_drivers/) | bf16 | ✅ | ✅ | [multi_drivers/](multi_drivers/) | - -### Legend - -- ✅ Transform file available (device target supported) -- — Not yet available - -**AIE2** = AMD Ryzen™ AI (Phoenix, NPU1)    **AIE2P** = AMD Ryzen™ AI (Strix, NPU2) - -## Running Examples - -Make sure XRT is sourced and a virtual environment with `triton-xdna` is active (see top-level [README](../README.md)): - -```bash -source /opt/xilinx/xrt/setup.sh - -# Run an example on AIE2 (NPU1): -cd matmul_bf16_m64_n64_k64 -AIR_TRANSFORM_TILING_SCRIPT=transform_aie2.mlir python matmul_bf16_m64_n64_k64.py - -# Run on AIE2P (NPU2): -AIR_TRANSFORM_TILING_SCRIPT=transform_aie2p.mlir python matmul_bf16_m64_n64_k64.py -``` - -## Running All Tests - -```bash -python scripts/run_tests.py --device aie2 --verbose -python scripts/run_tests.py --device aie2p --verbose -``` From 68dd5fee18838e69e65bd91eb754d0f44dc7da24 Mon Sep 17 00:00:00 2001 From: erweiw Date: Thu, 9 Apr 2026 21:01:48 -0700 Subject: [PATCH 5/9] Add multi-dtype support to axpy and relu examples Extend axpy and relu to support bf16, f32 (bf16-emulation), i8, and i16 using the same @DTYPE@/@VECTOR_SIZE@ placeholder mechanism as vec-add. 
Transform library: add pad_and_promote_unary_{f32,i8,i16} sequences. Tested on NPU2 (Strix/AIE2P): - bf16, f32, i16: pass for both axpy and relu - i8: compiles through triton-shared-opt and AIR transforms but fails at aircc (arith.muli/maxsi not supported for i8 vectors on AIE2P). vec-add i8 works because it only uses arith.addi. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../transform_library/elementwise.mlir | 79 ++++++++++++++++ examples/axpy/axpy.py | 93 +++++++++++++++++-- examples/axpy/transform_aie2.mlir | 8 +- examples/axpy/transform_aie2p.mlir | 8 +- examples/generate_readme.py | 4 +- examples/relu/relu.py | 85 +++++++++++++++-- examples/relu/transform_aie2.mlir | 6 +- examples/relu/transform_aie2p.mlir | 7 +- 8 files changed, 256 insertions(+), 34 deletions(-) diff --git a/amd_triton_npu/backend/transform_library/elementwise.mlir b/amd_triton_npu/backend/transform_library/elementwise.mlir index cd847cb..30c14f1 100644 --- a/amd_triton_npu/backend/transform_library/elementwise.mlir +++ b/amd_triton_npu/backend/transform_library/elementwise.mlir @@ -66,6 +66,85 @@ transform.named_sequence @pad_and_promote_unary_bf16( transform.yield } +// Unary variant for f32: 1 input + 1 output = 2 operands. +// Used with bf16-emulation (f32 data, bf16 compute on AIE cores). 
+transform.named_sequence @pad_and_promote_unary_f32( + %module: !transform.any_op {transform.readonly}) { + %op = transform.structured.match ops{["linalg.generic"]} in %module + : (!transform.any_op) -> !transform.any_op + %padded_op, %pad_op, %__ = transform.structured.pad %op { + padding_values=[0.0 : f32, 0.0 : f32], + padding_dimensions=[0, 1], + nofold_flags=[1, 1], + copy_back_op="linalg.copy" + } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %pad_dps = transform.structured.rewrite_in_destination_passing_style %pad_op + : (!transform.any_op) -> !transform.any_op + %padded_input = transform.get_producer_of_operand %padded_op[0] + : (!transform.any_op) -> (!transform.any_op) + %padded_input_buffer, %padded_input_new = + transform.structured.bufferize_to_allocation %padded_input + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + %padded_result = transform.get_producer_of_operand %padded_op[1] + : (!transform.any_op) -> (!transform.any_op) + %padded_result_buffer, %padded_result_new = + transform.structured.bufferize_to_allocation %padded_result + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + transform.yield +} + +// Unary variant for i8: 1 input + 1 output = 2 operands. 
+transform.named_sequence @pad_and_promote_unary_i8( + %module: !transform.any_op {transform.readonly}) { + %op = transform.structured.match ops{["linalg.generic"]} in %module + : (!transform.any_op) -> !transform.any_op + %padded_op, %pad_op, %__ = transform.structured.pad %op { + padding_values=[0 : i8, 0 : i8], + padding_dimensions=[0, 1], + nofold_flags=[1, 1], + copy_back_op="linalg.copy" + } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %pad_dps = transform.structured.rewrite_in_destination_passing_style %pad_op + : (!transform.any_op) -> !transform.any_op + %padded_input = transform.get_producer_of_operand %padded_op[0] + : (!transform.any_op) -> (!transform.any_op) + %padded_input_buffer, %padded_input_new = + transform.structured.bufferize_to_allocation %padded_input + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + %padded_result = transform.get_producer_of_operand %padded_op[1] + : (!transform.any_op) -> (!transform.any_op) + %padded_result_buffer, %padded_result_new = + transform.structured.bufferize_to_allocation %padded_result + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + transform.yield +} + +// Unary variant for i16: 1 input + 1 output = 2 operands. 
+transform.named_sequence @pad_and_promote_unary_i16( + %module: !transform.any_op {transform.readonly}) { + %op = transform.structured.match ops{["linalg.generic"]} in %module + : (!transform.any_op) -> !transform.any_op + %padded_op, %pad_op, %__ = transform.structured.pad %op { + padding_values=[0 : i16, 0 : i16], + padding_dimensions=[0, 1], + nofold_flags=[1, 1], + copy_back_op="linalg.copy" + } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %pad_dps = transform.structured.rewrite_in_destination_passing_style %pad_op + : (!transform.any_op) -> !transform.any_op + %padded_input = transform.get_producer_of_operand %padded_op[0] + : (!transform.any_op) -> (!transform.any_op) + %padded_input_buffer, %padded_input_new = + transform.structured.bufferize_to_allocation %padded_input + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + %padded_result = transform.get_producer_of_operand %padded_op[1] + : (!transform.any_op) -> (!transform.any_op) + %padded_result_buffer, %padded_result_new = + transform.structured.bufferize_to_allocation %padded_result + {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op + transform.yield +} + // Binary variant: 2 inputs + 1 output = 3 operands (vec-add, axpy, swiglu). transform.named_sequence @pad_and_promote_binary_bf16( %module: !transform.any_op {transform.readonly}) { diff --git a/examples/axpy/axpy.py b/examples/axpy/axpy.py index 9eb2738..90bc69d 100644 --- a/examples/axpy/axpy.py +++ b/examples/axpy/axpy.py @@ -1,14 +1,54 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT +# AXPY benchmark: out = alpha * x + y +# Supports bf16 (default), f32 (via bf16-emulation), and i16; i8 is accepted but currently fails at aircc on AIE2P (arith.muli/maxsi not supported for i8 vectors).
+ +import argparse import torch import triton import triton.language as tl -import sys, os +import sys +import os sys.path.append(os.path.abspath("..")) import benchmark +DTYPE_CONFIG = { + "bf16": { + "torch_dtype": torch.bfloat16, + "is_float": True, + "alpha": 2.0, + "atol": 1e-2, + "rtol": 1e-2, + "bf16_emulation": False, + }, + "f32": { + "torch_dtype": torch.float32, + "is_float": True, + "alpha": 2.0, + "atol": 1e-1, + "rtol": 5e-2, + "bf16_emulation": True, + }, + "i8": { + "torch_dtype": torch.int8, + "is_float": False, + "alpha": 2, + "atol": 0, + "rtol": 0, + "bf16_emulation": False, + }, + "i16": { + "torch_dtype": torch.int16, + "is_float": False, + "alpha": 2, + "atol": 0, + "rtol": 0, + "bf16_emulation": False, + }, +} + @triton.jit def axpy_kernel( @@ -29,13 +69,23 @@ def axpy_kernel( tl.store(OUT + offsets[:], out) -def bench_axpy(N, provider): +def bench_axpy(N, provider, cfg): device = "cpu" - dtype = torch.bfloat16 - alpha = 2.0 - x = torch.randn(N, device=device, dtype=dtype) - y = torch.randn(N, device=device, dtype=dtype) - out = torch.empty(N, device=device, dtype=dtype) + torch_dtype = cfg["torch_dtype"] + alpha = cfg["alpha"] + + if cfg["is_float"]: + x = torch.randn(N, device=device, dtype=torch_dtype) + y = torch.randn(N, device=device, dtype=torch_dtype) + else: + iinfo = torch.iinfo(torch_dtype) + # Keep values small enough that alpha*x+y doesn't overflow + quarter_max = iinfo.max // 4 + x = torch.randint(0, quarter_max, (N,), device=device, dtype=torch_dtype) + y = torch.randint(0, quarter_max, (N,), device=device, dtype=torch_dtype) + + out = torch.empty(N, device=device, dtype=torch_dtype) + if provider == "torch" or provider == "test": out_ref = alpha * x + y if provider == "triton" or provider == "test": @@ -51,10 +101,35 @@ def bench_axpy(N, provider): with open("tt.shared.mlir", "w") as f: f.write(str(compiled_kernel.asm["ttsharedir"])) if provider == "test": - torch.testing.assert_close(out, out_ref, atol=1e-2, rtol=1e-2) + 
torch.testing.assert_close(out, out_ref, atol=cfg["atol"], rtol=cfg["rtol"]) if __name__ == "__main__": + parser = argparse.ArgumentParser(description="AXPY benchmark for AMD NPU") + parser.add_argument( + "--dtype", + type=str, + choices=list(DTYPE_CONFIG.keys()), + default="bf16", + help="Element data type (default: bf16)", + ) + parser.add_argument( + "--bf16-emulation", + dest="bf16_emulation", + default=False, + action="store_true", + help="Use f32 data type with bf16 emulation on AIE cores", + ) + args = parser.parse_args() + + if args.bf16_emulation: + args.dtype = "f32" + + cfg = DTYPE_CONFIG[args.dtype] + + if cfg["bf16_emulation"]: + os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1" + benchmark.select_npu_backend() for N in [2**i for i in range(10, 16, 1)]: - bench_axpy(N, "test") + bench_axpy(N, "test", cfg) diff --git a/examples/axpy/transform_aie2.mlir b/examples/axpy/transform_aie2.mlir index 31e907d..2bea4be 100644 --- a/examples/axpy/transform_aie2.mlir +++ b/examples/axpy/transform_aie2.mlir @@ -3,8 +3,8 @@ //////////////////////////////////////////////////////////////////////////////// // Transform Script for AXPY (AIE2): out = alpha * x + y -// Binary op (2 inputs: x, y). Cast mulf and addf to bf16. -// No extern_func.o needed (native mulf/addf). +// Binary op (2 inputs: x, y). Cast mulf and addf to bf16 when float. +// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders. // Uses shared library sequences from transform_library.mlir (auto-injected). 
//////////////////////////////////////////////////////////////////////////////// @@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} { (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @pad_and_promote_binary_bf16 failures(propagate) + transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () @@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} { transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @vectorize_generics_at_16 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) (%arg1) : (!transform.any_op) -> () %vh = transform.include @air_herd_mapping_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op diff --git a/examples/axpy/transform_aie2p.mlir b/examples/axpy/transform_aie2p.mlir index 3244ef5..df56af7 100644 --- a/examples/axpy/transform_aie2p.mlir +++ b/examples/axpy/transform_aie2p.mlir @@ -3,8 +3,8 @@ //////////////////////////////////////////////////////////////////////////////// // Transform Script for AXPY (AIE2P): out = alpha * x + y -// Binary op (2 inputs: x, y). Cast mulf and addf to bf16. -// No extern_func.o needed (native mulf/addf). +// Binary op (2 inputs: x, y). Cast mulf and addf to bf16 when float. +// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders. // Uses shared library sequences from transform_library.mlir (auto-injected). 
//////////////////////////////////////////////////////////////////////////////// @@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} { (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @pad_and_promote_binary_bf16 failures(propagate) + transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () @@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} { transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @vectorize_generics_at_16 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) (%arg1) : (!transform.any_op) -> () %vh = transform.include @air_herd_mapping_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op diff --git a/examples/generate_readme.py b/examples/generate_readme.py index b479cf1..1887247 100644 --- a/examples/generate_readme.py +++ b/examples/generate_readme.py @@ -60,7 +60,7 @@ "category": "Element-wise", "name": "ReLU", "path": "relu", - "datatypes": "bf16", + "datatypes": "bf16, f32, i8, i16", }, { "category": "Element-wise", @@ -96,7 +96,7 @@ "category": "Element-wise", "name": "AXPY", "path": "axpy", - "datatypes": "bf16", + "datatypes": "bf16, f32, i8, i16", }, { "category": "Element-wise", diff --git a/examples/relu/relu.py b/examples/relu/relu.py index b873aab..66ab642 100644 --- a/examples/relu/relu.py +++ b/examples/relu/relu.py @@ -1,14 +1,50 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT +# ReLU benchmark: y = max(x, 0) +# Supports bf16 (default), f32 (via bf16-emulation), and i16; i8 is accepted but currently fails at aircc on AIE2P (arith.muli/maxsi not supported for i8 vectors).
+ +import argparse import torch import triton import triton.language as tl -import sys, os +import sys +import os sys.path.append(os.path.abspath("..")) import benchmark +DTYPE_CONFIG = { + "bf16": { + "torch_dtype": torch.bfloat16, + "is_float": True, + "atol": 1e-2, + "rtol": 1e-2, + "bf16_emulation": False, + }, + "f32": { + "torch_dtype": torch.float32, + "is_float": True, + "atol": 1e-1, + "rtol": 5e-2, + "bf16_emulation": True, + }, + "i8": { + "torch_dtype": torch.int8, + "is_float": False, + "atol": 0, + "rtol": 0, + "bf16_emulation": False, + }, + "i16": { + "torch_dtype": torch.int16, + "is_float": False, + "atol": 0, + "rtol": 0, + "bf16_emulation": False, + }, +} + @triton.jit def relu_kernel( @@ -22,15 +58,23 @@ def relu_kernel( offsets = block_start + tl.arange(0, BLOCK_SIZE) x = tl.load(X + offsets[:]) - y = tl.maximum(x, 0.0) + # x * 0 produces a dtype-compatible zero for both float and int types. + y = tl.maximum(x, x * 0) tl.store(Y + offsets[:], y) -def bench_relu(N, provider): +def bench_relu(N, provider, cfg): device = "cpu" - dtype = torch.bfloat16 - x = torch.randn(N, device=device, dtype=dtype) - y = torch.empty(N, device=device, dtype=dtype) + torch_dtype = cfg["torch_dtype"] + + if cfg["is_float"]: + x = torch.randn(N, device=device, dtype=torch_dtype) + else: + iinfo = torch.iinfo(torch_dtype) + x = torch.randint(iinfo.min, iinfo.max, (N,), device=device, dtype=torch_dtype) + + y = torch.empty(N, device=device, dtype=torch_dtype) + if provider == "torch" or provider == "test": y_ref = torch.relu(x) if provider == "triton" or provider == "test": @@ -44,10 +88,35 @@ def bench_relu(N, provider): with open("tt.shared.mlir", "w") as f: f.write(str(compiled_kernel.asm["ttsharedir"])) if provider == "test": - torch.testing.assert_close(y, y_ref, atol=1e-2, rtol=1e-2) + torch.testing.assert_close(y, y_ref, atol=cfg["atol"], rtol=cfg["rtol"]) if __name__ == "__main__": + parser = argparse.ArgumentParser(description="ReLU benchmark for AMD NPU") + 
parser.add_argument( + "--dtype", + type=str, + choices=list(DTYPE_CONFIG.keys()), + default="bf16", + help="Element data type (default: bf16)", + ) + parser.add_argument( + "--bf16-emulation", + dest="bf16_emulation", + default=False, + action="store_true", + help="Use f32 data type with bf16 emulation on AIE cores", + ) + args = parser.parse_args() + + if args.bf16_emulation: + args.dtype = "f32" + + cfg = DTYPE_CONFIG[args.dtype] + + if cfg["bf16_emulation"]: + os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1" + benchmark.select_npu_backend() for N in [2**i for i in range(10, 16, 1)]: - bench_relu(N, "test") + bench_relu(N, "test", cfg) diff --git a/examples/relu/transform_aie2.mlir b/examples/relu/transform_aie2.mlir index ce5dc86..fbcf1df 100644 --- a/examples/relu/transform_aie2.mlir +++ b/examples/relu/transform_aie2.mlir @@ -4,7 +4,7 @@ //////////////////////////////////////////////////////////////////////////////// // Transform Script for ReLU (AIE2) // relu(x) = max(x, 0) -// No extern_func.o needed (native maxnumf). +// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders. // Uses shared library sequences from transform_library.mlir (auto-injected). 
//////////////////////////////////////////////////////////////////////////////// @@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} { (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @pad_and_promote_unary_bf16 failures(propagate) + transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () @@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} { transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @vectorize_generics_at_16 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) (%arg1) : (!transform.any_op) -> () %vh = transform.include @air_herd_mapping_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op diff --git a/examples/relu/transform_aie2p.mlir b/examples/relu/transform_aie2p.mlir index eba1f17..7e4ba8e 100644 --- a/examples/relu/transform_aie2p.mlir +++ b/examples/relu/transform_aie2p.mlir @@ -4,8 +4,7 @@ //////////////////////////////////////////////////////////////////////////////// // Transform Script for ReLU (AIE2P) // relu(x) = max(x, 0) -// Strategy: fuse_elementwise_linalg -> unary pad+promote -> vectorize at 16 -// -> cast maxnumf to bf16. +// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders. // Uses shared library sequences from transform_library.mlir (auto-injected). 
//////////////////////////////////////////////////////////////////////////////// @@ -21,7 +20,7 @@ module attributes {transform.with_named_sequence} { (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @pad_and_promote_unary_bf16 failures(propagate) + transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () @@ -30,7 +29,7 @@ module attributes {transform.with_named_sequence} { transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @vectorize_generics_at_16 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) (%arg1) : (!transform.any_op) -> () %vh = transform.include @air_herd_mapping_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op From 00c502dd59440ea8f921f9dd15fa9f600c73e839 Mon Sep 17 00:00:00 2001 From: erweiw Date: Thu, 9 Apr 2026 21:12:19 -0700 Subject: [PATCH 6/9] Add f32 bf16-emulation support to all elementwise activation examples Extend sigmoid, silu, gelu, swiglu, and leaky_relu examples to support f32 input via bf16-emulation, in addition to the existing bf16. All transform scripts updated with @DTYPE@/@VECTOR_SIZE@ placeholders. The @cast_bf16_only_ops and @cast_cmpf_and_select_ops phases work correctly for both bf16 and f32 inputs -- for f32, the cast converts f32 vector ops to bf16 at the MLIR level (equivalent to what bf16-emulation does at the LLVM level). Tested on NPU2 (Strix/AIE2P): all 5 examples pass correctness checks for both bf16 and f32 across vector sizes 1024-32768. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- examples/gelu/gelu.py | 61 ++++++++++++++++++++---- examples/gelu/transform_aie2p.mlir | 4 +- examples/generate_readme.py | 10 ++-- examples/leaky_relu/leaky_relu.py | 59 ++++++++++++++++++++--- examples/leaky_relu/transform_aie2.mlir | 4 +- examples/leaky_relu/transform_aie2p.mlir | 4 +- examples/sigmoid/sigmoid.py | 59 ++++++++++++++++++++--- examples/sigmoid/transform_aie2p.mlir | 25 ++-------- examples/silu/silu.py | 59 ++++++++++++++++++++--- examples/silu/transform_aie2.mlir | 4 +- examples/silu/transform_aie2p.mlir | 4 +- examples/swiglu/swiglu.py | 61 ++++++++++++++++++++---- examples/swiglu/transform_aie2.mlir | 4 +- examples/swiglu/transform_aie2p.mlir | 4 +- 14 files changed, 285 insertions(+), 77 deletions(-) diff --git a/examples/gelu/gelu.py b/examples/gelu/gelu.py index 304afbb..ceadacb 100644 --- a/examples/gelu/gelu.py +++ b/examples/gelu/gelu.py @@ -1,14 +1,34 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT +# GELU benchmark: y = x * sigmoid(1.702 * x) +# Supports bf16 (default) and f32 (via bf16-emulation). 
+ +import argparse import torch import triton import triton.language as tl -import sys, os +import sys +import os sys.path.append(os.path.abspath("..")) import benchmark +DTYPE_CONFIG = { + "bf16": { + "torch_dtype": torch.bfloat16, + "atol": 1e-1, + "rtol": 1e-1, + "bf16_emulation": False, + }, + "f32": { + "torch_dtype": torch.float32, + "atol": 2e-1, + "rtol": 1e-1, + "bf16_emulation": True, + }, +} + @triton.jit def gelu_kernel( @@ -30,14 +50,14 @@ def gelu_kernel( tl.store(Y + offsets[:], y) -def bench_gelu(N, provider): +def bench_gelu(N, provider, cfg): device = "cpu" - dtype = torch.bfloat16 - x = torch.randn(N, device=device, dtype=dtype) - y = torch.empty(N, device=device, dtype=dtype) + torch_dtype = cfg["torch_dtype"] + x = torch.randn(N, device=device, dtype=torch_dtype) + y = torch.empty(N, device=device, dtype=torch_dtype) if provider == "torch" or provider == "test": # Reference uses sigmoid approximation: x * sigmoid(1.702 * x) - y_ref = x * torch.sigmoid(1.702 * x.float()).to(dtype) + y_ref = x * torch.sigmoid(1.702 * x.float()).to(torch_dtype) if provider == "triton" or provider == "test": grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),) compiled_kernel = gelu_kernel[grid]( @@ -49,10 +69,35 @@ def bench_gelu(N, provider): with open("tt.shared.mlir", "w") as f: f.write(str(compiled_kernel.asm["ttsharedir"])) if provider == "test": - torch.testing.assert_close(y, y_ref, atol=1e-1, rtol=1e-1) + torch.testing.assert_close(y, y_ref, atol=cfg["atol"], rtol=cfg["rtol"]) if __name__ == "__main__": + parser = argparse.ArgumentParser(description="GELU benchmark for AMD NPU") + parser.add_argument( + "--dtype", + type=str, + choices=list(DTYPE_CONFIG.keys()), + default="bf16", + help="Element data type (default: bf16)", + ) + parser.add_argument( + "--bf16-emulation", + dest="bf16_emulation", + default=False, + action="store_true", + help="Use f32 data type with bf16 emulation on AIE cores", + ) + args = parser.parse_args() + + if 
args.bf16_emulation: + args.dtype = "f32" + + cfg = DTYPE_CONFIG[args.dtype] + + if cfg["bf16_emulation"]: + os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1" + benchmark.select_npu_backend() for N in [2**i for i in range(10, 16, 1)]: - bench_gelu(N, "test") + bench_gelu(N, "test", cfg) diff --git a/examples/gelu/transform_aie2p.mlir b/examples/gelu/transform_aie2p.mlir index 2fa1afa..71de302 100644 --- a/examples/gelu/transform_aie2p.mlir +++ b/examples/gelu/transform_aie2p.mlir @@ -22,7 +22,7 @@ module attributes {transform.with_named_sequence} { (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @pad_and_promote_unary_bf16 failures(propagate) + transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () @@ -31,7 +31,7 @@ module attributes {transform.with_named_sequence} { transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @vectorize_generics_at_16 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) (%arg1) : (!transform.any_op) -> () %vh = transform.include @air_herd_mapping_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op diff --git a/examples/generate_readme.py b/examples/generate_readme.py index 1887247..75dd998 100644 --- a/examples/generate_readme.py +++ b/examples/generate_readme.py @@ -66,31 +66,31 @@ "category": "Element-wise", "name": "Sigmoid", "path": "sigmoid", - "datatypes": "bf16", + "datatypes": "bf16, f32", }, { "category": "Element-wise", "name": "SiLU", "path": "silu", - "datatypes": "bf16", + "datatypes": "bf16, f32", }, { "category": "Element-wise", "name": "GELU", "path": "gelu", - "datatypes": "bf16", + "datatypes": "bf16, f32", }, { "category": 
"Element-wise", "name": "Leaky ReLU", "path": "leaky_relu", - "datatypes": "bf16", + "datatypes": "bf16, f32", }, { "category": "Element-wise", "name": "SwiGLU", "path": "swiglu", - "datatypes": "bf16", + "datatypes": "bf16, f32", }, { "category": "Element-wise", diff --git a/examples/leaky_relu/leaky_relu.py b/examples/leaky_relu/leaky_relu.py index 088b6b1..5b9927c 100644 --- a/examples/leaky_relu/leaky_relu.py +++ b/examples/leaky_relu/leaky_relu.py @@ -1,16 +1,36 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT +# Leaky ReLU benchmark: y = x if x >= 0, else alpha * x +# Supports bf16 (default) and f32 (via bf16-emulation). + +import argparse import torch import triton import triton.language as tl -import sys, os +import sys +import os sys.path.append(os.path.abspath("..")) import benchmark ALPHA = 0.01 # Standard leaky relu negative slope +DTYPE_CONFIG = { + "bf16": { + "torch_dtype": torch.bfloat16, + "atol": 1e-2, + "rtol": 1e-2, + "bf16_emulation": False, + }, + "f32": { + "torch_dtype": torch.float32, + "atol": 1e-1, + "rtol": 5e-2, + "bf16_emulation": True, + }, +} + @triton.jit def leaky_relu_kernel( @@ -31,11 +51,11 @@ def leaky_relu_kernel( tl.store(Y + offsets[:], y) -def bench_leaky_relu(N, provider): +def bench_leaky_relu(N, provider, cfg): device = "cpu" - dtype = torch.bfloat16 - x = torch.randn(N, device=device, dtype=dtype) - y = torch.empty(N, device=device, dtype=dtype) + torch_dtype = cfg["torch_dtype"] + x = torch.randn(N, device=device, dtype=torch_dtype) + y = torch.empty(N, device=device, dtype=torch_dtype) if provider == "torch" or provider == "test": y_ref = torch.nn.functional.leaky_relu(x, negative_slope=ALPHA) if provider == "triton" or provider == "test": @@ -49,10 +69,35 @@ def bench_leaky_relu(N, provider): with open("tt.shared.mlir", "w") as f: f.write(str(compiled_kernel.asm["ttsharedir"])) if provider == "test": - torch.testing.assert_close(y, y_ref, atol=1e-2, rtol=1e-2) 
+ torch.testing.assert_close(y, y_ref, atol=cfg["atol"], rtol=cfg["rtol"]) if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Leaky ReLU benchmark for AMD NPU") + parser.add_argument( + "--dtype", + type=str, + choices=list(DTYPE_CONFIG.keys()), + default="bf16", + help="Element data type (default: bf16)", + ) + parser.add_argument( + "--bf16-emulation", + dest="bf16_emulation", + default=False, + action="store_true", + help="Use f32 data type with bf16 emulation on AIE cores", + ) + args = parser.parse_args() + + if args.bf16_emulation: + args.dtype = "f32" + + cfg = DTYPE_CONFIG[args.dtype] + + if cfg["bf16_emulation"]: + os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1" + benchmark.select_npu_backend() for N in [2**i for i in range(10, 16, 1)]: - bench_leaky_relu(N, "test") + bench_leaky_relu(N, "test", cfg) diff --git a/examples/leaky_relu/transform_aie2.mlir b/examples/leaky_relu/transform_aie2.mlir index e0234a4..f804e9f 100644 --- a/examples/leaky_relu/transform_aie2.mlir +++ b/examples/leaky_relu/transform_aie2.mlir @@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} { (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @pad_and_promote_unary_bf16 failures(propagate) + transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () @@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} { transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @vectorize_generics_at_16 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) (%arg1) : (!transform.any_op) -> () %vh = transform.include @air_herd_mapping_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> 
!transform.any_op diff --git a/examples/leaky_relu/transform_aie2p.mlir b/examples/leaky_relu/transform_aie2p.mlir index 7ed2de4..bc2d3c9 100644 --- a/examples/leaky_relu/transform_aie2p.mlir +++ b/examples/leaky_relu/transform_aie2p.mlir @@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} { (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @pad_and_promote_unary_bf16 failures(propagate) + transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () @@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} { transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @vectorize_generics_at_16 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) (%arg1) : (!transform.any_op) -> () %vh = transform.include @air_herd_mapping_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op diff --git a/examples/sigmoid/sigmoid.py b/examples/sigmoid/sigmoid.py index 12b602c..d5922dd 100644 --- a/examples/sigmoid/sigmoid.py +++ b/examples/sigmoid/sigmoid.py @@ -1,14 +1,34 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT +# Sigmoid benchmark: y = 1 / (1 + exp(-x)) +# Supports bf16 (default) and f32 (via bf16-emulation). 
+ +import argparse import torch import triton import triton.language as tl -import sys, os +import sys +import os sys.path.append(os.path.abspath("..")) import benchmark +DTYPE_CONFIG = { + "bf16": { + "torch_dtype": torch.bfloat16, + "atol": 1e-1, + "rtol": 1e-1, + "bf16_emulation": False, + }, + "f32": { + "torch_dtype": torch.float32, + "atol": 2e-1, + "rtol": 1e-1, + "bf16_emulation": True, + }, +} + @triton.jit def sigmoid_kernel( @@ -34,11 +54,11 @@ def sigmoid_kernel( tl.store(Y + offsets[:], y) -def bench_sigmoid(N, provider): +def bench_sigmoid(N, provider, cfg): device = "cpu" - dtype = torch.bfloat16 - x = torch.randn(N, device=device, dtype=dtype) - y = torch.empty(N, device=device, dtype=dtype) + torch_dtype = cfg["torch_dtype"] + x = torch.randn(N, device=device, dtype=torch_dtype) + y = torch.empty(N, device=device, dtype=torch_dtype) if provider == "torch" or provider == "test": y_ref = torch.sigmoid(x) if provider == "triton" or provider == "test": @@ -52,10 +72,35 @@ def bench_sigmoid(N, provider): with open("tt.shared.mlir", "w") as f: f.write(str(compiled_kernel.asm["ttsharedir"])) if provider == "test": - torch.testing.assert_close(y, y_ref, atol=1e-1, rtol=1e-1) + torch.testing.assert_close(y, y_ref, atol=cfg["atol"], rtol=cfg["rtol"]) if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Sigmoid benchmark for AMD NPU") + parser.add_argument( + "--dtype", + type=str, + choices=list(DTYPE_CONFIG.keys()), + default="bf16", + help="Element data type (default: bf16)", + ) + parser.add_argument( + "--bf16-emulation", + dest="bf16_emulation", + default=False, + action="store_true", + help="Use f32 data type with bf16 emulation on AIE cores", + ) + args = parser.parse_args() + + if args.bf16_emulation: + args.dtype = "f32" + + cfg = DTYPE_CONFIG[args.dtype] + + if cfg["bf16_emulation"]: + os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1" + benchmark.select_npu_backend() for N in [2**i for i in range(10, 16, 1)]: - 
bench_sigmoid(N, "test") + bench_sigmoid(N, "test", cfg) diff --git a/examples/sigmoid/transform_aie2p.mlir b/examples/sigmoid/transform_aie2p.mlir index 2494c2b..8fe2d8e 100644 --- a/examples/sigmoid/transform_aie2p.mlir +++ b/examples/sigmoid/transform_aie2p.mlir @@ -6,8 +6,9 @@ // // sigmoid(x) = 1 / (1 + exp(-x)) // -// Strategy: fuse_elementwise_linalg -> unary pad+promote -> vectorize at 16 +// Strategy: fuse_elementwise_linalg -> unary pad+promote -> vectorize // -> cast exp, subf, addf, mulf to bf16; divf stays f32. +// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders. // // Uses shared library sequences from transform_library.mlir (auto-injected). //////////////////////////////////////////////////////////////////////////////// @@ -16,43 +17,25 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main( %arg1: !transform.any_op {transform.readonly}) { - // Phase 1: Initial canonicalization transform.include @canonicalize_with_fold_dims failures(propagate) (%arg1) : (!transform.any_op) -> () - - // Phase 2: Fuse elementwise chain (extf + subf + exp + addf + divf + truncf) transform.include @fuse_elementwise_and_canonicalize failures(propagate) (%arg1) : (!transform.any_op) -> () - - // Phase 3: Flatten + tile forall [256] transform.include @flatten_tile_forall failures(propagate) (%arg1) : (!transform.any_op) -> () - - // Phase 4: Canonicalization transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - - // Phase 5: Pad and promote to L1 (unary: 1 input + 1 output) - transform.include @pad_and_promote_unary_bf16 failures(propagate) + transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () - - // Phase 6: Canonicalization transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - - // Phase 7: Bufferization transform.include @one_shot_bufferize failures(propagate) (%arg1) : 
(!transform.any_op) -> () - - // Phase 8: Post-bufferization cleanup transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - // Phase 9: Vectorization tiling (16-lane for bf16) - transform.include @vectorize_generics_at_16 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) (%arg1) : (!transform.any_op) -> () - - // Phase 10: AIR herd mapping + vectorization %vh = transform.include @air_herd_mapping_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op transform.include @cast_bf16_only_ops failures(propagate) diff --git a/examples/silu/silu.py b/examples/silu/silu.py index 59b0aa0..05d55df 100644 --- a/examples/silu/silu.py +++ b/examples/silu/silu.py @@ -1,14 +1,34 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT +# SiLU benchmark: y = x * sigmoid(x) +# Supports bf16 (default) and f32 (via bf16-emulation). + +import argparse import torch import triton import triton.language as tl -import sys, os +import sys +import os sys.path.append(os.path.abspath("..")) import benchmark +DTYPE_CONFIG = { + "bf16": { + "torch_dtype": torch.bfloat16, + "atol": 1e-1, + "rtol": 1e-1, + "bf16_emulation": False, + }, + "f32": { + "torch_dtype": torch.float32, + "atol": 2e-1, + "rtol": 1e-1, + "bf16_emulation": True, + }, +} + @triton.jit def silu_kernel( @@ -30,11 +50,11 @@ def silu_kernel( tl.store(Y + offsets[:], y) -def bench_silu(N, provider): +def bench_silu(N, provider, cfg): device = "cpu" - dtype = torch.bfloat16 - x = torch.randn(N, device=device, dtype=dtype) - y = torch.empty(N, device=device, dtype=dtype) + torch_dtype = cfg["torch_dtype"] + x = torch.randn(N, device=device, dtype=torch_dtype) + y = torch.empty(N, device=device, dtype=torch_dtype) if provider == "torch" or provider == "test": y_ref = torch.nn.functional.silu(x) if provider == "triton" or provider == "test": @@ -48,10 +68,35 @@ def 
bench_silu(N, provider): with open("tt.shared.mlir", "w") as f: f.write(str(compiled_kernel.asm["ttsharedir"])) if provider == "test": - torch.testing.assert_close(y, y_ref, atol=1e-1, rtol=1e-1) + torch.testing.assert_close(y, y_ref, atol=cfg["atol"], rtol=cfg["rtol"]) if __name__ == "__main__": + parser = argparse.ArgumentParser(description="SiLU benchmark for AMD NPU") + parser.add_argument( + "--dtype", + type=str, + choices=list(DTYPE_CONFIG.keys()), + default="bf16", + help="Element data type (default: bf16)", + ) + parser.add_argument( + "--bf16-emulation", + dest="bf16_emulation", + default=False, + action="store_true", + help="Use f32 data type with bf16 emulation on AIE cores", + ) + args = parser.parse_args() + + if args.bf16_emulation: + args.dtype = "f32" + + cfg = DTYPE_CONFIG[args.dtype] + + if cfg["bf16_emulation"]: + os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1" + benchmark.select_npu_backend() for N in [2**i for i in range(10, 16, 1)]: - bench_silu(N, "test") + bench_silu(N, "test", cfg) diff --git a/examples/silu/transform_aie2.mlir b/examples/silu/transform_aie2.mlir index 3f16514..78784f9 100644 --- a/examples/silu/transform_aie2.mlir +++ b/examples/silu/transform_aie2.mlir @@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} { (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @pad_and_promote_unary_bf16 failures(propagate) + transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () @@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} { transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @vectorize_generics_at_16 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ 
failures(propagate) (%arg1) : (!transform.any_op) -> () %vh = transform.include @air_herd_mapping_with_extern_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op diff --git a/examples/silu/transform_aie2p.mlir b/examples/silu/transform_aie2p.mlir index 53de42f..acc0aea 100644 --- a/examples/silu/transform_aie2p.mlir +++ b/examples/silu/transform_aie2p.mlir @@ -21,7 +21,7 @@ module attributes {transform.with_named_sequence} { (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @pad_and_promote_unary_bf16 failures(propagate) + transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () @@ -30,7 +30,7 @@ module attributes {transform.with_named_sequence} { transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @vectorize_generics_at_16 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) (%arg1) : (!transform.any_op) -> () %vh = transform.include @air_herd_mapping_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op diff --git a/examples/swiglu/swiglu.py b/examples/swiglu/swiglu.py index 180e856..65157fe 100644 --- a/examples/swiglu/swiglu.py +++ b/examples/swiglu/swiglu.py @@ -1,14 +1,34 @@ # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: MIT +# SwiGLU benchmark: out = SiLU(gate) * up = gate * sigmoid(gate) * up +# Supports bf16 (default) and f32 (via bf16-emulation). 
+ +import argparse import torch import triton import triton.language as tl -import sys, os +import sys +import os sys.path.append(os.path.abspath("..")) import benchmark +DTYPE_CONFIG = { + "bf16": { + "torch_dtype": torch.bfloat16, + "atol": 1e-1, + "rtol": 1e-1, + "bf16_emulation": False, + }, + "f32": { + "torch_dtype": torch.float32, + "atol": 2e-1, + "rtol": 1e-1, + "bf16_emulation": True, + }, +} + @triton.jit def swiglu_kernel( @@ -33,12 +53,12 @@ def swiglu_kernel( tl.store(OUT + offsets[:], out) -def bench_swiglu(N, provider): +def bench_swiglu(N, provider, cfg): device = "cpu" - dtype = torch.bfloat16 - gate = torch.randn(N, device=device, dtype=dtype) - up = torch.randn(N, device=device, dtype=dtype) - out = torch.empty(N, device=device, dtype=dtype) + torch_dtype = cfg["torch_dtype"] + gate = torch.randn(N, device=device, dtype=torch_dtype) + up = torch.randn(N, device=device, dtype=torch_dtype) + out = torch.empty(N, device=device, dtype=torch_dtype) if provider == "torch" or provider == "test": out_ref = torch.nn.functional.silu(gate) * up if provider == "triton" or provider == "test": @@ -53,10 +73,35 @@ def bench_swiglu(N, provider): with open("tt.shared.mlir", "w") as f: f.write(str(compiled_kernel.asm["ttsharedir"])) if provider == "test": - torch.testing.assert_close(out, out_ref, atol=1e-1, rtol=1e-1) + torch.testing.assert_close(out, out_ref, atol=cfg["atol"], rtol=cfg["rtol"]) if __name__ == "__main__": + parser = argparse.ArgumentParser(description="SwiGLU benchmark for AMD NPU") + parser.add_argument( + "--dtype", + type=str, + choices=list(DTYPE_CONFIG.keys()), + default="bf16", + help="Element data type (default: bf16)", + ) + parser.add_argument( + "--bf16-emulation", + dest="bf16_emulation", + default=False, + action="store_true", + help="Use f32 data type with bf16 emulation on AIE cores", + ) + args = parser.parse_args() + + if args.bf16_emulation: + args.dtype = "f32" + + cfg = DTYPE_CONFIG[args.dtype] + + if cfg["bf16_emulation"]: + 
os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1" + benchmark.select_npu_backend() for N in [2**i for i in range(10, 16, 1)]: - bench_swiglu(N, "test") + bench_swiglu(N, "test", cfg) diff --git a/examples/swiglu/transform_aie2.mlir b/examples/swiglu/transform_aie2.mlir index 0de74b4..94c07ff 100644 --- a/examples/swiglu/transform_aie2.mlir +++ b/examples/swiglu/transform_aie2.mlir @@ -21,7 +21,7 @@ module attributes {transform.with_named_sequence} { (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @pad_and_promote_binary_bf16 failures(propagate) + transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () @@ -30,7 +30,7 @@ module attributes {transform.with_named_sequence} { transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @vectorize_generics_at_16 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) (%arg1) : (!transform.any_op) -> () %vh = transform.include @air_herd_mapping_with_extern_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op diff --git a/examples/swiglu/transform_aie2p.mlir b/examples/swiglu/transform_aie2p.mlir index ee1c6b2..7d799d3 100644 --- a/examples/swiglu/transform_aie2p.mlir +++ b/examples/swiglu/transform_aie2p.mlir @@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} { (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @pad_and_promote_binary_bf16 failures(propagate) + transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate) (%arg1) : (!transform.any_op) -> () transform.include @canonicalize_with_cse failures(propagate) 
(%arg1) : (!transform.any_op) -> () @@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} { transform.include @post_bufferize_cleanup failures(propagate) (%arg1) : (!transform.any_op) -> () - transform.include @vectorize_generics_at_16 failures(propagate) + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) (%arg1) : (!transform.any_op) -> () %vh = transform.include @air_herd_mapping_and_vectorize failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op From baf67c69097b8e9105f457b6a15748de678abe91 Mon Sep 17 00:00:00 2001 From: erweiw Date: Thu, 9 Apr 2026 22:01:00 -0700 Subject: [PATCH 7/9] Add elementwise_arith example (sub, mul, div, square) New multi-op example supporting sub, mul, div, and square with --op and --dtype CLI arguments. Auto-selects unary or binary transform script based on op arity. Supported dtypes: bf16 and f32 (via bf16-emulation). Integer types (i16) fail at aircc for subi/muli -- only addi works for integer vectors on AIE2P (tracked in Xilinx/mlir-aie#3027). div is f32-only (arith.divf has no bf16 hardware support on AIE2P). Tested on NPU2 (Strix/AIE2P): sub, mul, div, square all pass for their supported dtypes. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../air_project/aie.asm_air_output.mlir | 386 +++++++++++ .../elementwise_arith/air_project/aie.elf | Bin 0 -> 29488 bytes .../aiecc_failure_1775797115_856352.mlir | 411 ++++++++++++ .../aiecc_failure_1775797139_858651.mlir | 601 ++++++++++++++++++ .../aiecc_failure_1775797174_862028.mlir | 431 +++++++++++++ .../aiecc_repeater_1775797115_856352.sh | 12 + .../aiecc_repeater_1775797139_858651.sh | 14 + .../aiecc_repeater_1775797174_862028.sh | 14 + .../air_project/airinput.mlir | 41 ++ .../air_project/asm_air_output.mlir | 41 ++ .../air_project/asm_src.mlir | 34 + .../air_project/div_kernel_0.pdi | Bin 0 -> 15904 bytes .../air_project/div_kernel_0_aie_cdo_elfs.bin | Bin 0 -> 10704 bytes .../div_kernel_0_aie_cdo_enable.bin | Bin 0 -> 104 bytes .../air_project/div_kernel_0_aie_cdo_init.bin | Bin 0 -> 6032 bytes .../air_project/div_kernel_0_core_0_2.elf | Bin 0 -> 4132 bytes .../div_kernel_0_core_0_2.ld.script | 72 +++ .../air_project/div_kernel_0_core_0_2.ll | 158 +++++ .../air_project/div_kernel_0_core_0_2.o | Bin 0 -> 2048 bytes .../air_project/div_kernel_0_core_0_2.opt.ll | 129 ++++ .../div_kernel_0_core_0_2.peanohack.ll | 158 +++++ .../air_project/div_kernel_0_core_0_3.elf | Bin 0 -> 4192 bytes .../div_kernel_0_core_0_3.ld.script | 78 +++ .../air_project/div_kernel_0_core_0_3.ll | 158 +++++ .../air_project/div_kernel_0_core_0_3.o | Bin 0 -> 2048 bytes .../air_project/div_kernel_0_core_0_3.opt.ll | 129 ++++ .../div_kernel_0_core_0_3.peanohack.ll | 158 +++++ .../air_project/div_kernel_0_core_0_4.elf | Bin 0 -> 4196 bytes .../div_kernel_0_core_0_4.ld.script | 78 +++ .../air_project/div_kernel_0_core_0_4.ll | 158 +++++ .../air_project/div_kernel_0_core_0_4.o | Bin 0 -> 2048 bytes .../air_project/div_kernel_0_core_0_4.opt.ll | 129 ++++ .../div_kernel_0_core_0_4.peanohack.ll | 158 +++++ .../air_project/div_kernel_0_core_0_5.elf | Bin 0 -> 4132 bytes .../div_kernel_0_core_0_5.ld.script | 72 +++ 
.../air_project/div_kernel_0_core_0_5.ll | 158 +++++ .../air_project/div_kernel_0_core_0_5.o | Bin 0 -> 2052 bytes .../air_project/div_kernel_0_core_0_5.opt.ll | 129 ++++ .../div_kernel_0_core_0_5.peanohack.ll | 158 +++++ .../air_project/div_kernel_0_design.bif | 10 + .../div_kernel_0_div_kernel_0_sequence.bin | Bin 0 -> 3248 bytes .../elementwise_arith/air_project/empty_0.pdi | Bin 0 -> 368 bytes .../air_project/empty_0_aie_cdo_elfs.bin | Bin 0 -> 24 bytes .../air_project/empty_0_aie_cdo_enable.bin | Bin 0 -> 24 bytes .../air_project/empty_0_aie_cdo_init.bin | Bin 0 -> 24 bytes .../air_project/empty_0_design.bif | 10 + .../air_project/full_elf_config.json | 134 ++++ .../air_project/input_with_addresses.mlir | 328 ++++++++++ .../elementwise_arith/air_project/main.pdi | Bin 0 -> 368 bytes .../air_project/main_aie_cdo_elfs.bin | Bin 0 -> 24 bytes .../air_project/main_aie_cdo_enable.bin | Bin 0 -> 24 bytes .../air_project/main_aie_cdo_init.bin | Bin 0 -> 24 bytes .../air_project/main_design.bif | 10 + .../air_project/main_div_kernel.bin | Bin 0 -> 22460 bytes .../air_project/main_mul_kernel.bin | Bin 0 -> 14460 bytes .../air_project/main_square_kernel.bin | Bin 0 -> 11048 bytes .../air_project/main_sub_kernel.bin | Bin 0 -> 14396 bytes .../air_project/mul_kernel_0.pdi | Bin 0 -> 7856 bytes .../air_project/mul_kernel_0_aie_cdo_elfs.bin | Bin 0 -> 2656 bytes .../mul_kernel_0_aie_cdo_enable.bin | Bin 0 -> 104 bytes .../air_project/mul_kernel_0_aie_cdo_init.bin | Bin 0 -> 6032 bytes .../air_project/mul_kernel_0_core_0_2.elf | Bin 0 -> 1672 bytes .../mul_kernel_0_core_0_2.ld.script | 72 +++ .../air_project/mul_kernel_0_core_0_2.ll | 95 +++ .../air_project/mul_kernel_0_core_0_2.o | Bin 0 -> 1000 bytes .../air_project/mul_kernel_0_core_0_2.opt.ll | 72 +++ .../mul_kernel_0_core_0_2.peanohack.ll | 95 +++ .../air_project/mul_kernel_0_core_0_3.elf | Bin 0 -> 1736 bytes .../mul_kernel_0_core_0_3.ld.script | 78 +++ .../air_project/mul_kernel_0_core_0_3.ll | 95 +++ 
.../air_project/mul_kernel_0_core_0_3.o | Bin 0 -> 1000 bytes .../air_project/mul_kernel_0_core_0_3.opt.ll | 72 +++ .../mul_kernel_0_core_0_3.peanohack.ll | 95 +++ .../air_project/mul_kernel_0_core_0_4.elf | Bin 0 -> 1740 bytes .../mul_kernel_0_core_0_4.ld.script | 78 +++ .../air_project/mul_kernel_0_core_0_4.ll | 95 +++ .../air_project/mul_kernel_0_core_0_4.o | Bin 0 -> 1000 bytes .../air_project/mul_kernel_0_core_0_4.opt.ll | 72 +++ .../mul_kernel_0_core_0_4.peanohack.ll | 95 +++ .../air_project/mul_kernel_0_core_0_5.elf | Bin 0 -> 1676 bytes .../mul_kernel_0_core_0_5.ld.script | 72 +++ .../air_project/mul_kernel_0_core_0_5.ll | 95 +++ .../air_project/mul_kernel_0_core_0_5.o | Bin 0 -> 1000 bytes .../air_project/mul_kernel_0_core_0_5.opt.ll | 72 +++ .../mul_kernel_0_core_0_5.peanohack.ll | 95 +++ .../air_project/mul_kernel_0_design.bif | 10 + .../mul_kernel_0_mul_kernel_0_sequence.bin | Bin 0 -> 3248 bytes .../air_project/npu.asm_air_output.mlir | 300 +++++++++ .../air_project/placed.asm_air_output.mlir | 86 +++ .../air_project/square_kernel_0.pdi | Bin 0 -> 6272 bytes .../square_kernel_0_aie_cdo_elfs.bin | Bin 0 -> 2528 bytes .../square_kernel_0_aie_cdo_enable.bin | Bin 0 -> 104 bytes .../square_kernel_0_aie_cdo_init.bin | Bin 0 -> 4300 bytes .../air_project/square_kernel_0_core_0_2.elf | Bin 0 -> 1600 bytes .../square_kernel_0_core_0_2.ld.script | 66 ++ .../air_project/square_kernel_0_core_0_2.ll | 84 +++ .../air_project/square_kernel_0_core_0_2.o | Bin 0 -> 932 bytes .../square_kernel_0_core_0_2.opt.ll | 65 ++ .../square_kernel_0_core_0_2.peanohack.ll | 84 +++ .../air_project/square_kernel_0_core_0_3.elf | Bin 0 -> 1640 bytes .../square_kernel_0_core_0_3.ld.script | 69 ++ .../air_project/square_kernel_0_core_0_3.ll | 84 +++ .../air_project/square_kernel_0_core_0_3.o | Bin 0 -> 932 bytes .../square_kernel_0_core_0_3.opt.ll | 65 ++ .../square_kernel_0_core_0_3.peanohack.ll | 84 +++ .../air_project/square_kernel_0_core_0_4.elf | Bin 0 -> 1640 bytes 
.../square_kernel_0_core_0_4.ld.script | 69 ++ .../air_project/square_kernel_0_core_0_4.ll | 84 +++ .../air_project/square_kernel_0_core_0_4.o | Bin 0 -> 932 bytes .../square_kernel_0_core_0_4.opt.ll | 65 ++ .../square_kernel_0_core_0_4.peanohack.ll | 84 +++ .../air_project/square_kernel_0_core_0_5.elf | Bin 0 -> 1600 bytes .../square_kernel_0_core_0_5.ld.script | 66 ++ .../air_project/square_kernel_0_core_0_5.ll | 84 +++ .../air_project/square_kernel_0_core_0_5.o | Bin 0 -> 932 bytes .../square_kernel_0_core_0_5.opt.ll | 65 ++ .../square_kernel_0_core_0_5.peanohack.ll | 84 +++ .../air_project/square_kernel_0_design.bif | 10 + ...uare_kernel_0_square_kernel_0_sequence.bin | Bin 0 -> 2288 bytes .../air_project/sub_kernel_0.pdi | Bin 0 -> 7792 bytes .../air_project/sub_kernel_0_aie_cdo_elfs.bin | Bin 0 -> 2592 bytes .../sub_kernel_0_aie_cdo_enable.bin | Bin 0 -> 104 bytes .../air_project/sub_kernel_0_aie_cdo_init.bin | Bin 0 -> 6032 bytes .../air_project/sub_kernel_0_core_0_2.elf | Bin 0 -> 1656 bytes .../sub_kernel_0_core_0_2.ld.script | 72 +++ .../air_project/sub_kernel_0_core_0_2.ll | 95 +++ .../air_project/sub_kernel_0_core_0_2.o | Bin 0 -> 984 bytes .../air_project/sub_kernel_0_core_0_2.opt.ll | 64 ++ .../sub_kernel_0_core_0_2.peanohack.ll | 95 +++ .../air_project/sub_kernel_0_core_0_3.elf | Bin 0 -> 1720 bytes .../sub_kernel_0_core_0_3.ld.script | 78 +++ .../air_project/sub_kernel_0_core_0_3.ll | 95 +++ .../air_project/sub_kernel_0_core_0_3.o | Bin 0 -> 984 bytes .../air_project/sub_kernel_0_core_0_3.opt.ll | 64 ++ .../sub_kernel_0_core_0_3.peanohack.ll | 95 +++ .../air_project/sub_kernel_0_core_0_4.elf | Bin 0 -> 1724 bytes .../sub_kernel_0_core_0_4.ld.script | 78 +++ .../air_project/sub_kernel_0_core_0_4.ll | 95 +++ .../air_project/sub_kernel_0_core_0_4.o | Bin 0 -> 984 bytes .../air_project/sub_kernel_0_core_0_4.opt.ll | 64 ++ .../sub_kernel_0_core_0_4.peanohack.ll | 95 +++ .../air_project/sub_kernel_0_core_0_5.elf | Bin 0 -> 1660 bytes 
.../sub_kernel_0_core_0_5.ld.script | 72 +++ .../air_project/sub_kernel_0_core_0_5.ll | 95 +++ .../air_project/sub_kernel_0_core_0_5.o | Bin 0 -> 984 bytes .../air_project/sub_kernel_0_core_0_5.opt.ll | 64 ++ .../sub_kernel_0_core_0_5.peanohack.ll | 95 +++ .../air_project/sub_kernel_0_design.bif | 10 + .../sub_kernel_0_sub_kernel_0_sequence.bin | Bin 0 -> 3248 bytes .../elementwise_arith/air_project/tt.mlir | 35 + .../elementwise_arith/elementwise_arith.py | 189 ++++++ .../transform_binary_aie2p.mlir | 40 ++ .../transform_unary_aie2p.mlir | 40 ++ examples/elementwise_arith/tt.shared.mlir | 1 + examples/generate_readme.py | 6 + 155 files changed, 9150 insertions(+) create mode 100644 examples/elementwise_arith/air_project/aie.asm_air_output.mlir create mode 100644 examples/elementwise_arith/air_project/aie.elf create mode 100644 examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir create mode 100644 examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir create mode 100644 examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir create mode 100755 examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh create mode 100755 examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh create mode 100755 examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh create mode 100644 examples/elementwise_arith/air_project/airinput.mlir create mode 100644 examples/elementwise_arith/air_project/asm_air_output.mlir create mode 100644 examples/elementwise_arith/air_project/asm_src.mlir create mode 100644 examples/elementwise_arith/air_project/div_kernel_0.pdi create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_elfs.bin create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_enable.bin create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_init.bin create mode 100755 
examples/elementwise_arith/air_project/div_kernel_0_core_0_2.elf create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.o create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.opt.ll create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll create mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.elf create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ld.script create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.o create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.opt.ll create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll create mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.elf create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.o create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.opt.ll create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll create mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.elf create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.o create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.opt.ll create mode 100644 
examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_design.bif create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_div_kernel_0_sequence.bin create mode 100644 examples/elementwise_arith/air_project/empty_0.pdi create mode 100644 examples/elementwise_arith/air_project/empty_0_aie_cdo_elfs.bin create mode 100644 examples/elementwise_arith/air_project/empty_0_aie_cdo_enable.bin create mode 100644 examples/elementwise_arith/air_project/empty_0_aie_cdo_init.bin create mode 100644 examples/elementwise_arith/air_project/empty_0_design.bif create mode 100644 examples/elementwise_arith/air_project/full_elf_config.json create mode 100644 examples/elementwise_arith/air_project/input_with_addresses.mlir create mode 100644 examples/elementwise_arith/air_project/main.pdi create mode 100644 examples/elementwise_arith/air_project/main_aie_cdo_elfs.bin create mode 100644 examples/elementwise_arith/air_project/main_aie_cdo_enable.bin create mode 100644 examples/elementwise_arith/air_project/main_aie_cdo_init.bin create mode 100644 examples/elementwise_arith/air_project/main_design.bif create mode 100644 examples/elementwise_arith/air_project/main_div_kernel.bin create mode 100644 examples/elementwise_arith/air_project/main_mul_kernel.bin create mode 100644 examples/elementwise_arith/air_project/main_square_kernel.bin create mode 100644 examples/elementwise_arith/air_project/main_sub_kernel.bin create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0.pdi create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_elfs.bin create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_enable.bin create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_init.bin create mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.elf create mode 100644 
examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.o create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll create mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.elf create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ld.script create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.o create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll create mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.elf create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.o create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.opt.ll create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll create mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.elf create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.o create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll create mode 100644 
examples/elementwise_arith/air_project/mul_kernel_0_design.bif create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_mul_kernel_0_sequence.bin create mode 100644 examples/elementwise_arith/air_project/npu.asm_air_output.mlir create mode 100644 examples/elementwise_arith/air_project/placed.asm_air_output.mlir create mode 100644 examples/elementwise_arith/air_project/square_kernel_0.pdi create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_elfs.bin create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_enable.bin create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_init.bin create mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.elf create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.o create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.opt.ll create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll create mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.elf create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ld.script create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.o create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll create mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.elf create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ld.script create mode 100644 
examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.o create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll create mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.elf create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.o create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_design.bif create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0.pdi create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_elfs.bin create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_enable.bin create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_init.bin create mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.elf create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.o create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll create mode 100755 
examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.elf create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.o create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll create mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.elf create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.o create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll create mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.elf create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ld.script create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.o create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_design.bif create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_sub_kernel_0_sequence.bin create mode 100644 examples/elementwise_arith/air_project/tt.mlir create mode 100644 examples/elementwise_arith/elementwise_arith.py create mode 100644 examples/elementwise_arith/transform_binary_aie2p.mlir create mode 100644 examples/elementwise_arith/transform_unary_aie2p.mlir 
create mode 100644 examples/elementwise_arith/tt.shared.mlir diff --git a/examples/elementwise_arith/air_project/aie.asm_air_output.mlir b/examples/elementwise_arith/air_project/aie.asm_air_output.mlir new file mode 100644 index 0000000..e55b5a1 --- /dev/null +++ b/examples/elementwise_arith/air_project/aie.asm_air_output.mlir @@ -0,0 +1,386 @@ +#loop_annotation = #llvm.loop_annotation +module { + aie.device(npu2) @square_kernel_0 { + %shim_noc_tile_0_0 = aie.tile(0, 0) + %shim_noc_tile_1_0 = aie.tile(1, 0) + %mem_tile_0_1 = aie.tile(0, 1) + %mem_tile_1_1 = aie.tile(1, 1) + %tile_0_2 = aie.tile(0, 2) + %tile_0_3 = aie.tile(0, 3) + %tile_0_4 = aie.tile(0, 4) + %tile_0_5 = aie.tile(0, 5) + %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32} + %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32} + %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32} + %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32} + %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32} + %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32} + %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32} + %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32} + %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32} + %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32} + %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32} + %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32} + %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32} + %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32} + %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32} + %buf9 = aie.buffer(%mem_tile_0_1) {sym_name = "buf9"} : memref<1024xi16, 1 : i32> + %buf8 = aie.buffer(%mem_tile_1_1) {sym_name = "buf8"} : memref<1024xi16, 1> + %buf7 = 
aie.buffer(%tile_0_5) {sym_name = "buf7"} : memref<256xi16, 2> + %buf6 = aie.buffer(%tile_0_5) {sym_name = "buf6"} : memref<256xi16, 2> + %buf5 = aie.buffer(%tile_0_4) {sym_name = "buf5"} : memref<256xi16, 2> + %buf4 = aie.buffer(%tile_0_4) {sym_name = "buf4"} : memref<256xi16, 2> + %buf3 = aie.buffer(%tile_0_3) {sym_name = "buf3"} : memref<256xi16, 2> + %buf2 = aie.buffer(%tile_0_3) {sym_name = "buf2"} : memref<256xi16, 2> + %buf1 = aie.buffer(%tile_0_2) {sym_name = "buf1"} : memref<256xi16, 2> + %buf0 = aie.buffer(%tile_0_2) {sym_name = "buf0"} : memref<256xi16, 2> + %mem_0_5 = aie.mem(%tile_0_5) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1) + aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_5_12, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1) + aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_5_11, Release, 1) + aie.next_bd ^bb4 + } + %core_0_5 = aie.core(%tile_0_5) { + %0 = ub.poison : i16 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb2 + aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1) + cf.br ^bb2 + ^bb2: // pred: ^bb1 + aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1) + scf.for %arg0 = %c0 to %c256 step %c32 { + %subview = memref.subview %buf7[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_14 = memref.subview %buf6[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %2 = arith.muli %1, %1 : vector<32xi16> + vector.transfer_write %2, 
%subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } {loop_annotation = #loop_annotation} + aie.use_lock(%lock_0_5, Release, 1) + aie.use_lock(%lock_0_5_13, Release, 1) + cf.br ^bb1 + } + %mem_0_4 = aie.mem(%tile_0_4) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_4_9, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_4_8, Release, 1) + aie.next_bd ^bb4 + } + %core_0_4 = aie.core(%tile_0_4) { + %0 = ub.poison : i16 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb2 + aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1) + cf.br ^bb2 + ^bb2: // pred: ^bb1 + aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1) + scf.for %arg0 = %c0 to %c256 step %c32 { + %subview = memref.subview %buf5[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_14 = memref.subview %buf4[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %2 = arith.muli %1, %1 : vector<32xi16> + vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } {loop_annotation = #loop_annotation} + aie.use_lock(%lock_0_4, Release, 1) + aie.use_lock(%lock_0_4_10, Release, 1) + cf.br ^bb1 + } + %mem_0_3 = aie.mem(%tile_0_3) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, 
^bb1 + aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_3_6, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_3_5, Release, 1) + aie.next_bd ^bb4 + } + %core_0_3 = aie.core(%tile_0_3) { + %0 = ub.poison : i16 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb2 + aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1) + cf.br ^bb2 + ^bb2: // pred: ^bb1 + aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1) + scf.for %arg0 = %c0 to %c256 step %c32 { + %subview = memref.subview %buf3[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_14 = memref.subview %buf2[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %2 = arith.muli %1, %1 : vector<32xi16> + vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } {loop_annotation = #loop_annotation} + aie.use_lock(%lock_0_3, Release, 1) + aie.use_lock(%lock_0_3_7, Release, 1) + cf.br ^bb1 + } + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) + aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_2_3, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2, 
AcquireGreaterEqual, 1) + aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_2_2, Release, 1) + aie.next_bd ^bb4 + } + %core_0_2 = aie.core(%tile_0_2) { + %0 = ub.poison : i16 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb2 + aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) + cf.br ^bb2 + ^bb2: // pred: ^bb1 + aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1) + scf.for %arg0 = %c0 to %c256 step %c32 { + %subview = memref.subview %buf1[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_14 = memref.subview %buf0[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %2 = arith.muli %1, %1 : vector<32xi16> + vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } {loop_annotation = #loop_annotation} + aie.use_lock(%lock_0_2, Release, 1) + aie.use_lock(%lock_0_2_4, Release, 1) + cf.br ^bb1 + } + air.channel @channel_0 [] + air.channel @channel_2 [1, 1] + air.channel @channel_8 [1, 1] + air.channel @channel_9 [1, 1] + air.channel @channel_10 [1, 1] + air.channel @channel_4 [1, 1] + air.channel @channel_5 [1, 1] + air.channel @channel_6 [1, 1] + air.channel @channel_7 [1, 1] + air.channel @channel_3 [] + aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0) + aie.flow(%mem_tile_1_1, DMA : 0, %shim_noc_tile_1_0, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0) + aie.flow(%tile_0_2, DMA : 0, %mem_tile_1_1, DMA : 0) + aie.flow(%tile_0_3, DMA : 0, %mem_tile_1_1, DMA : 1) + 
aie.flow(%tile_0_4, DMA : 0, %mem_tile_1_1, DMA : 2) + aie.flow(%tile_0_5, DMA : 0, %mem_tile_1_1, DMA : 3) + %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {task_id = 0 : i32} + aie.use_lock(%lock_1_1, Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb10 + } + %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + 
aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32} + aie.use_lock(%lock_0_1_0, Release, 4) + aie.next_bd ^bb10 + } + aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0) + aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0) + } {dlti.dl_spec = #dlti.dl_spec} + airrt.module_metadata{ + airrt.segment_metadata attributes {dma_allocations = [{channel = 2 : i64, col = 0 : i64, id = 3 : i64, location = 0 : i64, row = -1 : i64}], sym_name = "square_kernel_0"}{ + airrt.herd_metadata {dma_allocations = [], loc_x = 0 : i64, loc_y = 2 : i64, size_x = 1 : i64, size_y = 4 : i64, sym_name = "herd_0"} + } + } + air.channel @channel_0 [] + air.channel @channel_1 [4, 1] + air.channel @channel_2 [4, 1] + air.channel @channel_3 [] + func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { + %c1 = arith.constant 1 : index + %0 = air.launch async (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) 
args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> attributes {id = 1 : i32} { + %c1024 = arith.constant 1024 : index + %c1_0 = arith.constant 1 : index + %1 = arith.muli %arg8, %c1024 : index + %2 = air.channel.put async @channel_0[] (%arg14[%1] [%c1024] [%c1_0]) {id = 1 : i32, metadataArray = [{base = "air_channel_0", index = 0 : i32}]} : (memref<*xi16>) + %3 = air.channel.get async @channel_3[] (%arg15[%1] [%c1024] [%c1_0]) {id = 2 : i32, metadataArray = [{base = "air_channel_3", index = 0 : i32}]} : (memref<*xi16>) + %4 = air.segment @square_kernel_0 async attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 8 : i64, y_loc = 2 : i64, y_size = 6 : i64} { + %c4 = arith.constant 4 : index + %c768 = arith.constant 768 : index + %c3 = arith.constant 3 : index + %c512 = arith.constant 512 : index + %c2 = arith.constant 2 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index + %c1_1 = arith.constant 1 : index + %async_token, %results = air.execute -> (memref<1024xi16, 1 : i32>) { + %alloc = memref.alloc() : memref<1024xi16, 1 : i32> + air.execute_terminator %alloc : memref<1024xi16, 1 : i32> + } + %5 = air.channel.get async [%async_token] @channel_0[] (%results[] [] []) {id = 3 : i32} : (memref<1024xi16, 1 : i32>) + %async_token_2, %results_3 = air.execute -> (memref<1024xi16, 1>) { + %alloc = memref.alloc() : memref<1024xi16, 1> + air.execute_terminator %alloc : memref<1024xi16, 1> + } + %6 = air.channel.put async [%5] @channel_1[%c0, %c0] (%results[%c0] [%c256] [%c1_1]) {id = 4 : i32} : (memref<1024xi16, 1 : i32>) + %7 = air.channel.put async [%5] @channel_1[%c1_1, %c0] (%results[%c256] [%c256] [%c1_1]) {id = 5 : i32} : (memref<1024xi16, 1 : i32>) + %8 = air.channel.put async [%5] @channel_1[%c2, %c0] (%results[%c512] [%c256] [%c1_1]) {id = 6 : i32} : (memref<1024xi16, 1 : i32>) + %9 = air.channel.put async [%5] @channel_1[%c3, %c0] (%results[%c768] [%c256] [%c1_1]) {id = 7 : i32} : (memref<1024xi16, 1 : i32>) + %10 = 
air.channel.get async [%async_token_2] @channel_2[%c0, %c0] (%results_3[%c0] [%c256] [%c1_1]) {id = 8 : i32} : (memref<1024xi16, 1>) + %11 = air.channel.get async [%async_token_2] @channel_2[%c1_1, %c0] (%results_3[%c256] [%c256] [%c1_1]) {id = 9 : i32} : (memref<1024xi16, 1>) + %12 = air.channel.get async [%async_token_2] @channel_2[%c2, %c0] (%results_3[%c512] [%c256] [%c1_1]) {id = 10 : i32} : (memref<1024xi16, 1>) + %13 = air.channel.get async [%async_token_2] @channel_2[%c3, %c0] (%results_3[%c768] [%c256] [%c1_1]) {id = 11 : i32} : (memref<1024xi16, 1>) + %14 = air.herd @herd_0 async [%5, %async_token_2] tile (%arg16, %arg17) in (%arg18=%c1_1, %arg19=%c4) attributes {id = 3 : i32, x_loc = 0 : i64, y_loc = 2 : i64} { + %c32 = arith.constant 32 : index + %c256_5 = arith.constant 256 : index + %c0_6 = arith.constant 0 : index + %16 = ub.poison : i16 + %async_token_7, %results_8 = air.execute -> (memref<256xi16, 2>) { + %alloc = memref.alloc() : memref<256xi16, 2> + air.execute_terminator %alloc : memref<256xi16, 2> + } + %17 = air.channel.get async [%async_token_7] @channel_1[%arg17, %c0_6] (%results_8[] [] []) {id = 12 : i32} : (memref<256xi16, 2>) + %async_token_9, %results_10 = air.execute -> (memref<256xi16, 2>) { + %alloc = memref.alloc() : memref<256xi16, 2> + air.execute_terminator %alloc : memref<256xi16, 2> + } + %18 = air.wait_all async [%17, %async_token_9] + %19 = scf.for %arg20 = %c0_6 to %c256_5 step %c32 iter_args(%arg21 = %18) -> (!air.async.token) { + %subview = memref.subview %results_8[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_13 = memref.subview %results_10[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %async_token_14, %results_15 = air.execute [%arg21] -> (vector<32xi16>) { + %23 = vector.transfer_read %subview[%c0_6], %16 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + air.execute_terminator %23 : vector<32xi16> + 
} + %21 = arith.muli %results_15, %results_15 : vector<32xi16> + %async_token_16 = air.execute [%arg21] { + vector.transfer_write %21, %subview_13[%c0_6] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } + %22 = air.wait_all async [%async_token_14, %async_token_16] + scf.yield %22 : !air.async.token + } + %20 = air.channel.put async [%async_token_9] @channel_2[%arg17, %c0_6] (%results_10[] [] []) {id = 13 : i32} : (memref<256xi16, 2>) + %async_token_11 = air.execute [%17] { + memref.dealloc %results_8 : memref<256xi16, 2> + } + %async_token_12 = air.execute [%20] { + memref.dealloc %results_10 : memref<256xi16, 2> + } + } + %15 = air.channel.put async [%14] @channel_3[] (%results_3[] [] []) {id = 14 : i32} : (memref<1024xi16, 1>) + %async_token_4 = air.execute [%15] { + memref.dealloc %results_3 : memref<1024xi16, 1> + } + air.wait_all [%6, %7, %8, %9, %10, %11, %12, %13, %async_token_4] {air.segment_end} + } + } + return + } +} diff --git a/examples/elementwise_arith/air_project/aie.elf b/examples/elementwise_arith/air_project/aie.elf new file mode 100644 index 0000000000000000000000000000000000000000..d54eb1201d9709da9aaf31632fd7188b680af270 GIT binary patch literal 29488 zcmeHQU2GiH6~43HBuksZx^>Z#1Tpym0*$P798^G28DpmkZ7P-oP*jkzCYTU4aTdpE zL#2wPjVNtVWwm`URF(YF7sSJ=KZvF;TfqaZP_^PIFO3|^BSIU*1C?mq^WA&T+%rEn z4JC4HG{>6VbH8)u-ZST(o!y=J_WGGU`@S9o!JeuzQc?l^8QDco)fVfax<+kRLu6e` z5Ns?V?SQRp0qo!8PDpQ&j%^L;7zd_a$zY_acFknm@cHjO@z7g;e)&aY z3CEAn@XUGqW&3YK)Qsi6iAE!=hN>PTPQiZvkDR{!FE3#{x&P$RsY6GPPac_a9a3iV z<=5Nqx{*??c#~dBRvsjf>Z2sbkba*=|M}9=zfoi36Zg{yN)3?y$3JzJ25B(fK5O}Y zi;C*6wBcFGa^!o@`^kL2K{0&zdz0@{K-mws#VgT6WaWtY7<|`EUJvaK7>P!FR{jO+G&| z|7Bz2$J+G~75&~|@rLMW< zHz*??E#Zal>Hs^T&2_+St^;m!9dKKPqp7+YN!3-dtT@meMJ>6T>)2gf|GrwFPH9NoD=K1F~&p)?S zcpz0z52WfT*;eP1?TTn8>uG;cPph(?%=~hF)a?9no9Ca~JpX88{`s?%{Op9!R%k;8 z0@C`MW4$W$Q0R4`M?!B1y(#pT(Az?v6Z(SCJ3{XYy(jcVq4%wBj&C&a8o_&f^7ux? 
z@i{&0JH0OSNazirH-+93dRypoLSGPiN9bLl_k_ME^uE>2@r|dB&$QkAjfvxPdf0b* zUFea}8$xdiy(RRv(C37{AoPyVyF%{?eNpIrtDECHkUBoocE|UCI6kL`eW%xj9tpi6 z^rp~TLT?LwPUs6l?+Cps^q$Zch29r>Rc+GOFZ64TbUrS^OZd)dab2>|>r~r;#Y0i^ zRe>vk*Qr*VzpLr0z?G=2I_2K^yBetqT#31=)3G>zm-DK?l|9};oKLK*3Gi*6q(}P% zIt8X}9~bW_ug&uhwSScGMdP1`%GV-4Lv#*=pmwTO=lIIhhOX;M*aiu7@r-@Kc;Fvl z$b`6f$WCJ7F?v94#9F5Jf9NJ2XG_LIbM}ey1OJG6O^S<$>?9^0qq}%at(bV6Eg8?| zPZ$sUBR!^kaPg3x#KdFtfZ8TMrq)e7&Sv8IoZe5V4U|Zz+_tcNias7YBmG18V*s@w z>1bPHOVP8^UkzCQ;HP@8^iFC2i2E3kZ0 zPd?NBG+)NQls{h?{{ido$!FT1=F46{_RQDE(HWb6eN;C8TIc-lUb_L;D|D{zBpt%G z32O7)L@C=oYIBbFrfg+u^W3PEZFheF*L>TEYv>|f53_oO@I6X(fh&U>{euB@M{v`1 z8dn?fO40jn4>rd8)(Pi+l)xjxDSqu=7WfXrX%vlD1b!#s6p_Z)3%uGNDA6SwjYk&; zaA;;l0`e7C^M z{jIpBtmCX*+#2U6B0SEa!1ogF=EeC>5KixQx}Q+^?;||Ut-ucw9>*i_a(`6xGaARU zi*Of@!*>%-m*@1j+b%kMAK`9XhYu0%#&vkPKQ4}IT#P%e=)#=tf73;W?<0TLzr%+J zcl|rO+&>`tKM?mnM7Zn6;kEwO(&z2CJ1%aG>mejOo_B!{5&lK%zgOU!2#@PW;I;m! z=zmo7KPvinc&$G!{KvJwJ{R{X`t@`;^!|i)xgV73)T9^_-@nKU$Ekhz8l-ERYe=qb zt^uGe<+FDG0iOLz8ssc1x%*+#E1 z#MGt@HLjzh`zvvuS2{P59wdHPw%pipS9EULaay#pY`L-HuISu!uTpO8xGOq0?$xT3 zv}|=_$6e96iQ+Ta^5({lyP|W`zDl{VBNI}8nSY7wt)Ahphn^!M+ zHm;XE8`n#o4d1<3P(yOOD&6~sg=r6+BI{_^DAqK z>m|>|^^#}fddahaYr>MP-7#&-uU)?ld<00|e&8mgZGlGcaBoq&Iv1<8cFo-45HR=2 z+V$JGUh?+iddaha*TRAtlH>6`yu2C&-vN?m1NXsa0}mo?bG5s_s9l|%)mpn|Zh3xX zEpff%*|=WvY+NsSHt=N_Prp{!#&*OKOMi2$;+h!??&GsaobUC+CUNiqSR@{K@P-F( zdhnJ9Z+q}L55C~RJ0862!FwKj(S!F>IDfX}&w9D%4)HmV=ah5ITnV&gzQLzpk$C9A z!OLKg{*ec7c<`nNZ+Y;x2cPrc3m&}V!Mh&3=fM{}ct3@^c{87L^5@Oikk{gEu^Q(}TA>c-w=|dGG}f-tpjF58m_Oiypk6!ri=?_xSvI zbMODzd2{D;nQ!omSR@{LaPX8^q<`eW8y>vr!CM}@?ZM|f_<{%Tc<`S04YmJ&$Dj&p-cGJ^m_n zJ~GkJJv>|Q+5Yp-6IGADO1*Ae|E6xU{pZ(n)y`wuU!~?3b}hXJ%nx@w95eO?e*L=&e!x9$Fq?Qnp2@qP?z8UvOn?3jK2o*Ic4DJu|N2K`aV4X4`)sK zgZ~%%UxEY3{e%A(`+GQR=%0_4d4OokdX=0&i4%`ZpLa2y#EDaui-Rx7eM^7I9h5jY zgz(3M#H*hE!7GG6cyH2QatwiUeiZTZ;{5pLmpF#8Kk@%!f6sh@1IYeV56=G*9K+aO z_V1Z5*+12T>0fhH{TxH&1VZy`6nd2Tq50J~;*dDFhY(;Ampnv?gO3PF_anK9(jOc} zNP4~`Us3vlw+KnsgXAzue{dNg;LS1YE*|zu`M- 
z-kiiOClb3P#EH4=vp8`v0fLt&`?vhgj6b-Z z05ivBc$^u3a6kcq2P*rwe9(+PxS;^STa^BmCz|mGXA}b7sPwn|(TqR1q!92orT=!r z`^@-*V+zT>UJOpAL_gr3LWrW4z%36IrUVX7DkM|4o_N4jg=F&M@dt+$l8wjUWJ<&X zZYw04pL~CCULoMU%6wY>D@+L-Tv$lXCc}7GB}wM@$9nx;ML0b zEx#6BC2(+UA=&uz`v(UXl8ry#AKY9>HhzPXDG?7iyO3-=`TpSY!XJEIdAyd_3sV9I z#}{Dc^=|mP8Gmqp0fPT4`?oycj6XQR05j(wPygTw!#^9p!O4^uH?;oQ`1Ae2Eza-X z;N1TJoFL#FL-2WnxX1GPH~7ae(b)PkiL~*xYV3_IX&WWP;8~lgoBaLaT8R6EpO!1l z1?7J2^YZc5%z$zJnF~uVu!1MM3UbpnG4kWG)r}o@Mb}N2vtpi?EjM=D6`dP>MsxXD zw%pipS9ETo_$;pzv~0Pt~LKTKmPPI?PH#wKCXsmrjAVl)X3!VqleY- zk(m>xTJXJeXEcoQBxU-$H}Ko zP0mam`p(qM^whCK^~T}kpGw&eot%2=)YSChDcX70@yVmpN%zL#=evqCu-ZpDbYMk_03Pj^wrQU-Mf70jZbGzw3 zP!$T!^dG2Kpkv%9P}}I@g6Tg{Z$by&4pc;68cqL!dKWrys_jSV%eLu1Q1}j1u}L_% zg>Zdcl465ylQFy1;t|CL9XRQ8+brIuK5n)+>90UXKV4d1@)nDe{w8$YKgD*d#c5vO zg$|sK5#HWA0oZm-&*79Afhp85d^)Ss^oOlIY;Fr-OfnI;JSPzrlROnta-p2Gszjx>L z5YZl_6S7-hwO9|6?$*U?yP}jpx$~^`DFNFmbNM`bo@(ht3>$y(W!a|B~OMi}f%~zCz2tM825Q2|W^cL+DMRw}jpn z`kc@ggx(Q)SLi*VFABYHbu)j(dYE)~66LGw9QfA!W!fA;+~KfLYkd#nF?pUjsn`m>qvKG#s{HPMD0LVQej zub1>W`kC3|Fr?&5)Ye}nEdC++QI{G+UmQEm=xF1(AWdqtcra??_#j(oV?WsO{{|%i zl>O-TX6%RmH;Df`nAcxj_%+F=OTA) r0rEI{6bK&&;^*V|1?lqs^EvVx;?C$9#|U|uHtF{s`Gqi8VZZ+Zn&r4p literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir b/examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir new file mode 100644 index 0000000..15c21c6 --- /dev/null +++ b/examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir @@ -0,0 +1,411 @@ +#loop_annotation = #llvm.loop_annotation +module { + aie.device(npu2) @sub_kernel_0 { + %shim_noc_tile_0_0 = aie.tile(0, 0) + %shim_noc_tile_1_0 = aie.tile(1, 0) + %shim_noc_tile_2_0 = aie.tile(2, 0) + %mem_tile_0_1 = aie.tile(0, 1) + %mem_tile_1_1 = aie.tile(1, 1) + %mem_tile_2_1 = aie.tile(2, 1) + %tile_0_2 = aie.tile(0, 2) + %tile_0_3 = aie.tile(0, 3) + %tile_0_4 = aie.tile(0, 4) + %tile_0_5 = aie.tile(0, 5) + %lock_1_1 = aie.lock(%mem_tile_1_1, 1) 
{init = 4 : i32} + %lock_1_1_0 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32} + %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32} + %lock_0_1_1 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32} + %lock_2_1 = aie.lock(%mem_tile_2_1, 1) {init = 4 : i32} + %lock_2_1_2 = aie.lock(%mem_tile_2_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 5) {init = 1 : i32} + %lock_0_2_3 = aie.lock(%tile_0_2, 4) {init = 0 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %lock_0_3 = aie.lock(%tile_0_3, 5) {init = 1 : i32} + %lock_0_3_8 = aie.lock(%tile_0_3, 4) {init = 0 : i32} + %lock_0_3_9 = aie.lock(%tile_0_3, 3) {init = 1 : i32} + %lock_0_3_10 = aie.lock(%tile_0_3, 2) {init = 0 : i32} + %lock_0_3_11 = aie.lock(%tile_0_3, 1) {init = 1 : i32} + %lock_0_3_12 = aie.lock(%tile_0_3, 0) {init = 0 : i32} + %lock_0_4 = aie.lock(%tile_0_4, 5) {init = 1 : i32} + %lock_0_4_13 = aie.lock(%tile_0_4, 4) {init = 0 : i32} + %lock_0_4_14 = aie.lock(%tile_0_4, 3) {init = 1 : i32} + %lock_0_4_15 = aie.lock(%tile_0_4, 2) {init = 0 : i32} + %lock_0_4_16 = aie.lock(%tile_0_4, 1) {init = 1 : i32} + %lock_0_4_17 = aie.lock(%tile_0_4, 0) {init = 0 : i32} + %lock_0_5 = aie.lock(%tile_0_5, 5) {init = 1 : i32} + %lock_0_5_18 = aie.lock(%tile_0_5, 4) {init = 0 : i32} + %lock_0_5_19 = aie.lock(%tile_0_5, 3) {init = 1 : i32} + %lock_0_5_20 = aie.lock(%tile_0_5, 2) {init = 0 : i32} + %lock_0_5_21 = aie.lock(%tile_0_5, 1) {init = 1 : i32} + %lock_0_5_22 = aie.lock(%tile_0_5, 0) {init = 0 : i32} + %buf14 = aie.buffer(%mem_tile_0_1) {sym_name = "buf14"} : memref<1024xi16, 1 : i32> + %buf13 = aie.buffer(%mem_tile_1_1) {sym_name = "buf13"} : memref<1024xi16, 1 : i32> + %buf12 = aie.buffer(%mem_tile_2_1) {sym_name = "buf12"} : memref<1024xi16, 1> + %buf11 = aie.buffer(%tile_0_5) {sym_name = "buf11"} : memref<256xi16, 2> + %buf10 = 
aie.buffer(%tile_0_5) {sym_name = "buf10"} : memref<256xi16, 2> + %buf9 = aie.buffer(%tile_0_5) {sym_name = "buf9"} : memref<256xi16, 2> + %buf8 = aie.buffer(%tile_0_4) {sym_name = "buf8"} : memref<256xi16, 2> + %buf7 = aie.buffer(%tile_0_4) {sym_name = "buf7"} : memref<256xi16, 2> + %buf6 = aie.buffer(%tile_0_4) {sym_name = "buf6"} : memref<256xi16, 2> + %buf5 = aie.buffer(%tile_0_3) {sym_name = "buf5"} : memref<256xi16, 2> + %buf4 = aie.buffer(%tile_0_3) {sym_name = "buf4"} : memref<256xi16, 2> + %buf3 = aie.buffer(%tile_0_3) {sym_name = "buf3"} : memref<256xi16, 2> + %buf2 = aie.buffer(%tile_0_2) {sym_name = "buf2"} : memref<256xi16, 2> + %buf1 = aie.buffer(%tile_0_2) {sym_name = "buf1"} : memref<256xi16, 2> + %buf0 = aie.buffer(%tile_0_2) {sym_name = "buf0"} : memref<256xi16, 2> + %mem_0_5 = aie.mem(%tile_0_5) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_5_22, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_5_21, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb5 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_5_19, AcquireGreaterEqual, 1) + aie.dma_bd(%buf11 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_5_20, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1) + aie.dma_bd(%buf10 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_5_18, Release, 1) + aie.next_bd ^bb6 + } + %core_0_5 = aie.core(%tile_0_5) { + %0 = ub.poison : i16 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_5_21, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_5_20, AcquireGreaterEqual, 1) + 
aie.use_lock(%lock_0_5_18, AcquireGreaterEqual, 1) + scf.for %arg0 = %c0 to %c256 step %c32 { + %subview = memref.subview %buf11[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_23 = memref.subview %buf10[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_24 = memref.subview %buf9[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %3 = arith.subi %1, %2 : vector<32xi16> + vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } {loop_annotation = #loop_annotation} + aie.use_lock(%lock_0_5_19, Release, 1) + aie.use_lock(%lock_0_5, Release, 1) + aie.use_lock(%lock_0_5_22, Release, 1) + cf.br ^bb1 + } + %mem_0_4 = aie.mem(%tile_0_4) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_4_17, AcquireGreaterEqual, 1) + aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_4_16, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb5 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_4_14, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_4_15, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1) + aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_4_13, Release, 1) + aie.next_bd ^bb6 + } + %core_0_4 = aie.core(%tile_0_4) { + %0 = ub.poison : i16 + 
%c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_4_16, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_4_15, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_4_13, AcquireGreaterEqual, 1) + scf.for %arg0 = %c0 to %c256 step %c32 { + %subview = memref.subview %buf8[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_23 = memref.subview %buf7[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_24 = memref.subview %buf6[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %3 = arith.subi %1, %2 : vector<32xi16> + vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } {loop_annotation = #loop_annotation} + aie.use_lock(%lock_0_4_14, Release, 1) + aie.use_lock(%lock_0_4, Release, 1) + aie.use_lock(%lock_0_4_17, Release, 1) + cf.br ^bb1 + } + %mem_0_3 = aie.mem(%tile_0_3) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_3_12, AcquireGreaterEqual, 1) + aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_3_11, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb5 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_3_9, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_3_10, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) + ^bb6: // 2 preds: ^bb5, 
^bb6 + aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_3_8, Release, 1) + aie.next_bd ^bb6 + } + %core_0_3 = aie.core(%tile_0_3) { + %0 = ub.poison : i16 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_3_11, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_3_10, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_3_8, AcquireGreaterEqual, 1) + scf.for %arg0 = %c0 to %c256 step %c32 { + %subview = memref.subview %buf5[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_23 = memref.subview %buf4[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_24 = memref.subview %buf3[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %3 = arith.subi %1, %2 : vector<32xi16> + vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } {loop_annotation = #loop_annotation} + aie.use_lock(%lock_0_3_9, Release, 1) + aie.use_lock(%lock_0_3, Release, 1) + aie.use_lock(%lock_0_3_12, Release, 1) + cf.br ^bb1 + } + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_7, AcquireGreaterEqual, 1) + aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_2_6, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb5 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2_4, 
AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_2_5, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_2_3, Release, 1) + aie.next_bd ^bb6 + } + %core_0_2 = aie.core(%tile_0_2) { + %0 = ub.poison : i16 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) + scf.for %arg0 = %c0 to %c256 step %c32 { + %subview = memref.subview %buf2[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_23 = memref.subview %buf1[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_24 = memref.subview %buf0[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %3 = arith.subi %1, %2 : vector<32xi16> + vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } {loop_annotation = #loop_annotation} + aie.use_lock(%lock_0_2_4, Release, 1) + aie.use_lock(%lock_0_2, Release, 1) + aie.use_lock(%lock_0_2_7, Release, 1) + cf.br ^bb1 + } + aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0) + aie.flow(%shim_noc_tile_1_0, DMA : 0, %mem_tile_1_1, DMA : 0) + aie.flow(%mem_tile_2_1, DMA : 0, %shim_noc_tile_2_0, DMA : 0) + 
aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0) + aie.flow(%mem_tile_1_1, DMA : 0, %tile_0_2, DMA : 1) + aie.flow(%mem_tile_1_1, DMA : 1, %tile_0_3, DMA : 1) + aie.flow(%mem_tile_1_1, DMA : 2, %tile_0_4, DMA : 1) + aie.flow(%mem_tile_1_1, DMA : 3, %tile_0_5, DMA : 1) + aie.flow(%tile_0_2, DMA : 0, %mem_tile_2_1, DMA : 0) + aie.flow(%tile_0_3, DMA : 0, %mem_tile_2_1, DMA : 1) + aie.flow(%tile_0_4, DMA : 0, %mem_tile_2_1, DMA : 2) + aie.flow(%tile_0_5, DMA : 0, %mem_tile_2_1, DMA : 3) + %memtile_dma_2_1 = aie.memtile_dma(%mem_tile_2_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_1_2, AcquireGreaterEqual, 4) + aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 1024) {task_id = 0 : i32} + aie.use_lock(%lock_2_1, Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_2_1_2, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<1024xi16, 1>, 256, 256) {task_id = 0 : i32} + aie.use_lock(%lock_2_1_2, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<1024xi16, 1>, 512, 256) {task_id = 0 : i32} + aie.use_lock(%lock_2_1_2, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : 
memref<1024xi16, 1>, 768, 256) {task_id = 0 : i32} + aie.use_lock(%lock_2_1_2, Release, 1) + aie.next_bd ^bb10 + } + %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32} + aie.use_lock(%lock_0_1_1, Release, 4) + aie.next_bd ^bb10 + } + %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 1, 
^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32} + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32} + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32} + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32} + aie.use_lock(%lock_1_1_0, Release, 4) + aie.next_bd ^bb10 + } + aie.shim_dma_allocation @air_channel_5(%shim_noc_tile_2_0, S2MM, 0) + aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0) + aie.shim_dma_allocation @air_channel_1(%shim_noc_tile_1_0, MM2S, 0) + aie.runtime_sequence @sub_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { + %0 = aiex.dma_configure_task_for @air_channel_0 { + aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [, ]) + aie.end + } + aiex.dma_start_task(%0) + %1 = aiex.dma_configure_task_for @air_channel_1 { + aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [, ]) + aie.end + } + aiex.dma_start_task(%1) + %2 = aiex.dma_configure_task_for @air_channel_5 { + aie.dma_bd(%arg2 : memref<*xi16>, 0, 1024, [, ]) + aie.end + } {issue_token = true} + aiex.dma_start_task(%2) + aiex.dma_free_task(%0) + aiex.dma_await_task(%2) + 
aiex.dma_free_task(%1) + } + } {dlti.dl_spec = #dlti.dl_spec} + aie.device(npu2) { + aie.runtime_sequence @sub_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { + aiex.configure @sub_kernel_0 { + aiex.run @sub_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (memref<*xi16>, memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32) + } + } + } +} diff --git a/examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir b/examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir new file mode 100644 index 0000000..fc9d492 --- /dev/null +++ b/examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir @@ -0,0 +1,601 @@ +#loop_annotation = #llvm.loop_annotation +module { + aie.device(npu2) @mul_kernel_0 { + %shim_noc_tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info} + %shim_noc_tile_1_0 = aie.tile(1, 0) {controller_id = #aie.packet_info} + %shim_noc_tile_2_0 = aie.tile(2, 0) {controller_id = #aie.packet_info} + %mem_tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info} + %mem_tile_1_1 = aie.tile(1, 1) {controller_id = #aie.packet_info} + %mem_tile_2_1 = aie.tile(2, 1) {controller_id = #aie.packet_info} + %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info} + %tile_0_3 = aie.tile(0, 3) {controller_id = #aie.packet_info} + %tile_0_4 = aie.tile(0, 4) {controller_id = #aie.packet_info} + %tile_0_5 = aie.tile(0, 5) {controller_id = #aie.packet_info} + %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32} + %lock_1_1_0 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32} + %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32} + %lock_0_1_1 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32} + %lock_2_1 = aie.lock(%mem_tile_2_1, 1) {init = 4 : i32} + %lock_2_1_2 = aie.lock(%mem_tile_2_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 5) {init = 1 : i32} + %lock_0_2_3 = 
aie.lock(%tile_0_2, 4) {init = 0 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %lock_0_3 = aie.lock(%tile_0_3, 5) {init = 1 : i32} + %lock_0_3_8 = aie.lock(%tile_0_3, 4) {init = 0 : i32} + %lock_0_3_9 = aie.lock(%tile_0_3, 3) {init = 1 : i32} + %lock_0_3_10 = aie.lock(%tile_0_3, 2) {init = 0 : i32} + %lock_0_3_11 = aie.lock(%tile_0_3, 1) {init = 1 : i32} + %lock_0_3_12 = aie.lock(%tile_0_3, 0) {init = 0 : i32} + %lock_0_4 = aie.lock(%tile_0_4, 5) {init = 1 : i32} + %lock_0_4_13 = aie.lock(%tile_0_4, 4) {init = 0 : i32} + %lock_0_4_14 = aie.lock(%tile_0_4, 3) {init = 1 : i32} + %lock_0_4_15 = aie.lock(%tile_0_4, 2) {init = 0 : i32} + %lock_0_4_16 = aie.lock(%tile_0_4, 1) {init = 1 : i32} + %lock_0_4_17 = aie.lock(%tile_0_4, 0) {init = 0 : i32} + %lock_0_5 = aie.lock(%tile_0_5, 5) {init = 1 : i32} + %lock_0_5_18 = aie.lock(%tile_0_5, 4) {init = 0 : i32} + %lock_0_5_19 = aie.lock(%tile_0_5, 3) {init = 1 : i32} + %lock_0_5_20 = aie.lock(%tile_0_5, 2) {init = 0 : i32} + %lock_0_5_21 = aie.lock(%tile_0_5, 1) {init = 1 : i32} + %lock_0_5_22 = aie.lock(%tile_0_5, 0) {init = 0 : i32} + %buf14 = aie.buffer(%mem_tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf14"} : memref<1024xi16, 1 : i32> + %buf13 = aie.buffer(%mem_tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf13"} : memref<1024xi16, 1 : i32> + %buf12 = aie.buffer(%mem_tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf12"} : memref<1024xi16, 1> + %buf11 = aie.buffer(%tile_0_5) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf11"} : memref<256xi16, 2> + %buf10 = aie.buffer(%tile_0_5) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf10"} : memref<256xi16, 2> + %buf9 = aie.buffer(%tile_0_5) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf9"} : memref<256xi16, 2> 
+ %buf8 = aie.buffer(%tile_0_4) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf8"} : memref<256xi16, 2> + %buf7 = aie.buffer(%tile_0_4) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf7"} : memref<256xi16, 2> + %buf6 = aie.buffer(%tile_0_4) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf6"} : memref<256xi16, 2> + %buf5 = aie.buffer(%tile_0_3) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<256xi16, 2> + %buf4 = aie.buffer(%tile_0_3) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf4"} : memref<256xi16, 2> + %buf3 = aie.buffer(%tile_0_3) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf3"} : memref<256xi16, 2> + %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<256xi16, 2> + %buf1 = aie.buffer(%tile_0_2) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf1"} : memref<256xi16, 2> + %buf0 = aie.buffer(%tile_0_2) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf0"} : memref<256xi16, 2> + %mem_0_5 = aie.mem(%tile_0_5) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_5_22, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_5_21, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb5 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_5_19, AcquireGreaterEqual, 1) + aie.dma_bd(%buf11 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_5_20, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1) + aie.dma_bd(%buf10 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} + 
aie.use_lock(%lock_0_5_18, Release, 1) + aie.next_bd ^bb6 + } + %core_0_5 = aie.core(%tile_0_5) { + %c0_i32 = arith.constant 0 : i32 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb4 + aie.use_lock(%lock_0_5_21, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_5_20, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_5_18, AcquireGreaterEqual, 1) + cf.br ^bb2(%c0 : index) + ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 + %1 = arith.cmpi slt, %0, %c256 : index + cf.cond_br %1, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %2 = vector.load %buf11[%0] : memref<256xi16, 2>, vector<32xi16> + %3 = vector.load %buf10[%0] : memref<256xi16, 2>, vector<32xi16> + %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32> + %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16> + vector.store %5, %buf9[%0] : memref<256xi16, 2>, vector<32xi16> + %6 = arith.addi %0, %c32 : index + cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation} + ^bb4: // pred: ^bb2 + aie.use_lock(%lock_0_5_19, Release, 1) + aie.use_lock(%lock_0_5, Release, 1) + aie.use_lock(%lock_0_5_22, Release, 1) + cf.br ^bb1 + } + %mem_0_4 = aie.mem(%tile_0_4) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_4_17, AcquireGreaterEqual, 1) + aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_4_16, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb5 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_4_14, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_4_15, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_4, 
AcquireGreaterEqual, 1) + aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_4_13, Release, 1) + aie.next_bd ^bb6 + } + %core_0_4 = aie.core(%tile_0_4) { + %c0_i32 = arith.constant 0 : i32 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb4 + aie.use_lock(%lock_0_4_16, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_4_15, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_4_13, AcquireGreaterEqual, 1) + cf.br ^bb2(%c0 : index) + ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 + %1 = arith.cmpi slt, %0, %c256 : index + cf.cond_br %1, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %2 = vector.load %buf8[%0] : memref<256xi16, 2>, vector<32xi16> + %3 = vector.load %buf7[%0] : memref<256xi16, 2>, vector<32xi16> + %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32> + %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16> + vector.store %5, %buf6[%0] : memref<256xi16, 2>, vector<32xi16> + %6 = arith.addi %0, %c32 : index + cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation} + ^bb4: // pred: ^bb2 + aie.use_lock(%lock_0_4_14, Release, 1) + aie.use_lock(%lock_0_4, Release, 1) + aie.use_lock(%lock_0_4_17, Release, 1) + cf.br ^bb1 + } + %mem_0_3 = aie.mem(%tile_0_3) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_3_12, AcquireGreaterEqual, 1) + aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_3_11, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb5 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_3_9, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_3_10, Release, 1) + 
aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_3_8, Release, 1) + aie.next_bd ^bb6 + } + %core_0_3 = aie.core(%tile_0_3) { + %c0_i32 = arith.constant 0 : i32 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb4 + aie.use_lock(%lock_0_3_11, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_3_10, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_3_8, AcquireGreaterEqual, 1) + cf.br ^bb2(%c0 : index) + ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 + %1 = arith.cmpi slt, %0, %c256 : index + cf.cond_br %1, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %2 = vector.load %buf5[%0] : memref<256xi16, 2>, vector<32xi16> + %3 = vector.load %buf4[%0] : memref<256xi16, 2>, vector<32xi16> + %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32> + %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16> + vector.store %5, %buf3[%0] : memref<256xi16, 2>, vector<32xi16> + %6 = arith.addi %0, %c32 : index + cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation} + ^bb4: // pred: ^bb2 + aie.use_lock(%lock_0_3_9, Release, 1) + aie.use_lock(%lock_0_3, Release, 1) + aie.use_lock(%lock_0_3_12, Release, 1) + cf.br ^bb1 + } + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_7, AcquireGreaterEqual, 1) + aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_2_6, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb5 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : 
memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_2_5, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_2_3, Release, 1) + aie.next_bd ^bb6 + } + %core_0_2 = aie.core(%tile_0_2) { + %c0_i32 = arith.constant 0 : i32 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb4 + aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) + cf.br ^bb2(%c0 : index) + ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 + %1 = arith.cmpi slt, %0, %c256 : index + cf.cond_br %1, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %2 = vector.load %buf2[%0] : memref<256xi16, 2>, vector<32xi16> + %3 = vector.load %buf1[%0] : memref<256xi16, 2>, vector<32xi16> + %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32> + %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16> + vector.store %5, %buf0[%0] : memref<256xi16, 2>, vector<32xi16> + %6 = arith.addi %0, %c32 : index + cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation} + ^bb4: // pred: ^bb2 + aie.use_lock(%lock_0_2_4, Release, 1) + aie.use_lock(%lock_0_2, Release, 1) + aie.use_lock(%lock_0_2_7, Release, 1) + cf.br ^bb1 + } + %memtile_dma_2_1 = aie.memtile_dma(%mem_tile_2_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_2_1_2, AcquireGreaterEqual, 4) + aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 1024) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_2_1, Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + 
%1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_2_1_2, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<1024xi16, 1>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} + aie.use_lock(%lock_2_1_2, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<1024xi16, 1>, 512, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} + aie.use_lock(%lock_2_1_2, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf12 : memref<1024xi16, 1>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} + aie.use_lock(%lock_2_1_2, Release, 1) + aie.next_bd ^bb10 + } + %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = 
aie.dma_start(MM2S, 2, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1_1, Release, 4) + aie.next_bd ^bb10 + } + %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + 
%3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1_0, Release, 4) + aie.next_bd ^bb10 + } + aie.shim_dma_allocation @air_channel_5(%shim_noc_tile_2_0, S2MM, 0) + aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0) + aie.shim_dma_allocation @air_channel_1(%shim_noc_tile_1_0, MM2S, 0) + aie.runtime_sequence @mul_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { + %0 = aiex.dma_configure_task_for @air_channel_0 { + aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [, ]) + aie.end + } + aiex.dma_start_task(%0) + %1 = aiex.dma_configure_task_for @air_channel_1 { + aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [, ]) + aie.end + } + aiex.dma_start_task(%1) + %2 = aiex.dma_configure_task_for @air_channel_5 { + aie.dma_bd(%arg2 : memref<*xi16>, 0, 1024, [, ]) + aie.end + } {issue_token = true} + aiex.dma_start_task(%2) + aiex.dma_free_task(%0) + aiex.dma_await_task(%2) + aiex.dma_free_task(%1) + } + aie.packet_flow(15) { + aie.packet_source<%shim_noc_tile_0_0, TileControl : 0> + aie.packet_dest<%shim_noc_tile_0_0, South : 0> + } {keep_pkt_header = true, priority_route = true} + aie.packet_flow(15) { + aie.packet_source<%shim_noc_tile_1_0, TileControl : 0> + aie.packet_dest<%shim_noc_tile_1_0, South : 0> + } {keep_pkt_header = true, priority_route = true} + aie.packet_flow(15) { + aie.packet_source<%shim_noc_tile_2_0, TileControl : 0> 
+ aie.packet_dest<%shim_noc_tile_2_0, South : 0> + } {keep_pkt_header = true, priority_route = true} + %switchbox_0_0 = aie.switchbox(%shim_noc_tile_0_0) { + aie.connect + %0 = aie.amsel<5> (3) + %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true} + aie.packet_rules(TileControl : 0) { + aie.rule(31, 15, %0) + } + } + %shim_mux_0_0 = aie.shim_mux(%shim_noc_tile_0_0) { + aie.connect + } + %switchbox_0_1 = aie.switchbox(%mem_tile_0_1) { + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + } + %switchbox_1_0 = aie.switchbox(%shim_noc_tile_1_0) { + aie.connect + %0 = aie.amsel<5> (3) + %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true} + aie.packet_rules(TileControl : 0) { + aie.rule(31, 15, %0) + } + } + %shim_mux_1_0 = aie.shim_mux(%shim_noc_tile_1_0) { + aie.connect + } + %switchbox_1_1 = aie.switchbox(%mem_tile_1_1) { + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + } + %switchbox_2_0 = aie.switchbox(%shim_noc_tile_2_0) { + aie.connect + %0 = aie.amsel<5> (3) + %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true} + aie.packet_rules(TileControl : 0) { + aie.rule(31, 15, %0) + } + } + %shim_mux_2_0 = aie.shim_mux(%shim_noc_tile_2_0) { + aie.connect + } + %switchbox_2_1 = aie.switchbox(%mem_tile_2_1) { + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + } + %switchbox_0_2 = aie.switchbox(%tile_0_2) { + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + } + %switchbox_0_3 = aie.switchbox(%tile_0_3) { + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + } + %switchbox_0_4 = aie.switchbox(%tile_0_4) { + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + } + %switchbox_0_5 = aie.switchbox(%tile_0_5) { + aie.connect + aie.connect + aie.connect + } + %tile_1_2 = aie.tile(1, 2) + %switchbox_1_2 = aie.switchbox(%tile_1_2) { + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + 
aie.connect + } + %tile_1_3 = aie.tile(1, 3) + %switchbox_1_3 = aie.switchbox(%tile_1_3) { + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + } + %tile_1_4 = aie.tile(1, 4) + %switchbox_1_4 = aie.switchbox(%tile_1_4) { + aie.connect + aie.connect + aie.connect + } + %tile_2_2 = aie.tile(2, 2) + %switchbox_2_2 = aie.switchbox(%tile_2_2) { + aie.connect + aie.connect + aie.connect + aie.connect + } + %tile_2_3 = aie.tile(2, 3) + %switchbox_2_3 = aie.switchbox(%tile_2_3) { + aie.connect + aie.connect + } + %tile_2_4 = aie.tile(2, 4) + %switchbox_2_4 = aie.switchbox(%tile_2_4) { + aie.connect + } + aie.wire(%shim_mux_0_0 : North, %switchbox_0_0 : South) + aie.wire(%shim_noc_tile_0_0 : DMA, %shim_mux_0_0 : DMA) + aie.wire(%mem_tile_0_1 : Core, %switchbox_0_1 : Core) + aie.wire(%mem_tile_0_1 : DMA, %switchbox_0_1 : DMA) + aie.wire(%switchbox_0_0 : North, %switchbox_0_1 : South) + aie.wire(%tile_0_2 : Core, %switchbox_0_2 : Core) + aie.wire(%tile_0_2 : DMA, %switchbox_0_2 : DMA) + aie.wire(%switchbox_0_1 : North, %switchbox_0_2 : South) + aie.wire(%tile_0_3 : Core, %switchbox_0_3 : Core) + aie.wire(%tile_0_3 : DMA, %switchbox_0_3 : DMA) + aie.wire(%switchbox_0_2 : North, %switchbox_0_3 : South) + aie.wire(%tile_0_4 : Core, %switchbox_0_4 : Core) + aie.wire(%tile_0_4 : DMA, %switchbox_0_4 : DMA) + aie.wire(%switchbox_0_3 : North, %switchbox_0_4 : South) + aie.wire(%tile_0_5 : Core, %switchbox_0_5 : Core) + aie.wire(%tile_0_5 : DMA, %switchbox_0_5 : DMA) + aie.wire(%switchbox_0_4 : North, %switchbox_0_5 : South) + aie.wire(%switchbox_0_0 : East, %switchbox_1_0 : West) + aie.wire(%shim_mux_1_0 : North, %switchbox_1_0 : South) + aie.wire(%shim_noc_tile_1_0 : DMA, %shim_mux_1_0 : DMA) + aie.wire(%switchbox_0_1 : East, %switchbox_1_1 : West) + aie.wire(%mem_tile_1_1 : Core, %switchbox_1_1 : Core) + aie.wire(%mem_tile_1_1 : DMA, %switchbox_1_1 : DMA) + aie.wire(%switchbox_1_0 : North, %switchbox_1_1 : South) + aie.wire(%switchbox_0_2 : East, %switchbox_1_2 : 
West) + aie.wire(%tile_1_2 : Core, %switchbox_1_2 : Core) + aie.wire(%tile_1_2 : DMA, %switchbox_1_2 : DMA) + aie.wire(%switchbox_1_1 : North, %switchbox_1_2 : South) + aie.wire(%switchbox_0_3 : East, %switchbox_1_3 : West) + aie.wire(%tile_1_3 : Core, %switchbox_1_3 : Core) + aie.wire(%tile_1_3 : DMA, %switchbox_1_3 : DMA) + aie.wire(%switchbox_1_2 : North, %switchbox_1_3 : South) + aie.wire(%switchbox_0_4 : East, %switchbox_1_4 : West) + aie.wire(%tile_1_4 : Core, %switchbox_1_4 : Core) + aie.wire(%tile_1_4 : DMA, %switchbox_1_4 : DMA) + aie.wire(%switchbox_1_3 : North, %switchbox_1_4 : South) + aie.wire(%switchbox_1_0 : East, %switchbox_2_0 : West) + aie.wire(%shim_mux_2_0 : North, %switchbox_2_0 : South) + aie.wire(%shim_noc_tile_2_0 : DMA, %shim_mux_2_0 : DMA) + aie.wire(%switchbox_1_1 : East, %switchbox_2_1 : West) + aie.wire(%mem_tile_2_1 : Core, %switchbox_2_1 : Core) + aie.wire(%mem_tile_2_1 : DMA, %switchbox_2_1 : DMA) + aie.wire(%switchbox_2_0 : North, %switchbox_2_1 : South) + aie.wire(%switchbox_1_2 : East, %switchbox_2_2 : West) + aie.wire(%tile_2_2 : Core, %switchbox_2_2 : Core) + aie.wire(%tile_2_2 : DMA, %switchbox_2_2 : DMA) + aie.wire(%switchbox_2_1 : North, %switchbox_2_2 : South) + aie.wire(%switchbox_1_3 : East, %switchbox_2_3 : West) + aie.wire(%tile_2_3 : Core, %switchbox_2_3 : Core) + aie.wire(%tile_2_3 : DMA, %switchbox_2_3 : DMA) + aie.wire(%switchbox_2_2 : North, %switchbox_2_3 : South) + aie.wire(%switchbox_1_4 : East, %switchbox_2_4 : West) + aie.wire(%tile_2_4 : Core, %switchbox_2_4 : Core) + aie.wire(%tile_2_4 : DMA, %switchbox_2_4 : DMA) + aie.wire(%switchbox_2_3 : North, %switchbox_2_4 : South) + } {dlti.dl_spec = #dlti.dl_spec} + aie.device(npu2) { + aie.runtime_sequence @mul_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { + aiex.configure @mul_kernel_0 { + aiex.run @mul_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, 
%arg6, %arg7, %arg8) : (memref<*xi16>, memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32) + } + } + } +} diff --git a/examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir b/examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir new file mode 100644 index 0000000..918aa51 --- /dev/null +++ b/examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir @@ -0,0 +1,431 @@ +#loop_annotation = #llvm.loop_annotation +module { + aie.device(npu2) @square_kernel_0 { + %shim_noc_tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info} + %shim_noc_tile_1_0 = aie.tile(1, 0) {controller_id = #aie.packet_info} + %mem_tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info} + %mem_tile_1_1 = aie.tile(1, 1) {controller_id = #aie.packet_info} + %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info} + %tile_0_3 = aie.tile(0, 3) {controller_id = #aie.packet_info} + %tile_0_4 = aie.tile(0, 4) {controller_id = #aie.packet_info} + %tile_0_5 = aie.tile(0, 5) {controller_id = #aie.packet_info} + %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32} + %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32} + %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32} + %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32} + %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32} + %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32} + %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32} + %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32} + %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32} + %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32} + %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32} + %lock_0_5 = 
aie.lock(%tile_0_5, 3) {init = 1 : i32} + %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32} + %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32} + %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32} + %buf9 = aie.buffer(%mem_tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf9"} : memref<1024xi16, 1 : i32> + %buf8 = aie.buffer(%mem_tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf8"} : memref<1024xi16, 1> + %buf7 = aie.buffer(%tile_0_5) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf7"} : memref<256xi16, 2> + %buf6 = aie.buffer(%tile_0_5) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf6"} : memref<256xi16, 2> + %buf5 = aie.buffer(%tile_0_4) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<256xi16, 2> + %buf4 = aie.buffer(%tile_0_4) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf4"} : memref<256xi16, 2> + %buf3 = aie.buffer(%tile_0_3) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<256xi16, 2> + %buf2 = aie.buffer(%tile_0_3) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf2"} : memref<256xi16, 2> + %buf1 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<256xi16, 2> + %buf0 = aie.buffer(%tile_0_2) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf0"} : memref<256xi16, 2> + %mem_0_5 = aie.mem(%tile_0_5) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1) + aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_5_12, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1) + aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + 
aie.use_lock(%lock_0_5_11, Release, 1) + aie.next_bd ^bb4 + } + %core_0_5 = aie.core(%tile_0_5) { + %c0_i32 = arith.constant 0 : i32 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb4 + aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1) + cf.br ^bb2(%c0 : index) + ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 + %1 = arith.cmpi slt, %0, %c256 : index + cf.cond_br %1, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %2 = vector.load %buf7[%0] : memref<256xi16, 2>, vector<32xi16> + %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> + %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> + vector.store %4, %buf6[%0] : memref<256xi16, 2>, vector<32xi16> + %5 = arith.addi %0, %c32 : index + cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} + ^bb4: // pred: ^bb2 + aie.use_lock(%lock_0_5, Release, 1) + aie.use_lock(%lock_0_5_13, Release, 1) + cf.br ^bb1 + } + %mem_0_4 = aie.mem(%tile_0_4) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_4_9, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_4_8, Release, 1) + aie.next_bd ^bb4 + } + %core_0_4 = aie.core(%tile_0_4) { + %c0_i32 = arith.constant 0 : i32 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb4 + aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1) + 
aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1) + cf.br ^bb2(%c0 : index) + ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 + %1 = arith.cmpi slt, %0, %c256 : index + cf.cond_br %1, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %2 = vector.load %buf5[%0] : memref<256xi16, 2>, vector<32xi16> + %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> + %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> + vector.store %4, %buf4[%0] : memref<256xi16, 2>, vector<32xi16> + %5 = arith.addi %0, %c32 : index + cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} + ^bb4: // pred: ^bb2 + aie.use_lock(%lock_0_4, Release, 1) + aie.use_lock(%lock_0_4_10, Release, 1) + cf.br ^bb1 + } + %mem_0_3 = aie.mem(%tile_0_3) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_3_6, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_3_5, Release, 1) + aie.next_bd ^bb4 + } + %core_0_3 = aie.core(%tile_0_3) { + %c0_i32 = arith.constant 0 : i32 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb4 + aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1) + cf.br ^bb2(%c0 : index) + ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 + %1 = arith.cmpi slt, %0, %c256 : index + cf.cond_br %1, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %2 = vector.load %buf3[%0] : memref<256xi16, 2>, vector<32xi16> + %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, 
vector<32xi32> + %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> + vector.store %4, %buf2[%0] : memref<256xi16, 2>, vector<32xi16> + %5 = arith.addi %0, %c32 : index + cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} + ^bb4: // pred: ^bb2 + aie.use_lock(%lock_0_3, Release, 1) + aie.use_lock(%lock_0_3_7, Release, 1) + cf.br ^bb1 + } + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) + aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_2_3, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_2_2, Release, 1) + aie.next_bd ^bb4 + } + %core_0_2 = aie.core(%tile_0_2) { + %c0_i32 = arith.constant 0 : i32 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb4 + aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1) + cf.br ^bb2(%c0 : index) + ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 + %1 = arith.cmpi slt, %0, %c256 : index + cf.cond_br %1, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %2 = vector.load %buf1[%0] : memref<256xi16, 2>, vector<32xi16> + %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> + %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> + vector.store %4, %buf0[%0] : memref<256xi16, 2>, vector<32xi16> + %5 = arith.addi %0, %c32 : index + cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} + ^bb4: // pred: ^bb2 + aie.use_lock(%lock_0_2, Release, 1) + aie.use_lock(%lock_0_2_4, Release, 1) + 
cf.br ^bb1 + } + %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1, Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb10 + } + %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + 
aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1_0, Release, 4) + aie.next_bd ^bb10 + } + aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0) + aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0) + aie.runtime_sequence @square_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { + %0 = aiex.dma_configure_task_for @air_channel_0 { + aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [, ]) + aie.end + } + aiex.dma_start_task(%0) + %1 = aiex.dma_configure_task_for @air_channel_3 { + aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [, ]) + aie.end + } {issue_token = true} + aiex.dma_start_task(%1) + aiex.dma_free_task(%0) + 
aiex.dma_await_task(%1) + } + aie.packet_flow(15) { + aie.packet_source<%shim_noc_tile_0_0, TileControl : 0> + aie.packet_dest<%shim_noc_tile_0_0, South : 0> + } {keep_pkt_header = true, priority_route = true} + aie.packet_flow(15) { + aie.packet_source<%shim_noc_tile_1_0, TileControl : 0> + aie.packet_dest<%shim_noc_tile_1_0, South : 0> + } {keep_pkt_header = true, priority_route = true} + %switchbox_0_0 = aie.switchbox(%shim_noc_tile_0_0) { + aie.connect + %0 = aie.amsel<5> (3) + %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true} + aie.packet_rules(TileControl : 0) { + aie.rule(31, 15, %0) + } + } + %shim_mux_0_0 = aie.shim_mux(%shim_noc_tile_0_0) { + aie.connect + } + %switchbox_0_1 = aie.switchbox(%mem_tile_0_1) { + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + } + %switchbox_1_0 = aie.switchbox(%shim_noc_tile_1_0) { + aie.connect + %0 = aie.amsel<5> (3) + %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true} + aie.packet_rules(TileControl : 0) { + aie.rule(31, 15, %0) + } + } + %shim_mux_1_0 = aie.shim_mux(%shim_noc_tile_1_0) { + aie.connect + } + %switchbox_1_1 = aie.switchbox(%mem_tile_1_1) { + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + } + %switchbox_0_2 = aie.switchbox(%tile_0_2) { + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + } + %switchbox_0_3 = aie.switchbox(%tile_0_3) { + aie.connect + aie.connect + aie.connect + aie.connect + aie.connect + } + %switchbox_0_4 = aie.switchbox(%tile_0_4) { + aie.connect + aie.connect + aie.connect + aie.connect + } + %switchbox_0_5 = aie.switchbox(%tile_0_5) { + aie.connect + aie.connect + } + %tile_1_2 = aie.tile(1, 2) + %switchbox_1_2 = aie.switchbox(%tile_1_2) { + aie.connect + aie.connect + aie.connect + aie.connect + } + %tile_1_3 = aie.tile(1, 3) + %switchbox_1_3 = aie.switchbox(%tile_1_3) { + aie.connect + aie.connect + } + %tile_1_4 = aie.tile(1, 4) + %switchbox_1_4 = aie.switchbox(%tile_1_4) { + aie.connect + 
} + aie.wire(%shim_mux_0_0 : North, %switchbox_0_0 : South) + aie.wire(%shim_noc_tile_0_0 : DMA, %shim_mux_0_0 : DMA) + aie.wire(%mem_tile_0_1 : Core, %switchbox_0_1 : Core) + aie.wire(%mem_tile_0_1 : DMA, %switchbox_0_1 : DMA) + aie.wire(%switchbox_0_0 : North, %switchbox_0_1 : South) + aie.wire(%tile_0_2 : Core, %switchbox_0_2 : Core) + aie.wire(%tile_0_2 : DMA, %switchbox_0_2 : DMA) + aie.wire(%switchbox_0_1 : North, %switchbox_0_2 : South) + aie.wire(%tile_0_3 : Core, %switchbox_0_3 : Core) + aie.wire(%tile_0_3 : DMA, %switchbox_0_3 : DMA) + aie.wire(%switchbox_0_2 : North, %switchbox_0_3 : South) + aie.wire(%tile_0_4 : Core, %switchbox_0_4 : Core) + aie.wire(%tile_0_4 : DMA, %switchbox_0_4 : DMA) + aie.wire(%switchbox_0_3 : North, %switchbox_0_4 : South) + aie.wire(%tile_0_5 : Core, %switchbox_0_5 : Core) + aie.wire(%tile_0_5 : DMA, %switchbox_0_5 : DMA) + aie.wire(%switchbox_0_4 : North, %switchbox_0_5 : South) + aie.wire(%switchbox_0_0 : East, %switchbox_1_0 : West) + aie.wire(%shim_mux_1_0 : North, %switchbox_1_0 : South) + aie.wire(%shim_noc_tile_1_0 : DMA, %shim_mux_1_0 : DMA) + aie.wire(%switchbox_0_1 : East, %switchbox_1_1 : West) + aie.wire(%mem_tile_1_1 : Core, %switchbox_1_1 : Core) + aie.wire(%mem_tile_1_1 : DMA, %switchbox_1_1 : DMA) + aie.wire(%switchbox_1_0 : North, %switchbox_1_1 : South) + aie.wire(%switchbox_0_2 : East, %switchbox_1_2 : West) + aie.wire(%tile_1_2 : Core, %switchbox_1_2 : Core) + aie.wire(%tile_1_2 : DMA, %switchbox_1_2 : DMA) + aie.wire(%switchbox_1_1 : North, %switchbox_1_2 : South) + aie.wire(%switchbox_0_3 : East, %switchbox_1_3 : West) + aie.wire(%tile_1_3 : Core, %switchbox_1_3 : Core) + aie.wire(%tile_1_3 : DMA, %switchbox_1_3 : DMA) + aie.wire(%switchbox_1_2 : North, %switchbox_1_3 : South) + aie.wire(%switchbox_0_4 : East, %switchbox_1_4 : West) + aie.wire(%tile_1_4 : Core, %switchbox_1_4 : Core) + aie.wire(%tile_1_4 : DMA, %switchbox_1_4 : DMA) + aie.wire(%switchbox_1_3 : North, %switchbox_1_4 : South) + } 
{dlti.dl_spec = #dlti.dl_spec} + aie.device(npu2) { + aie.runtime_sequence @square_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { + aiex.configure @square_kernel_0 { + aiex.run @square_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32) + } + } + } +} diff --git a/examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh b/examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh new file mode 100755 index 0000000..b7aa36c --- /dev/null +++ b/examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh @@ -0,0 +1,12 @@ +#!/bin/bash +set -e +# Repeater script for: resource allocation +echo "Original MLIR Diagnostics:" +cat << 'DIAGNOSTICS_EOF' +failed to legalize operation 'arith.subi' that was explicitly marked illegal: %120 = "arith.subi"(%118, %119) <{overflowFlags = #arith.overflow}> : (vector<32xi16>, vector<32xi16>) -> vector<32xi16> +DIAGNOSTICS_EOF +echo "" + +MLIR_FILE='air_project/aiecc_failure_1775797115_856352.mlir' +PASS_PIPELINE='any(canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},unknown,unknown,unknown<{anonymous}::CopyRemovalPass>,unknown,test-canonicalize-vector-for-aievec{aie-target=aie2p target-backend=llvmir},test-lower-vector-to-aievec{aie-target=aie2p target-backend=llvmir},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},unknown,cse,unknown,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},test-aievec-optimize{aie-target=aie2p target-backend=llvmir},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},aievec-convolution-analysis{print=false},test-aievec-convolution-optimize{aie-target=aie2p shift=0 target-backend=llvmir},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},loop-invariant-code-motion,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},lower-affine,aie-canonicalize-device,aie.device(aie-assign-lock-ids,aie-register-objectFifos,aie-objectFifo-stateful-transform{dynamic-objFifos=false packet-sw-objFifos=false},aie-assign-bd-ids,aie-lower-cascade-flows,aie-lower-broadcast-packet,aie-lower-multicast,aie-assign-tile-controller-ids{column-wise-unique-ids=true},aie-generate-column-control-overlay{route-shim-to-tct=shim-only route-shim-to-tile-ctrl=false},aie-assign-buffer-addresses{alloc-scheme=},aie-assign-core-link-files,aie-vector-transfer-lowering{max-transfer-rank=4294967295}),convert-scf-to-cf{allow-pattern-rollback=true})' +aie-opt --mlir-print-ir-after-all --mlir-disable-threading --pass-pipeline="$PASS_PIPELINE" "$MLIR_FILE" diff --git a/examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh b/examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh new file mode 100755 index 0000000..2f765b5 --- /dev/null +++ b/examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e +# Repeater script for: LLVM lowering +echo "Original MLIR Diagnostics:" +cat << 'DIAGNOSTICS_EOF' +aievec.mul_elem conversion is not supported for AIE2p. 
+ +failed to legalize operation 'aievec.mul_elem' that was explicitly marked illegal: %28 = "aievec.mul_elem"(%24, %27) : (vector<32xi16>, vector<32xi16>) -> vector<32xi32> +DIAGNOSTICS_EOF +echo "" + +MLIR_FILE='air_project/aiecc_failure_1775797139_858651.mlir' +PASS_PIPELINE='any(aie.device(aie-localize-locks,aie-normalize-address-spaces,aie-transform-bfp-types),aie-standard-lowering{device=mul_kernel_0 tilecol=0 tilerow=5},aiex-standard-lowering,convert-aievec-to-llvm{aie-target=aie2p aie2-fp32-emulation-strategy=accuracy-safe},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse,expand-strided-metadata,lower-affine,arith-expand{include-bf16=false include-f4e2m1=false include-f8e8m0=false},finalize-memref-to-llvm{index-bitwidth=0 use-aligned-alloc=false use-generic-functions=false},convert-func-to-llvm{index-bitwidth=0 use-bare-ptr-memref-call-conv=true},convert-to-llvm{allow-pattern-rollback=true dynamic=true },convert-vector-to-llvm{enable-arm-bf16=false enable-arm-i8mm=false enable-arm-neon=false enable-arm-sve=false enable-x86=false force-32bit-vector-indices=true reassociate-fp-reductions=false use-vector-alignment=false vector-contract-lowering=dot vector-transpose-lowering=eltwise},convert-ub-to-llvm{index-bitwidth=0},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse)' +aie-opt --mlir-print-ir-after-all --mlir-disable-threading --pass-pipeline="$PASS_PIPELINE" "$MLIR_FILE" diff --git a/examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh b/examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh new file mode 100755 index 0000000..e9fc1e4 --- /dev/null +++ b/examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh @@ -0,0 +1,14 @@ +#!/bin/bash +set -e +# Repeater script for: LLVM lowering +echo "Original MLIR Diagnostics:" +cat << 'DIAGNOSTICS_EOF' +aievec.mul_elem 
conversion is not supported for AIE2p. + +failed to legalize operation 'aievec.mul_elem' that was explicitly marked illegal: %21 = "aievec.mul_elem"(%20, %20) : (vector<32xi16>, vector<32xi16>) -> vector<32xi32> +DIAGNOSTICS_EOF +echo "" + +MLIR_FILE='air_project/aiecc_failure_1775797174_862028.mlir' +PASS_PIPELINE='any(aie.device(aie-localize-locks,aie-normalize-address-spaces,aie-transform-bfp-types),aie-standard-lowering{device=square_kernel_0 tilecol=0 tilerow=5},aiex-standard-lowering,convert-aievec-to-llvm{aie-target=aie2p aie2-fp32-emulation-strategy=accuracy-safe},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse,expand-strided-metadata,lower-affine,arith-expand{include-bf16=false include-f4e2m1=false include-f8e8m0=false},finalize-memref-to-llvm{index-bitwidth=0 use-aligned-alloc=false use-generic-functions=false},convert-func-to-llvm{index-bitwidth=0 use-bare-ptr-memref-call-conv=true},convert-to-llvm{allow-pattern-rollback=true dynamic=true },convert-vector-to-llvm{enable-arm-bf16=false enable-arm-i8mm=false enable-arm-neon=false enable-arm-sve=false enable-x86=false force-32bit-vector-indices=true reassociate-fp-reductions=false use-vector-alignment=false vector-contract-lowering=dot vector-transpose-lowering=eltwise},convert-ub-to-llvm{index-bitwidth=0},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse)' +aie-opt --mlir-print-ir-after-all --mlir-disable-threading --pass-pipeline="$PASS_PIPELINE" "$MLIR_FILE" diff --git a/examples/elementwise_arith/air_project/airinput.mlir b/examples/elementwise_arith/air_project/airinput.mlir new file mode 100644 index 0000000..d0b7377 --- /dev/null +++ b/examples/elementwise_arith/air_project/airinput.mlir @@ -0,0 +1,41 @@ +#map = affine_map<()[s0] -> (s0 * 256)> +module { + func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> 
{tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { + %c1 = arith.constant 1 : index + air.launch (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> { + air.segment @square_kernel_0 args(%arg16=%arg8, %arg17=%arg14, %arg18=%arg15) : index, memref<*xi16>, memref<*xi16> { + %c1024 = arith.constant 1024 : index + %c4 = arith.constant 4 : index + %c1_0 = arith.constant 1 : index + %0 = arith.muli %arg16, %c1024 : index + %alloc = memref.alloc() : memref<1024xi16, 1 : i32> + air.dma_memcpy_nd (%alloc[] [] [], %arg17[%0] [%c1024] [%c1_0]) {id = 1 : i32} : (memref<1024xi16, 1 : i32>, memref<*xi16>) + %alloc_1 = memref.alloc() : memref<1024xi16, 1> + air.herd @herd_0 tile (%arg19, %arg20) in (%arg21=%c4, %arg22=%c1_0) args(%arg23=%alloc, %arg24=%alloc_1) : memref<1024xi16, 1 : i32>, memref<1024xi16, 1> { + %1 = ub.poison : i16 + %c1_2 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %2 = affine.apply #map()[%arg19] + %alloc_3 = memref.alloc() : memref<256xi16, 2> + air.dma_memcpy_nd (%alloc_3[] [] [], %arg23[%2] [%c256] [%c1_2]) {id = 1 : i32} : (memref<256xi16, 2>, memref<1024xi16, 1 : i32>) + %alloc_4 = memref.alloc() : memref<256xi16, 2> + scf.for %arg25 = %c0 to %c256 step %c32 { + %subview = memref.subview %alloc_3[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_5 = memref.subview %alloc_4[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %3 = vector.transfer_read %subview[%c0], %1 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %4 = arith.muli %3, %3 : vector<32xi16> + vector.transfer_write %4, %subview_5[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } + air.dma_memcpy_nd (%arg24[%2] [%c256] 
[%c1_2], %alloc_4[] [] []) {id = 2 : i32} : (memref<1024xi16, 1>, memref<256xi16, 2>) + memref.dealloc %alloc_3 : memref<256xi16, 2> + memref.dealloc %alloc_4 : memref<256xi16, 2> + } + air.dma_memcpy_nd (%arg18[%0] [%c1024] [%c1_0], %alloc_1[] [] []) {id = 2 : i32} : (memref<*xi16>, memref<1024xi16, 1>) + memref.dealloc %alloc_1 : memref<1024xi16, 1> + } + } + return + } +} diff --git a/examples/elementwise_arith/air_project/asm_air_output.mlir b/examples/elementwise_arith/air_project/asm_air_output.mlir new file mode 100644 index 0000000..d0b7377 --- /dev/null +++ b/examples/elementwise_arith/air_project/asm_air_output.mlir @@ -0,0 +1,41 @@ +#map = affine_map<()[s0] -> (s0 * 256)> +module { + func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { + %c1 = arith.constant 1 : index + air.launch (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> { + air.segment @square_kernel_0 args(%arg16=%arg8, %arg17=%arg14, %arg18=%arg15) : index, memref<*xi16>, memref<*xi16> { + %c1024 = arith.constant 1024 : index + %c4 = arith.constant 4 : index + %c1_0 = arith.constant 1 : index + %0 = arith.muli %arg16, %c1024 : index + %alloc = memref.alloc() : memref<1024xi16, 1 : i32> + air.dma_memcpy_nd (%alloc[] [] [], %arg17[%0] [%c1024] [%c1_0]) {id = 1 : i32} : (memref<1024xi16, 1 : i32>, memref<*xi16>) + %alloc_1 = memref.alloc() : memref<1024xi16, 1> + air.herd @herd_0 tile (%arg19, %arg20) in (%arg21=%c4, %arg22=%c1_0) args(%arg23=%alloc, %arg24=%alloc_1) : memref<1024xi16, 1 : i32>, memref<1024xi16, 1> { + %1 = ub.poison : i16 + %c1_2 = arith.constant 1 : index + %c0 = arith.constant 0 : index + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %2 = affine.apply #map()[%arg19] + %alloc_3 = memref.alloc() : memref<256xi16, 2> + 
air.dma_memcpy_nd (%alloc_3[] [] [], %arg23[%2] [%c256] [%c1_2]) {id = 1 : i32} : (memref<256xi16, 2>, memref<1024xi16, 1 : i32>) + %alloc_4 = memref.alloc() : memref<256xi16, 2> + scf.for %arg25 = %c0 to %c256 step %c32 { + %subview = memref.subview %alloc_3[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_5 = memref.subview %alloc_4[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %3 = vector.transfer_read %subview[%c0], %1 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %4 = arith.muli %3, %3 : vector<32xi16> + vector.transfer_write %4, %subview_5[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } + air.dma_memcpy_nd (%arg24[%2] [%c256] [%c1_2], %alloc_4[] [] []) {id = 2 : i32} : (memref<1024xi16, 1>, memref<256xi16, 2>) + memref.dealloc %alloc_3 : memref<256xi16, 2> + memref.dealloc %alloc_4 : memref<256xi16, 2> + } + air.dma_memcpy_nd (%arg18[%0] [%c1024] [%c1_0], %alloc_1[] [] []) {id = 2 : i32} : (memref<*xi16>, memref<1024xi16, 1>) + memref.dealloc %alloc_1 : memref<1024xi16, 1> + } + } + return + } +} diff --git a/examples/elementwise_arith/air_project/asm_src.mlir b/examples/elementwise_arith/air_project/asm_src.mlir new file mode 100644 index 0000000..aa0162c --- /dev/null +++ b/examples/elementwise_arith/air_project/asm_src.mlir @@ -0,0 +1,34 @@ +#loc = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1) +#loc5 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:9) +#map = affine_map<(d0) -> (d0)> +#loc8 = loc("X"(#loc)) +#loc9 = loc("OUT"(#loc)) +#loc12 = loc("x"(#loc5)) +module { + func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32} loc("X"(#loc)), %arg1: memref<*xi16> {tt.divisibility = 16 : i32} loc("OUT"(#loc)), %arg2: i32 
loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg3: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg4: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg5: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg6: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg7: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)) { + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %0 = arith.muli %arg5, %c1024_i32 : i32 loc(#loc10) + %1 = arith.index_cast %0 : i32 to index loc(#loc3) + %reinterpret_cast = memref.reinterpret_cast %arg0 to offset: [%1], sizes: [1024], strides: [1] : memref<*xi16> to memref<1024xi16, strided<[1], offset: ?>> loc(#loc11) + %alloc = memref.alloc() : memref<1024xi16> loc(#loc12) + memref.copy %reinterpret_cast, %alloc : memref<1024xi16, strided<[1], offset: ?>> to memref<1024xi16> loc(#loc12) + %2 = bufferization.to_tensor %alloc restrict writable : memref<1024xi16> to tensor<1024xi16> loc(#loc12) + %reinterpret_cast_0 = memref.reinterpret_cast %arg1 to offset: [%1], sizes: [1024], strides: [1] : memref<*xi16> to memref<1024xi16, strided<[1], offset: ?>> loc(#loc3) + %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%2, %2 : tensor<1024xi16>, tensor<1024xi16>) outs(%2 : tensor<1024xi16>) { + ^bb0(%in: i16 loc("x"(#loc5)), %in_1: i16 loc("x"(#loc5)), %out: i16 loc("x"(#loc5))): + %4 = arith.muli %in, %in_1 : i16 loc(#loc6) + linalg.yield %4 : i16 loc(#loc6) + } -> tensor<1024xi16> loc(#loc6) + bufferization.materialize_in_destination %3 in writable %reinterpret_cast_0 : (tensor<1024xi16>, memref<1024xi16, strided<[1], offset: ?>>) -> () loc(#loc7) + return loc(#loc) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = 
loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:15) +#loc3 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:14) +#loc4 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:17) +#loc6 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:32) +#loc7 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:5) +#loc10 = loc("offsets"(#loc2)) +#loc11 = loc("x"(#loc4)) + diff --git a/examples/elementwise_arith/air_project/div_kernel_0.pdi b/examples/elementwise_arith/air_project/div_kernel_0.pdi new file mode 100644 index 0000000000000000000000000000000000000000..3681781c43b9bf80414d1863d6cae3111aa21810 GIT binary patch literal 15904 zcmeHOZ)_aJ6@UB3z9V(Gy~Iuz+aVj<5JN6-i49F4fpzRwF16ANA(h)k#Yjzowy3UD zT7OA7h$;UPVbUTEP=t$ws-k{4S8WuzUyc$GAZSeqQmR69g;Z^&qMi|zOmJ|2v%9nN z?))!lrHT(X(%QfGn>TOXdow$J-rIYdi2AzP()a)18+%{Wn%1rZf_?HnI&8HuX{If`lUJQe^C%8!V5?8JhHoenb+v z_TA2LSV)rCQG}$pDebw;DOU$@Lj-rTM>Wm~ga%PU`heI3bLMT8RHpP=x zN~}$!#E#CCXu2dNu3DcGd)KDKofoCVqfM!6(zK;$P~qx#zVA@s4I}^N%0v$VXWupQ z|Gpq$CNr>AnsbwFYXH#PGv{`FkUHIS`9#h~e7@Y{B;$j}VG~U+p?~AVjr~VM!lU8DoVfOb zD2btEqGJcG=3@sBHjAcLm$5f>Zuahly?d<|(R2d(nI-%?iT+K?`*$Dg-FFc7-h%!W zOV}%+|H|dn&WR#ED&|cVren+g6=Dv8%1)$k~w%>sv-PY;GOd(6e@A z!|U#eU8~wJoEh2hSo6q+U$l(;e6n@qZ?B-|&udTr{^-U*ckHpeh@yS%l7YKtD7t&xHLyy$*7 zDlJ2VnX_W$=Nz#z@9gt_Gja8Y`@C(1$r-UG@0_ulD8Kj3gjdQKKdJ94iGhLiHMfxS zOKK=wH*?J+bN`xMM}@>J?)FZzw65^@8K@bJa@Rh?X1{ze2pl-=m-y2{uM`j`K49J&B@4Fgd%eA&DQ++VxOaB+QSLZvfXHe#eF}ieC+|l5O{mFEu z!!g&BxM(1aTnt&+(4JT`>WEFYGdVW*9b^|i_0hORdC^^;78_rJu0Mto9w+ptNv!nK zq7E@WQ*@S^-%XGs^WU-B4r6E;kTGmkbIO^Q8ZYYd!T95y$zVLj7A%~=VUzI}??MbE zE2+3;>^0rJ>d0>2!(V7yd}{^vf`8J&DuykL*XXR z@(pQm$#Lvh$a|5kAS;xVSqWQ|4@7*t-H!GX<|t{O=liGr{)xulIl>{CPdUER)Z@vH zW5yjO3Yz6#_?8(2ULG#GFZpMB6g@n7UlyE0r##!=O!u{^+33vCK z33v0K5^nnK#QeXC97yjKs&{z|)R6(2*9Y0Y_Zfmlo@K9g)g6iggb(iUZ?mw{R#evRHS= 
zY!p6z9~g4n!cIrfPDkb`W2`V%n_@=~dcf`{Xx=0J^+{GRQ9dTa^AFL9C!bH5!{uLD zh5Kg2t+R=YY5SJEAK)LxMEYuf$V9o@j_aa}+{d0@V10PqC&x%x)qP4`t)9%F{;SoKwR-aZT~CJfovh-*da_bh^`Gij^`DZ->#eFMd#dWmN|{bo%1I_K zT~F?;)swY)vQ|&l>PZ|uwda$y=aaSPleOoQwda$cyyuf)eW&U}H>&THtm;1{tNKsL z#Vu~tvk>d9I?S*s^;^wjFfT0L2-Cu{X&t)Bek)sta; zr|LsDs_&Gn>OUo``cKOI-AVo#w|tjUeR6Hd&#=qur8-IMFzhj$W!Psp$M6ut!wlyc z9%Fc%;R3@&hNl=VF@)31y#lvPYr-pu0D*kgQ_VV~g~!$S-YGn{95jNx&H z3k(+-o?^Jfa5=(Ze7#lig|d#XhsCF{SJpVou+MOg;UR{H8O}32#_%}91%`_ZPcd9# zxE$dyzCBg(g|d!sH;YeWudH#FVV~g~!$S-YGn{95jNx&H3k(+-o?^JfaG7C;T21+G zb8xQ#tGK|+>QnJLMrySgFJYjQ!=5)H?6Z3z%Xp9R4&#aOIqWTMN8b-w#(QipI*cdA z=dgy_PUT)xyvNquVfw`Q9CD@feU`T@<2}YZjHigN{9jjElkay&FKRecSR zgL4hz+7Uh60Ieau?f7LPBmzeIfKM+@T%v5qgim%{ie`x?}{VnKYmZ+?91&oe%$ z?*~5dAB)Eq*k2+(^rMCGh1h@`u3vcOgz;7OHK^}Y1o7!U9iMzTQ?_P1IOyaK?$XfjvX2+ILM9wAE{{D=DIo_`$V zGGoi;Kd5b@{U2bJW8~jt`LF0h{)6w7$#KYkMIZ8Sviw){A^)Z%qI;^MZ+qFoHy9`{ z_eJA0{zrDvY0ijt?05ApjJU1h9G1w6ILJ0%#~X|%*+v*48-V!RI?aR&M}<1^qf zOTZ@?zZE<@3it-bcb6@b`7GlJ$AV`*OTHNR`MzZ`pG_1vfxc(7#AD%ejNx;P;j_$n zpbwv89zM&QhVwEWKAX(vSl=mI@)iC--=DVRE(yniXFf~582M}~pF5}})`xEzgU6TZ z0z2CoZ-d9%K>@#&@k#J2BfgvQ4d7$n7;l%`3EyK3?6}kIa4p0>_;}qJ-w$4|i`MT0 zALpFuC&A+@LV^83#t(pxbIy3X+{Ns4#r|}I*ZydJJNOGC`yJDoPl9iX_?sBt4}NvT z4={cJy!J=ivCF+I-d^TUFX1KqAghmpK2z>xc6#G^UGUnD=CLkj zeBVuLzO~$LH$-t|r`zNE*9Fhwjrm6K+K$%u!0UDAd>6Cd#q4zH`+8dQ@SoZ54fKP* u4Q1>GI7(n5+2pcaHta0Ak*kVlW*8;+Q8*_0-iVgN_x`jTer_ve)&2uCkqQU^ literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_elfs.bin new file mode 100644 index 0000000000000000000000000000000000000000..3542f70d63d78dc642ddc7dc29acd12d643307f5 GIT binary patch literal 10704 zcmeHLZEO@p7=Cy6?)GRXH{15=aM#*xDfF;gC?6spDejfVAT||HOx4EJL?ZDc)I`kv zsT{P(R|$nkG6ad0gcuVOw?>=hepM+12q=V-)QHUqP1HnUO)xo1%lXdTZo7EbmVfeN zCz;;vGc(V;?>jT^J{=bTtgCw-Km^#H1MuuuMp6*~WWj|kSk!rduc=~TWT=#PPgM(} zSun_gd2JY<$1Nn`El!pyW#AHkxu6c>vt$c|1w{ZiH%KasXb{r@J_7zkUAss>GI)X2 
z^!WgW!(GO1@2}RVR|OSRIY9x41p!b9nQwe#A_?$eb0y5d4%izx*-6RAie(?G3Hg|R zzK=aw>SJ4qeQd{EA3NsvtyW-Zp8&>+n;#G`t_jiWxpElrx%!q6{b!aedNt&VC#kor z5FfBmZ>{;H5m&BsUt1IX8|P7hB-ALS-@r<`9G*`1Mc>gnwxiD{Ug>iS#_k0EhK1(B zBwcpLHynPVd!vs2=>WJk+&{E9U6}towpVsiM9)Ri@9-u5}ltPeyXmCo-pJAnERT7fTyR~ zJU!Igg-`7O&3q*Ff{CMl1iT`s=6UGh%=n5l<1F_X!g8aeN&U8a#oZ=#l@Uv@!YCPa z`60UHx~#@E;bTu-oYmJWt6vB5CCoO~CRQIw{xwzthCGI+JtW3Uj8pgUX}zd|vEW|Q zA05V`(b4x0K=jyU*#|w5)0ioX(%IuVAnlkPCej(2lCp1~5HGRp)6sgt@{!HIKw<m)b^``9&1;|CEl@?VU66xIjsrlSGp{fAd ziB4EB#@?Mx*jBHil@qZP*xY&rW6>;Cr}b0EJ_PHz4=b%X`OMReXYOcfe1FKB8joWQ z2{UlIaR2*uq7QM&Yhg3+`FG_V-K47k9v6E%2ls+LDB&umOWI!=>XNn3Is^|eA*!&N z1BBIJ1ajL69oe9;`Te+K@qG(MD-{*+@~EVX84vWhf1b(&kM2tnVKy_?|EV zJf3`fNSquGt_h#$sW|1>ZNi9*jTQ9~C^N{Y-mG(ar^-Ou%>&I@+$# z(Z?}%VIKV!(d4NE)f3&xs?L!fMr|2&- zWItbKJGakmtq(OsMfJo7cF2V`wLiWmqLvWg;nFa z6S08l_}kh{s8LH8)Dj*iO~?`QMOG8W7DTrK_T0_w)qyJ*Gaj*Y{zLEB$umB&W#ntg z*qdN)j>($n(j|T#&^{bv_P6#24-{S1KPkF^9zT<1eJs6aX+S>C!YmDF$1DwK$1DwK z$3R^M0tc{Id}L?|@D=3M!e|x@vS3~trU;xNT^`h%OC7!G=uJm&X8b>qP#wML=uJm& zqVk4G{p{#X4fUa;HyyqCf9TD$7UU}Kzgp0aSz6GJSz6GJfx0e>-VA5ao0%94Wnx|% zHnrZYb@ZmAHyyp{=uJ#T&i$ryzvI`^B-{pO>2znRu}cFjmljc3O!EojFqEojF; zeIbk9+>}LcW@0duiFs|<)OvHXqcF7;IZ(=HP^roXX9lh!3O-FA&ntC&>@$8zB Rni|iJSz6GJSz3@|{{p#ZAFu!b literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_enable.bin new file mode 100644 index 0000000000000000000000000000000000000000..7cc1818bce4d6ce1226fc5fda519967a4842b99e GIT binary patch literal 104 zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>)) B1+xGE literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_init.bin new file mode 100644 index 0000000000000000000000000000000000000000..d4549ba181167c6e6d7475f23335e34fea2c3376 GIT binary patch literal 6032 
zcmb7{&rak<5XQ@%hINw3u1Uy>EJ*QcI7AT#M?)kINHH2BaX`vMPg)mN>ymUudP@3&^d;#l($}PKNY6;mN#BybCv6+u_35mqPjyW9=@;sg z)`iu&Bwdl7lD;5)N&1TPHR&7DGtzU?x1{e$Thh7gr0z}C-Y@95Kj1=K_e*vRasT2A z`KGw%NtC~$_drR!Af6K!;#1rYonOBXO5z3Gubj9LpW;_N?bzRq9a$@;UpUZ4Mwdf~w*)RN!T zTGw0O+2D~L&k!+PZVspqp^>qxyLCZUYYv{=)g)M&yk)HtOi1`aSl`pz$Z~{VHC> z{wm~8zhkS{>@Pz8!5=I0ho0@PV=wrGo|TPu3xmfX=5+8#k7tP3R^-WgJe!Ecz0Z3* zJBWQ9<=Kinq2IpGzOohbc{t*JyYq{${(e{F$2@C)+CR@F|Df@&=r{W@PpQw>ig|vl z-i6*{KG=BId(6k{cs_@S#l1i1@f;%-&#>z8oFW$Y{-no)v!Lz|&O*DNXg+CuAD%p) z+W++V)c&W>r}jU6KIt>&-QrcKx@>wrfeNOYm3=@d_NZ zw|+x>4;)3bJR^PpjtW{nB))G4Db?$G()v+vO7%KEZ|nOoNU7cwf84d}i)35nL%m9) zUZqj5n{~S$^(u~f-3(ifdKE{#Db=fZZU-ih)>n&xd&hVAz^Pux=TW^Tu6I|q)q2#U zHTbJ2{~>V${xaeR#53^EBfd|32(IcOZtO_bqxD;h5z6W6t-#g!Bz^*}&xc-r46f!) z>of4z(euuUpMtA-6E}8D`NwMi``~*2njeCH5^6!G{ z`D%{yOZ(S+3a%8O|3dRUJE8m&wZAQJonLdDk5s*%7Mkzakr_t)DHkL4{@Vhl z{wcl*uJh~l1-L$+%*XWnG36iY_v1oy)KAZ!`1S4I2o|Ro1qz~3EajN7i78>jEaZQL C^4EL- literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.elf new file mode 100755 index 0000000000000000000000000000000000000000..33f5143237b2608b0fcdae4c633076f2bc60cb03 GIT binary patch literal 4132 zcma)9eN0=|6+h3<_&Hgquld?7gz`d2a2f+PI0>esHH=kJ=AdL-BDO^}#x}tU7={gV zHf4DYv}u||B~c^Ys*TDjbB~+YJ8r|;nZee ztx1e(p=~m8%KUeI)DHz2WGPM65Hu+T8xIJHk4$F(e%8Gc9z!f-y|c@rg;h!x)@rk` z(rp&DYm0>)sM~@IsJUI^?8I0o-m^cY+_3|bfuv=H(%(Tdb-^>LE#{8Vv5}<3a6MTdBu=HVZxg!9 zGPLP4%R<|eQwKblp9iq!!1Kc@sKiRIp?>GoclJ)J0vU#+414M`t%#~;fSEC|d}eH{ zoRz-6j=$@V=I;gc_kvi#O6SquzD9hvP+zxxe5cXh=`r;8G1}L!;V*^yhV}iOL4Rj> zz6)r-R{j2k`bVhW*|GAOvm(a#U$nPReeb}-*CQ=Yl=FNKcdfG?=f#Sd^O$cz8R7YM zb$^vVQo-~6LEk$5UKy*Hc||N|1?A|^Hg#m^EB#@7JEoqC&uaUn=nHz`eTiPUj%^;> zEo>iSOG)#c2KnML{#F;=o$su!xp+5TyQLyt>)af#ZL5se-k|fHMOzPRuK*V^I-@@v?Xcskv3A@jfGDo9Anc-n1-R8``w zFOl18kRh@C%On3jm#AM_ioFcVr4J+vOvm3AjIdd~J-!WO9B&5$4^g9Rd@l*9Dm^|A zMla|&vTg5@SpqgeMss1d3;OzDBiI&ayYBVX+4v1=-|;D)a{C|8o2u`!y16p8c$F|2 
zoAMrE-@jIbwY-Iq~frn680Q>6`uzi^gY$7aVw%TimVGFRwy;iJ6x9Cv)v9fW(>WpM|A~T59#h5x^ zQUry$Ojfq@8rpg$@P-EpUk6s`v9bcp@$Dp8Ykn#L;q|X6HmW&PczF)Z`kZ{`wZ^jo zB|Co3X3mbsu?Z1xV3m0O$&;8vN;K=Tb^Mh+Uv%-HM+Pt%*|Ckd7d*E_Tt$`Ce5q|x zvj6;?U;-v6R@VA5VXat!!f`?u_gUGtIoz>`4`H>kRRJ@vib`5NFypzcMr}Xg9I2I< zAB<1`{;`rj5k`R5laIFp%dmouI)o^Ni(2mRf*pD))gYzsESg2Ujykhz}U=uRq4G2qtYlX5VAA zU#);@0qaTV(zul_J%P1r3Mtw`ufm>!D@f=oFLs}C*C6LM2(eMyo5D$SXtFK39~3A ztm;e&J27dciE~!wz;!2v66V4FL^q*{cEX^Y@H*LrjY6@3wYH%Kr6EAeQ#@V|@E43$ zk6HEp!|d4QcPxgX#ovmF(`j~eS+X099+961j1R|X_WJ(dg`)1cl}8uQ*WP)^`gp^U zW1-;|+3W3)o$h)^J)ZACzdv$R9t{mg!-Ek`D%<*Ev7u;7L&MQ$RBKSxOcmngQ38lpexwq>}l$81)T0s(CzAV@8Hqh zPeL9v!ktVp;$%)HKn3!tJiIl>^Vidz41YOw^9O2{1bok;JwB^c65t!iB=Ynfh?BYM z2(TZqBlmz6;4s(cVIN{S&%O^a$D=T;jVG`rxKDa+oTR>UEU(4bZ`x?YNKHa_nG3Pn)@h-%27V!sw0Bx)A zcUR%(R^iAh{L@wVMZ{oK7l`L~mSfKfp5Rzn!B8Iy#bdy?`9LUwO!W9b%-@53Uohbja%0=>WnUF)Fq8B-CwiLxOoV{Fb^Rx9COwOcqqLik$|r! zbTk}sg3pJCxpBxB4fy;0!@k~Ncob!7+z=QX4*49uM(7#o&Hgw!=THsMD>BiTe>eud zD0<+WZ`9vE67qHI=P+;#d;|V)#21c)W57A^1!9B4QGFDb`~1Vh{^PntRfi%$$PUt! zSsctCsRAy?fOhd87yoI>?mG`I@1qjm3cRVi0k;M6{Qs{`L}U2w=5`JZ%lO^=vBvXG z_rvWCd2)t!lPKWV5r6jLk<9+D#=9HvIOeFi5MWOZss1>u#+Gk4h<2CwWE$!0lZZL5 PXmTU|tLAg+dcFTY7d$5= literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script new file mode 100644 index 0000000..fc4f0cf --- /dev/null +++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script @@ -0,0 +1,72 @@ + +MEMORY +{ + program (RX) : ORIGIN = 0, LENGTH = 0x0020000 + data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 +} +ENTRY(__start) +SECTIONS +{ + . = 0x0; + .text : { + /* the __start symbol has to come at address zero. 
*/ + *crt0.o(.text*) + _ctors_start = .; + _init_array_start = .; + KEEP(SORT(*.init_array)) + _ctors_end = .; + _init_array_end = .; + _dtors_start = .; + _dtors_end = .; + *(.text*) + } > program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +/* No tile with memory exists to the south. */ +. = 0x40000; +. += 0x10000; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +. = 0x60400; +buf5 = .; +. += 0x400; +. = 0x64000; +buf4 = .; +. += 0x400; +. = 0x68000; +buf3 = .; +. += 0x400; +. = 0x70400; +buf2 = .; +. += 0x400; +. = 0x74000; +buf1 = .; +. += 0x400; +. = 0x78000; +buf0 = .; +. += 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_2); diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll new file mode 100644 index 0000000..bf98238 --- /dev/null +++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +; Function Attrs: noinline +define float @__aie2p_scalar_fdiv(float %0, float %1) 
#0 { + %3 = call float @llvm.aie2p.inv(float %1) + %4 = fmul float %0, %3 + ret float %4 +} + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_2() { + br label %1 + +1: ; preds = %76, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %75, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %76 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf2, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = getelementptr float, ptr @buf1, i32 %3 + %9 = load <16 x float>, ptr %8, align 4 + %10 = extractelement <16 x float> %7, i64 0 + %11 = extractelement <16 x float> %9, i64 0 + %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) + %13 = insertelement <16 x float> poison, float %12, i64 0 + %14 = extractelement <16 x float> %7, i64 1 + %15 = extractelement <16 x float> %9, i64 1 + %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) + %17 = insertelement <16 x float> %13, float %16, i64 1 + %18 = extractelement <16 x float> %7, i64 2 + %19 = extractelement <16 x float> %9, i64 2 + %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) + %21 = insertelement <16 x float> %17, float %20, i64 2 + %22 = extractelement <16 x float> %7, i64 3 + %23 = extractelement <16 x float> %9, i64 3 + %24 = call float @__aie2p_scalar_fdiv(float %22, 
float %23) + %25 = insertelement <16 x float> %21, float %24, i64 3 + %26 = extractelement <16 x float> %7, i64 4 + %27 = extractelement <16 x float> %9, i64 4 + %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) + %29 = insertelement <16 x float> %25, float %28, i64 4 + %30 = extractelement <16 x float> %7, i64 5 + %31 = extractelement <16 x float> %9, i64 5 + %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) + %33 = insertelement <16 x float> %29, float %32, i64 5 + %34 = extractelement <16 x float> %7, i64 6 + %35 = extractelement <16 x float> %9, i64 6 + %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) + %37 = insertelement <16 x float> %33, float %36, i64 6 + %38 = extractelement <16 x float> %7, i64 7 + %39 = extractelement <16 x float> %9, i64 7 + %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) + %41 = insertelement <16 x float> %37, float %40, i64 7 + %42 = extractelement <16 x float> %7, i64 8 + %43 = extractelement <16 x float> %9, i64 8 + %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) + %45 = insertelement <16 x float> %41, float %44, i64 8 + %46 = extractelement <16 x float> %7, i64 9 + %47 = extractelement <16 x float> %9, i64 9 + %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) + %49 = insertelement <16 x float> %45, float %48, i64 9 + %50 = extractelement <16 x float> %7, i64 10 + %51 = extractelement <16 x float> %9, i64 10 + %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) + %53 = insertelement <16 x float> %49, float %52, i64 10 + %54 = extractelement <16 x float> %7, i64 11 + %55 = extractelement <16 x float> %9, i64 11 + %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) + %57 = insertelement <16 x float> %53, float %56, i64 11 + %58 = extractelement <16 x float> %7, i64 12 + %59 = extractelement <16 x float> %9, i64 12 + %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) + %61 = insertelement <16 x float> %57, float %60, i64 12 + %62 = extractelement 
<16 x float> %7, i64 13 + %63 = extractelement <16 x float> %9, i64 13 + %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) + %65 = insertelement <16 x float> %61, float %64, i64 13 + %66 = extractelement <16 x float> %7, i64 14 + %67 = extractelement <16 x float> %9, i64 14 + %68 = call float @__aie2p_scalar_fdiv(float %66, float %67) + %69 = insertelement <16 x float> %65, float %68, i64 14 + %70 = extractelement <16 x float> %7, i64 15 + %71 = extractelement <16 x float> %9, i64 15 + %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) + %73 = insertelement <16 x float> %69, float %72, i64 15 + %74 = getelementptr float, ptr @buf0, i32 %3 + store <16 x float> %73, ptr %74, align 4 + %75 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +76: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare float @llvm.aie2p.inv(float) + +attributes #0 = { noinline } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.o new file mode 100644 index 0000000000000000000000000000000000000000..728d41f24858dcddcc40661dd9b21afbd5fbd4eb GIT binary patch literal 2048 zcma)7VQ3R)7=AC8G*_738AWV00Qk?Tg4MC=qDWzIhp- z5}hqfNRLb!561-+RBx35C^iJZpp|!YX0-ZI+GPhqCdLU*59`I#68bemtf?L4@(YH+Z9fbqCh38@(TFwBaycj6u^8v$ur;gvk`uxtL z-+3`;_{*pduQ9*-$hX&@-xT^y<a{ie7Lot7zTYMEyF}})pnk3U_YC=U+`r5D zK~Jg$hr(iL2Y>gpdMMhghwia*%=gj8svf!?&_llj^`Gaq=uiJd=0S7i?(McdHdosG zBGA>KUaVGPg&QK96e8AD@yL{j`Tk(}m#ZNPJ^;S28NT^U>gR6{lV-7+n9G}{3LR`TUgP@v9|RW9QdpQ|HpwZIq+?+ z6`u&!w$GRYpK;*7JMcvZp2hnl&<7l25_3CW@2`ZDjzSwvrIT7VrDwH7A~}@m8cAdZ zlIi4VV(|3P7?=!bXNR*X?Tgsa_DnW8@D23#9y@$sD49+TWDgIY&ZSeJWwN8$, ptr %5, align 64 + %7 = getelementptr float, ptr 
@buf1, i20 %4 + %8 = load <16 x float>, ptr %7, align 64 + %9 = extractelement <16 x float> %6, i64 0 + %10 = extractelement <16 x float> %8, i64 0 + %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10) + %12 = insertelement <16 x float> poison, float %11, i64 0 + %13 = extractelement <16 x float> %6, i64 1 + %14 = extractelement <16 x float> %8, i64 1 + %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14) + %16 = insertelement <16 x float> %12, float %15, i64 1 + %17 = extractelement <16 x float> %6, i64 2 + %18 = extractelement <16 x float> %8, i64 2 + %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18) + %20 = insertelement <16 x float> %16, float %19, i64 2 + %21 = extractelement <16 x float> %6, i64 3 + %22 = extractelement <16 x float> %8, i64 3 + %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22) + %24 = insertelement <16 x float> %20, float %23, i64 3 + %25 = extractelement <16 x float> %6, i64 4 + %26 = extractelement <16 x float> %8, i64 4 + %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26) + %28 = insertelement <16 x float> %24, float %27, i64 4 + %29 = extractelement <16 x float> %6, i64 5 + %30 = extractelement <16 x float> %8, i64 5 + %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30) + %32 = insertelement <16 x float> %28, float %31, i64 5 + %33 = extractelement <16 x float> %6, i64 6 + %34 = extractelement <16 x float> %8, i64 6 + %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34) + %36 = insertelement <16 x float> %32, float %35, i64 6 + %37 = extractelement <16 x float> %6, i64 7 + %38 = extractelement <16 x float> %8, i64 7 + %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38) + %40 = insertelement <16 x float> %36, float %39, i64 7 + %41 = extractelement <16 x float> %6, i64 8 + %42 = extractelement <16 x float> %8, i64 8 + %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42) + %44 = insertelement <16 x float> %40, float %43, i64 
8 + %45 = extractelement <16 x float> %6, i64 9 + %46 = extractelement <16 x float> %8, i64 9 + %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46) + %48 = insertelement <16 x float> %44, float %47, i64 9 + %49 = extractelement <16 x float> %6, i64 10 + %50 = extractelement <16 x float> %8, i64 10 + %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50) + %52 = insertelement <16 x float> %48, float %51, i64 10 + %53 = extractelement <16 x float> %6, i64 11 + %54 = extractelement <16 x float> %8, i64 11 + %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54) + %56 = insertelement <16 x float> %52, float %55, i64 11 + %57 = extractelement <16 x float> %6, i64 12 + %58 = extractelement <16 x float> %8, i64 12 + %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58) + %60 = insertelement <16 x float> %56, float %59, i64 12 + %61 = extractelement <16 x float> %6, i64 13 + %62 = extractelement <16 x float> %8, i64 13 + %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62) + %64 = insertelement <16 x float> %60, float %63, i64 13 + %65 = extractelement <16 x float> %6, i64 14 + %66 = extractelement <16 x float> %8, i64 14 + %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66) + %68 = insertelement <16 x float> %64, float %67, i64 14 + %69 = extractelement <16 x float> %6, i64 15 + %70 = extractelement <16 x float> %8, i64 15 + %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70) + %72 = insertelement <16 x float> %68, float %71, i64 15 + %73 = getelementptr float, ptr @buf0, i20 %4 + store <16 x float> %72, ptr %73, align 64 + %74 = add nuw nsw i32 %3, 16 + %75 = icmp ult i32 %3, 240 + br i1 %75, label %2, label %76, !llvm.loop !1 + +76: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 53, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: nofree nosync nounwind memory(none) +declare float 
@llvm.aie2p.inv(float) #3 + +attributes #0 = { nofree noinline nosync nounwind memory(none) } +attributes #1 = { nounwind } +attributes #2 = { noreturn nounwind } +attributes #3 = { nofree nosync nounwind memory(none) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll new file mode 100644 index 0000000..61bace1 --- /dev/null +++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +; Function Attrs: noinline +define float @__aie2p_scalar_fdiv(float %0, float %1) #0 { + %3 = call float @llvm.aie2p.inv(float %1) + %4 = fmul float %0, %3 + ret float %4 +} + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; 
Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_2() { + br label %1 + +1: ; preds = %76, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %75, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %76 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf2, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = getelementptr float, ptr @buf1, i32 %3 + %9 = load <16 x float>, ptr %8 + %10 = extractelement <16 x float> %7, i64 0 + %11 = extractelement <16 x float> %9, i64 0 + %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) + %13 = insertelement <16 x float> poison, float %12, i64 0 + %14 = extractelement <16 x float> %7, i64 1 + %15 = extractelement <16 x float> %9, i64 1 + %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) + %17 = insertelement <16 x float> %13, float %16, i64 1 + %18 = extractelement <16 x float> %7, i64 2 + %19 = extractelement <16 x float> %9, i64 2 + %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) + %21 = insertelement <16 x float> %17, float %20, i64 2 + %22 = extractelement <16 x float> %7, i64 3 + %23 = extractelement <16 x float> %9, i64 3 + %24 = call float @__aie2p_scalar_fdiv(float %22, float %23) + %25 = insertelement <16 x float> %21, float %24, i64 3 + %26 = extractelement <16 x float> %7, i64 4 + %27 = extractelement <16 x float> %9, i64 4 + %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) + %29 = insertelement <16 x float> %25, float %28, i64 4 + %30 = extractelement <16 x float> %7, i64 5 + %31 = extractelement <16 x float> %9, i64 5 + %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) + %33 = insertelement <16 x float> %29, float %32, i64 5 + %34 = extractelement <16 x float> %7, i64 6 + %35 = extractelement <16 
x float> %9, i64 6 + %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) + %37 = insertelement <16 x float> %33, float %36, i64 6 + %38 = extractelement <16 x float> %7, i64 7 + %39 = extractelement <16 x float> %9, i64 7 + %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) + %41 = insertelement <16 x float> %37, float %40, i64 7 + %42 = extractelement <16 x float> %7, i64 8 + %43 = extractelement <16 x float> %9, i64 8 + %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) + %45 = insertelement <16 x float> %41, float %44, i64 8 + %46 = extractelement <16 x float> %7, i64 9 + %47 = extractelement <16 x float> %9, i64 9 + %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) + %49 = insertelement <16 x float> %45, float %48, i64 9 + %50 = extractelement <16 x float> %7, i64 10 + %51 = extractelement <16 x float> %9, i64 10 + %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) + %53 = insertelement <16 x float> %49, float %52, i64 10 + %54 = extractelement <16 x float> %7, i64 11 + %55 = extractelement <16 x float> %9, i64 11 + %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) + %57 = insertelement <16 x float> %53, float %56, i64 11 + %58 = extractelement <16 x float> %7, i64 12 + %59 = extractelement <16 x float> %9, i64 12 + %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) + %61 = insertelement <16 x float> %57, float %60, i64 12 + %62 = extractelement <16 x float> %7, i64 13 + %63 = extractelement <16 x float> %9, i64 13 + %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) + %65 = insertelement <16 x float> %61, float %64, i64 13 + %66 = extractelement <16 x float> %7, i64 14 + %67 = extractelement <16 x float> %9, i64 14 + %68 = call float @__aie2p_scalar_fdiv(float %66, float %67) + %69 = insertelement <16 x float> %65, float %68, i64 14 + %70 = extractelement <16 x float> %7, i64 15 + %71 = extractelement <16 x float> %9, i64 15 + %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) + 
%73 = insertelement <16 x float> %69, float %72, i64 15 + %74 = getelementptr float, ptr @buf0, i32 %3 + store <16 x float> %73, ptr %74 + %75 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +76: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare float @llvm.aie2p.inv(float) + +attributes #0 = { noinline } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.elf new file mode 100755 index 0000000000000000000000000000000000000000..a92e4ae85366d8708c15091ef67871349c6409af GIT binary patch literal 4192 zcma)9eQZp-&G1j-|P#1tp7V*@cAtwXa4F@sLfIp>~x&bc4He*WvhXC+AjgG^v6A|DF3*#XSh`#=E`I6(#rkhO}y>gFAn*QIQa@WHn`|!hm7C?Z z`Wj@AxNWP>xtOKWeAo4)dfNp;0hU)(&VL83+y_rnwwIKE@(@7NIf`v~ou=JA(9ee=Tpj-$Wh zl4vuD>4vNP+Rmy~kLJ@2uh*s< zepQ$L<;1e|Ki@^oUzgwd^M&;td}4gbC$%kB<>maXotZbxd{_$f>l+t13()zYQ}uz# z+6Gks`o88M;pyqrPY<_^A~y`sNh7%pbUEh7z-9tPKMxnkGeUH8M^rw`M5W61DZd}v z`1wA?pP9&u8kJ33szKdzT~=}~={0M6P6UIF&0%1_hDS0_^OW63&MxU%VgihA7<<7u71%kIepMX4Z)3Omv$b`_854iO*3O9DN0^A&9In;Ov=YQ)K?l~N)9lv#iJD<18ZB&MWVR2h zi!rsss2Nn@vpU53%V--I!5bbpd=*4xz#%Fy$G5U4#FfiE#R;Eg{m{V0}6oEA`ULby%4xz##Piv z%a=PxW!KM6N>&h(>JVFAVPXqbpmLb;sqGH2W)gQS;yqZcLRG*L)VxBeKvDp~dU zglm(-b*T>OC9Ego)58uv{W#XHjjG>o(*3l9VpX3PjhEO3^!-yhBe(;(JK*3So0*WZ zOT_Z+4&Lziy!$P$RCCN1Jv{h)>3WKv~;F*&S;l_1mrllf){gF<_Gh3N}t&HmAk`_x#OkztLYET*jv^+}j z27q2LdOaqz`wz1d=iauP`lo(t&K%2&12eMAWC@t*JYak{Mz=Ti2NjC@Crghm;IF^^ zfb|K+69;1ho0VX&UGeyv+)a4CBR%270p(C^AQ|sV=u*Ymol5m5H#avQh^M*-yP6_> zz0Do*o_HeN+|zTYw=o=#DP7)3)a{RYy|HlA>-KN-wMJu+O;KOe>*?}#`63>FEb8~| z_phVq?j@-VTHyBV62#e}OoBS(b7go{k*C+w?OA#`b*FpKe#5AtW$7f{jhR|>dfzZx;cXV^C*CZ34K`w%MyL=ON7 
z+UDSI&B4#j!HGHeM|1E?h{2+52+HpS;XnzG5LQe0Y{6bzP_q9?!M=pg7wk(Ino_ZJ z3g}q$#uCUR5BH|RUD$UgQv(`T?8A|Rp=A7ru_Qn+xOYc;JlqqDq;~X02YX`B6x_Pi z6Y}VN3-k{5B=>upA_ajvv_bE^dcR5UTMU86txG(HsmHMK8=4k_F(fU9q{V0WYc&`@ zg%FKX2$6x5rzt|A>>o@-LS3-~@q`CLAw2vo{h?$e+!G!M?T^L}p-hV#B7Fm~kUQjs zuEG5q=;Nc0S2IAb$RtzYffR(2=z+M

WS&EY!Y(VB{c#dc*NVD4vL?fH(+6QhftS zV-(3l;emnhVMC&+V~Hpf2Id?T nz@{S7{1L3jR&IA5?S4*^>141^ASPbb< program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +. = 0x40400; +buf2 = .; +. += 0x400; +. = 0x44000; +buf1 = .; +. += 0x400; +. = 0x48000; +buf0 = .; +. += 0x400; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +. = 0x60400; +buf8 = .; +. += 0x400; +. = 0x64000; +buf7 = .; +. += 0x400; +. = 0x68000; +buf6 = .; +. += 0x400; +. = 0x70400; +buf5 = .; +. += 0x400; +. = 0x74000; +buf4 = .; +. += 0x400; +. = 0x78000; +buf3 = .; +. += 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_3); diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll new file mode 100644 index 0000000..666390f --- /dev/null +++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +; Function Attrs: noinline +define float @__aie2p_scalar_fdiv(float %0, float %1) #0 { + %3 = call float @llvm.aie2p.inv(float %1) + %4 = fmul 
float %0, %3 + ret float %4 +} + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_3() { + br label %1 + +1: ; preds = %76, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %75, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %76 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf5, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = getelementptr float, ptr @buf4, i32 %3 + %9 = load <16 x float>, ptr %8, align 4 + %10 = extractelement <16 x float> %7, i64 0 + %11 = extractelement <16 x float> %9, i64 0 + %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) + %13 = insertelement <16 x float> poison, float %12, i64 0 + %14 = extractelement <16 x float> %7, i64 1 + %15 = extractelement <16 x float> %9, i64 1 + %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) + %17 = insertelement <16 x float> %13, float %16, i64 1 + %18 = extractelement <16 x float> %7, i64 2 + %19 = extractelement <16 x float> %9, i64 2 + %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) + %21 = insertelement <16 x float> %17, float %20, i64 2 + %22 = extractelement <16 x float> %7, i64 3 + %23 = extractelement <16 x float> %9, i64 3 + %24 = call float @__aie2p_scalar_fdiv(float %22, float %23) + %25 = insertelement <16 x float> %21, float %24, i64 
3 + %26 = extractelement <16 x float> %7, i64 4 + %27 = extractelement <16 x float> %9, i64 4 + %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) + %29 = insertelement <16 x float> %25, float %28, i64 4 + %30 = extractelement <16 x float> %7, i64 5 + %31 = extractelement <16 x float> %9, i64 5 + %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) + %33 = insertelement <16 x float> %29, float %32, i64 5 + %34 = extractelement <16 x float> %7, i64 6 + %35 = extractelement <16 x float> %9, i64 6 + %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) + %37 = insertelement <16 x float> %33, float %36, i64 6 + %38 = extractelement <16 x float> %7, i64 7 + %39 = extractelement <16 x float> %9, i64 7 + %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) + %41 = insertelement <16 x float> %37, float %40, i64 7 + %42 = extractelement <16 x float> %7, i64 8 + %43 = extractelement <16 x float> %9, i64 8 + %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) + %45 = insertelement <16 x float> %41, float %44, i64 8 + %46 = extractelement <16 x float> %7, i64 9 + %47 = extractelement <16 x float> %9, i64 9 + %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) + %49 = insertelement <16 x float> %45, float %48, i64 9 + %50 = extractelement <16 x float> %7, i64 10 + %51 = extractelement <16 x float> %9, i64 10 + %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) + %53 = insertelement <16 x float> %49, float %52, i64 10 + %54 = extractelement <16 x float> %7, i64 11 + %55 = extractelement <16 x float> %9, i64 11 + %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) + %57 = insertelement <16 x float> %53, float %56, i64 11 + %58 = extractelement <16 x float> %7, i64 12 + %59 = extractelement <16 x float> %9, i64 12 + %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) + %61 = insertelement <16 x float> %57, float %60, i64 12 + %62 = extractelement <16 x float> %7, i64 13 + %63 = extractelement <16 x float> %9, 
i64 13 + %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) + %65 = insertelement <16 x float> %61, float %64, i64 13 + %66 = extractelement <16 x float> %7, i64 14 + %67 = extractelement <16 x float> %9, i64 14 + %68 = call float @__aie2p_scalar_fdiv(float %66, float %67) + %69 = insertelement <16 x float> %65, float %68, i64 14 + %70 = extractelement <16 x float> %7, i64 15 + %71 = extractelement <16 x float> %9, i64 15 + %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) + %73 = insertelement <16 x float> %69, float %72, i64 15 + %74 = getelementptr float, ptr @buf3, i32 %3 + store <16 x float> %73, ptr %74, align 4 + %75 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +76: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare float @llvm.aie2p.inv(float) + +attributes #0 = { noinline } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.o new file mode 100644 index 0000000000000000000000000000000000000000..0cbf9dffbcb51eefa1ba59e76899a9a17c429aac GIT binary patch literal 2048 zcma)7VQ3R)7=AC8G*_73EhFVyV&Uwj>DDxBYb`R7CB<$2s38oMGB>Yjjayk z@AEwO-1psm_uc1j4!tEw61X9OB2)lY3xJGdzb)v5R#?*#v=Fz6l_)Up7uq!>6|nX# zssOdbeCd>O+orKQD(Rs6>kL41Api!QyrL`X4S>%^x?n2`P>r6}53*{t`&DaSr)mXv zsMhZ7s&%+swF)n)*0(|R_2;1n#ifBe(;AdynE7LiKLMEcVE%Wj-=jp4UT!d@qYV=b zrVR2ds(H7_|3>%DJi`0vpw;ym#;(8~)(>wuTCcrStFWkgmKJ?d=~NxZgf!CDV7nfx zwZyCCR2Hs?A{0(dGW4}{6&H$C79I6zHAuD*jiQm%Z z{7$3aX*p~KtEi7|Fu$9~cQ>Ek8T30-M88|8k8R+$jQnfO{m!D_Sz2!m^&36E`^azN z`JF3Cj$cDg-4CAu}-aGKHDKD)X(vpGyyz$K3L_PgX2?PFXEvT03Qsm(sBKKh4tvL98%3GbB<|}!fyhH=f?+UnmByxN}TGOX6clBQi_`HDsDc~yt 
zzSFbj6UW^3nHKOl0smRRmjyhJ&q<;K9Ag&8Zob~12qzsyY%-Hg$MP97A2W>fc&0CB zOpT?p=}F_n@$qS}8Hjy6kUM!)oGfIgFcEGyHZhqodJM#d1_yhMK0fw3^V&pz05yDhw%UvT zt8aws=#f(B*eFNZjyNCPLE{PW>vgmj-EeiJp;sQ~#0nO)cWDdJe2AmEfKBw9!J;, ptr %5, align 64 + %7 = getelementptr float, ptr @buf4, i20 %4 + %8 = load <16 x float>, ptr %7, align 64 + %9 = extractelement <16 x float> %6, i64 0 + %10 = extractelement <16 x float> %8, i64 0 + %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10) + %12 = insertelement <16 x float> poison, float %11, i64 0 + %13 = extractelement <16 x float> %6, i64 1 + %14 = extractelement <16 x float> %8, i64 1 + %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14) + %16 = insertelement <16 x float> %12, float %15, i64 1 + %17 = extractelement <16 x float> %6, i64 2 + %18 = extractelement <16 x float> %8, i64 2 + %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18) + %20 = insertelement <16 x float> %16, float %19, i64 2 + %21 = extractelement <16 x float> %6, i64 3 + %22 = extractelement <16 x float> %8, i64 3 + %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22) + %24 = insertelement <16 x float> %20, float %23, i64 3 + %25 = extractelement <16 x float> %6, i64 4 + %26 = extractelement <16 x float> %8, i64 4 + %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26) + %28 = insertelement <16 x float> %24, float %27, i64 4 + %29 = extractelement <16 x float> %6, i64 5 + %30 = extractelement <16 x float> %8, i64 5 + %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30) + %32 = insertelement <16 x float> %28, float %31, i64 5 + %33 = extractelement <16 x float> %6, i64 6 + %34 = extractelement <16 x float> %8, i64 6 + %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34) + %36 = insertelement <16 x float> %32, float %35, i64 6 + %37 = extractelement <16 x float> %6, i64 7 + %38 = extractelement <16 x float> %8, i64 7 + %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38) + %40 = 
insertelement <16 x float> %36, float %39, i64 7 + %41 = extractelement <16 x float> %6, i64 8 + %42 = extractelement <16 x float> %8, i64 8 + %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42) + %44 = insertelement <16 x float> %40, float %43, i64 8 + %45 = extractelement <16 x float> %6, i64 9 + %46 = extractelement <16 x float> %8, i64 9 + %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46) + %48 = insertelement <16 x float> %44, float %47, i64 9 + %49 = extractelement <16 x float> %6, i64 10 + %50 = extractelement <16 x float> %8, i64 10 + %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50) + %52 = insertelement <16 x float> %48, float %51, i64 10 + %53 = extractelement <16 x float> %6, i64 11 + %54 = extractelement <16 x float> %8, i64 11 + %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54) + %56 = insertelement <16 x float> %52, float %55, i64 11 + %57 = extractelement <16 x float> %6, i64 12 + %58 = extractelement <16 x float> %8, i64 12 + %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58) + %60 = insertelement <16 x float> %56, float %59, i64 12 + %61 = extractelement <16 x float> %6, i64 13 + %62 = extractelement <16 x float> %8, i64 13 + %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62) + %64 = insertelement <16 x float> %60, float %63, i64 13 + %65 = extractelement <16 x float> %6, i64 14 + %66 = extractelement <16 x float> %8, i64 14 + %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66) + %68 = insertelement <16 x float> %64, float %67, i64 14 + %69 = extractelement <16 x float> %6, i64 15 + %70 = extractelement <16 x float> %8, i64 15 + %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70) + %72 = insertelement <16 x float> %68, float %71, i64 15 + %73 = getelementptr float, ptr @buf3, i20 %4 + store <16 x float> %72, ptr %73, align 64 + %74 = add nuw nsw i32 %3, 16 + %75 = icmp ult i32 %3, 240 + br i1 %75, label %2, label %76, !llvm.loop !1 + 
+76: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 53, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: nofree nosync nounwind memory(none) +declare float @llvm.aie2p.inv(float) #3 + +attributes #0 = { nofree noinline nosync nounwind memory(none) } +attributes #1 = { nounwind } +attributes #2 = { noreturn nounwind } +attributes #3 = { nofree nosync nounwind memory(none) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll new file mode 100644 index 0000000..0c167b0 --- /dev/null +++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +; Function Attrs: noinline +define float @__aie2p_scalar_fdiv(float %0, float %1) #0 { + %3 = call float @llvm.aie2p.inv(float %1) + %4 = fmul float %0, %3 + ret float %4 +} + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare 
{ i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_3() { + br label %1 + +1: ; preds = %76, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %75, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %76 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf5, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = getelementptr float, ptr @buf4, i32 %3 + %9 = load <16 x float>, ptr %8 + %10 = extractelement <16 x float> %7, i64 0 + %11 = extractelement <16 x float> %9, i64 0 + %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) + %13 = insertelement <16 x float> poison, float %12, i64 0 + %14 = extractelement <16 x float> %7, i64 1 + %15 = extractelement <16 x float> %9, i64 1 + %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) + %17 = insertelement <16 x float> %13, float %16, i64 1 + %18 = extractelement <16 x float> %7, i64 2 + %19 = extractelement <16 x float> %9, i64 2 + %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) + %21 = insertelement <16 x float> %17, float %20, i64 2 + %22 = extractelement <16 x float> %7, i64 3 + %23 = extractelement <16 x float> %9, i64 3 + %24 = call float @__aie2p_scalar_fdiv(float %22, float %23) + %25 = insertelement <16 x float> %21, float %24, i64 3 + %26 = extractelement <16 x float> %7, i64 4 + %27 = extractelement <16 x float> %9, i64 4 + %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) + %29 = insertelement <16 x float> %25, float %28, i64 4 + %30 = extractelement 
<16 x float> %7, i64 5 + %31 = extractelement <16 x float> %9, i64 5 + %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) + %33 = insertelement <16 x float> %29, float %32, i64 5 + %34 = extractelement <16 x float> %7, i64 6 + %35 = extractelement <16 x float> %9, i64 6 + %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) + %37 = insertelement <16 x float> %33, float %36, i64 6 + %38 = extractelement <16 x float> %7, i64 7 + %39 = extractelement <16 x float> %9, i64 7 + %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) + %41 = insertelement <16 x float> %37, float %40, i64 7 + %42 = extractelement <16 x float> %7, i64 8 + %43 = extractelement <16 x float> %9, i64 8 + %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) + %45 = insertelement <16 x float> %41, float %44, i64 8 + %46 = extractelement <16 x float> %7, i64 9 + %47 = extractelement <16 x float> %9, i64 9 + %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) + %49 = insertelement <16 x float> %45, float %48, i64 9 + %50 = extractelement <16 x float> %7, i64 10 + %51 = extractelement <16 x float> %9, i64 10 + %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) + %53 = insertelement <16 x float> %49, float %52, i64 10 + %54 = extractelement <16 x float> %7, i64 11 + %55 = extractelement <16 x float> %9, i64 11 + %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) + %57 = insertelement <16 x float> %53, float %56, i64 11 + %58 = extractelement <16 x float> %7, i64 12 + %59 = extractelement <16 x float> %9, i64 12 + %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) + %61 = insertelement <16 x float> %57, float %60, i64 12 + %62 = extractelement <16 x float> %7, i64 13 + %63 = extractelement <16 x float> %9, i64 13 + %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) + %65 = insertelement <16 x float> %61, float %64, i64 13 + %66 = extractelement <16 x float> %7, i64 14 + %67 = extractelement <16 x float> %9, i64 14 + %68 = call 
float @__aie2p_scalar_fdiv(float %66, float %67) + %69 = insertelement <16 x float> %65, float %68, i64 14 + %70 = extractelement <16 x float> %7, i64 15 + %71 = extractelement <16 x float> %9, i64 15 + %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) + %73 = insertelement <16 x float> %69, float %72, i64 15 + %74 = getelementptr float, ptr @buf3, i32 %3 + store <16 x float> %73, ptr %74 + %75 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +76: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare float @llvm.aie2p.inv(float) + +attributes #0 = { noinline } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.elf new file mode 100755 index 0000000000000000000000000000000000000000..b9ef327d81371a030c01aa887037ee689da34b56 GIT binary patch literal 4196 zcma)9eQZp-&G1e!P`05uj%|oV;$*SI z%qGSwNlPi922`Y5G1RnDTTPlYXWCelzq~adTD2RfAW^N8Jj%39TeTpylA8vybNt@3 zO`r)!y6>LfIp>~x&bc4HKJrZ9SxJ(>AQRXM$cw^&9l(se7Zfmo6J)ReS*zF#8eIfH ztqcVW+pd2kBZjWDx0V8(w@4r?&<@!}azVRh8NkhLW(B5PFyRMy35esC7uY0DZ3D46 zgHhdT8%-az{oNS#V@Ux;&T%aSUCO~p0U^=I+$_K^I@iIYh^4%Dao%hf%VoRR?6ix@ z)poIdgu^|*;#I@ zt3n2e+qUSOi&@IecU@1aw_P97zO9+oaB_ zS>E!6{Z`u(V>|trpC7R1K>1-6G!m7UQNM2Nds{DP0tNczS@Gl-dJ#3x0OLdE>hYnW zYEk*&0{*T%oWB>*-;3rNQF#sRUGv0u1N99H$9D|<9UDS_pQ3%^JpQt%Z(7*jarAeb z^1X%j^VRQPsDFt1ofxVfKVioB{)_gmu^;Sw@OtEwk!s3!Psal5amrjXehTv~t74RI zN9Q;BV>OiTkGdD|_v%p1_^ak>QC5xqoMU_YztJDYw|4CL)P%lYD!$^UKa~0DE7-1K zyMgUfY+3o*4wG_b2ETO`cdqTITXp76s(wXHs@}6KRo}KeRezOV+flLd(Ojzj_3Bjp zuWC~7PAp6P^8?iUb@{D7Us%_{C&rh2Ufpa}Ue4Xxkv?bU!;-&W-?+e;pUw}Rs*g?9 zHmCs5_caFzPfw?Qdbn*AxnY1#8p&;-%P~I&HWMiNdALBH5u%$rqWoDV%2l>k`Tf|2 
zul6dw^h8cnschO(3F_|avXXU4uUXr(A`ozF3IY2yJd%DYx9Rlk|7L0-EzjU-cbc-b z>9@W{ZmUUw^qQ~t{`+LQae6xW3aDp4lI?II^|oYzW!mlW9I#=$9ZVFWPWkv=ma?@* ze142xGIA8F-)9pHoRWg((nJSz_rPLsPEB;&?QU?=4eDC^Ii7OYA5U59?udrTDlv75 z2?d+_0TVyGT!FQ`fsqvm=R=e$+z)s|6mN-(Y6a>?X8$rXYs7OVvQ>|L$5Qmg&8>#3 zc68*2dHt378CN85og0BQGotqqCZaZnYZWuC0P$GBfwkx~yET8JYM6-z3!503?ZfI~ zOzkjg232^i4zcbs+WJTEh6fJc22t*Jh%(Ia?F^f5el7!}`mb7I(Q>E>P!6reoHX-% z<3*X8A3y1|<;UaLv>9(;jVS-j5zHZLwi&Vo{8heCab}la0kB%cfyKBN{5Qgp^q# zmTzL;d-(uqMDu=KKm}qKuh}!Qm zVON)m+T%;b;ZX-qpL7T}t~)a=6@KgwcQT&d!USw#R413TSXyEdo7+%>(kP(iQHs|O z^n%grF`?amn4LKPp54?x^;>iLSWfJpkzFQ>-%RHLWKBk;;E*do`by&p;%PuY6(Z&zDP?;G!$uZ`!;x6Bhm22h&R&W>1yfnhCRM$#OK}T zTT9X1OHv86!0p*3h%*J51U1NKOYq79Pp_xjv-EQ6qz7uABz(`JJwB^6lHfbYq)YVe zh%<%iNU$BTyYPUOU=Qg_a1gOlV&9FJmFQ`i`b+R3#GNJfM-it>@Cn5D)YVCX*N{_7 z@Oi|eCHNh~6no0|H*@eG5WDe>u8{;6kx{Vi#8v?ke1g3j>oJ0j@J+-NbH=RU{~!+* zEkFiVm?G>i;x7rSMLdl#gYl6M@@Li;P|@+X^0;Wf3NgiJ*f$|2o`}YK5i5B_4*&_; z=HTzo!OzdZ@j3YCbMQ-u!J=&l%I^eWe-V!mR*U#--dIn@5_eEj{QKrQW;l6=r&>i$b z*WkWZ`fQ}n2F(DyB9lml29gj=paaR9AEYbuI8r)N10=_Q*3%~+WD9W~PyXwsuP?uG{kT6Kl4xsC z({=}LE9B|_vq4N}=yZ^sTgM9i?*3HgsWbeLy{SYl(C!Kf=sKfkGwoWk!Mh*uI2Wk9 n9AIMsY5oY-VJo$J1MS|S$#gQhfpsUpdWb=#BpWGCVKz literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script new file mode 100644 index 0000000..ddda3c2 --- /dev/null +++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script @@ -0,0 +1,78 @@ + +MEMORY +{ + program (RX) : ORIGIN = 0, LENGTH = 0x0020000 + data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 +} +ENTRY(__start) +SECTIONS +{ + . = 0x0; + .text : { + /* the __start symbol has to come at address zero. 
*/ + *crt0.o(.text*) + _ctors_start = .; + _init_array_start = .; + KEEP(SORT(*.init_array)) + _ctors_end = .; + _init_array_end = .; + _dtors_start = .; + _dtors_end = .; + *(.text*) + } > program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +. = 0x40400; +buf5 = .; +. += 0x400; +. = 0x44000; +buf4 = .; +. += 0x400; +. = 0x48000; +buf3 = .; +. += 0x400; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +. = 0x60400; +buf11 = .; +. += 0x400; +. = 0x64000; +buf10 = .; +. += 0x400; +. = 0x68000; +buf9 = .; +. += 0x400; +. = 0x70400; +buf8 = .; +. += 0x400; +. = 0x74000; +buf7 = .; +. += 0x400; +. = 0x78000; +buf6 = .; +. += 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_4); diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll new file mode 100644 index 0000000..678847a --- /dev/null +++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +; Function Attrs: noinline +define float 
@__aie2p_scalar_fdiv(float %0, float %1) #0 { + %3 = call float @llvm.aie2p.inv(float %1) + %4 = fmul float %0, %3 + ret float %4 +} + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_4() { + br label %1 + +1: ; preds = %76, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %75, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %76 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf8, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = getelementptr float, ptr @buf7, i32 %3 + %9 = load <16 x float>, ptr %8, align 4 + %10 = extractelement <16 x float> %7, i64 0 + %11 = extractelement <16 x float> %9, i64 0 + %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) + %13 = insertelement <16 x float> poison, float %12, i64 0 + %14 = extractelement <16 x float> %7, i64 1 + %15 = extractelement <16 x float> %9, i64 1 + %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) + %17 = insertelement <16 x float> %13, float %16, i64 1 + %18 = extractelement <16 x float> %7, i64 2 + %19 = extractelement <16 x float> %9, i64 2 + %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) + %21 = insertelement <16 x float> %17, float %20, i64 2 + %22 = extractelement <16 x float> %7, i64 3 + %23 = extractelement <16 x float> %9, i64 3 + %24 = call 
float @__aie2p_scalar_fdiv(float %22, float %23) + %25 = insertelement <16 x float> %21, float %24, i64 3 + %26 = extractelement <16 x float> %7, i64 4 + %27 = extractelement <16 x float> %9, i64 4 + %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) + %29 = insertelement <16 x float> %25, float %28, i64 4 + %30 = extractelement <16 x float> %7, i64 5 + %31 = extractelement <16 x float> %9, i64 5 + %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) + %33 = insertelement <16 x float> %29, float %32, i64 5 + %34 = extractelement <16 x float> %7, i64 6 + %35 = extractelement <16 x float> %9, i64 6 + %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) + %37 = insertelement <16 x float> %33, float %36, i64 6 + %38 = extractelement <16 x float> %7, i64 7 + %39 = extractelement <16 x float> %9, i64 7 + %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) + %41 = insertelement <16 x float> %37, float %40, i64 7 + %42 = extractelement <16 x float> %7, i64 8 + %43 = extractelement <16 x float> %9, i64 8 + %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) + %45 = insertelement <16 x float> %41, float %44, i64 8 + %46 = extractelement <16 x float> %7, i64 9 + %47 = extractelement <16 x float> %9, i64 9 + %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) + %49 = insertelement <16 x float> %45, float %48, i64 9 + %50 = extractelement <16 x float> %7, i64 10 + %51 = extractelement <16 x float> %9, i64 10 + %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) + %53 = insertelement <16 x float> %49, float %52, i64 10 + %54 = extractelement <16 x float> %7, i64 11 + %55 = extractelement <16 x float> %9, i64 11 + %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) + %57 = insertelement <16 x float> %53, float %56, i64 11 + %58 = extractelement <16 x float> %7, i64 12 + %59 = extractelement <16 x float> %9, i64 12 + %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) + %61 = insertelement <16 x float> %57, 
float %60, i64 12 + %62 = extractelement <16 x float> %7, i64 13 + %63 = extractelement <16 x float> %9, i64 13 + %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) + %65 = insertelement <16 x float> %61, float %64, i64 13 + %66 = extractelement <16 x float> %7, i64 14 + %67 = extractelement <16 x float> %9, i64 14 + %68 = call float @__aie2p_scalar_fdiv(float %66, float %67) + %69 = insertelement <16 x float> %65, float %68, i64 14 + %70 = extractelement <16 x float> %7, i64 15 + %71 = extractelement <16 x float> %9, i64 15 + %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) + %73 = insertelement <16 x float> %69, float %72, i64 15 + %74 = getelementptr float, ptr @buf6, i32 %3 + store <16 x float> %73, ptr %74, align 4 + %75 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +76: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare float @llvm.aie2p.inv(float) + +attributes #0 = { noinline } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.o new file mode 100644 index 0000000000000000000000000000000000000000..75208a26da516d5325930c6a9cff6520d51ab84d GIT binary patch literal 2048 zcma)7Z)h837=JF8G*@uDGm`R}tZ=KkRGXe{S8H($lQMN5x)5X=#edhdOSPEUk|yOu z(7JTaF~_>0a3Vs#4ZlbNGD1J~!w``n6{8H?hblq+pwPjD2%GVF?%r!(ifj77`#yhu zzx&<$+`aeSd(VwNFG&)3CV?Vk0CpRIjNqU{=!SOK;!j)}fA6``f z>dEEu8RZuT#_45A2R+ze0Gd|(0V1EgRP5t4DU9-dAAl_4X@#rLupjwh6)lw;9hi|mdTic!9 zS=2i#N9}M8`SBgrcN6j6*6TZmdgn^0cMJK(4tnc|KiOLEJnEgNe(T8J>G}PE_%5E` zg;Jz?LB{p{iTrrwc>4d}M?+tVP`@X~TD*^oaicr?T=cq{(V*~*=fbTXSG!5k-ZHo_G!e5eG#>;uI#gZ{}z#-JL}(F>ltG! 
z)rbCy47BMVHtMPJXEIxq67F4bW?I7eelYyO*Or730N>XPzkH^Q^RI^~%NQ;4>zD$( zzA40mA)(-h+s%(KPjA)&j>4>vKfq(+gz)5fXA^S|jy%s-@lXmu0LHfAuWrjHw&j%i z0&dYj*TcWaVT^kfw|<1;b~mQ=N`5ykljHGkLfttMd3{LQ;-@k8_#X)MWug9)P~Q;h z2Yg#P35-3Rd7-`})V~($>q5PN&q<;K9BmPOFYotDsuPdQ$Y*mIqmZ=kTqUNo#>q_WG3H)(b1E~j!b8A*~!AOnN!7F7L2()OS96gj*-hwaDz z)yKkh^hn8c926t$Sb}@cX*?i&JCF9FXC99@^vV+)*uaGLE^Q%N4`Gz&aEN+KnA6_% v%+q@v3!>g#OccD3^k}, ptr %5, align 64 + %7 = getelementptr float, ptr @buf7, i20 %4 + %8 = load <16 x float>, ptr %7, align 64 + %9 = extractelement <16 x float> %6, i64 0 + %10 = extractelement <16 x float> %8, i64 0 + %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10) + %12 = insertelement <16 x float> poison, float %11, i64 0 + %13 = extractelement <16 x float> %6, i64 1 + %14 = extractelement <16 x float> %8, i64 1 + %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14) + %16 = insertelement <16 x float> %12, float %15, i64 1 + %17 = extractelement <16 x float> %6, i64 2 + %18 = extractelement <16 x float> %8, i64 2 + %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18) + %20 = insertelement <16 x float> %16, float %19, i64 2 + %21 = extractelement <16 x float> %6, i64 3 + %22 = extractelement <16 x float> %8, i64 3 + %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22) + %24 = insertelement <16 x float> %20, float %23, i64 3 + %25 = extractelement <16 x float> %6, i64 4 + %26 = extractelement <16 x float> %8, i64 4 + %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26) + %28 = insertelement <16 x float> %24, float %27, i64 4 + %29 = extractelement <16 x float> %6, i64 5 + %30 = extractelement <16 x float> %8, i64 5 + %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30) + %32 = insertelement <16 x float> %28, float %31, i64 5 + %33 = extractelement <16 x float> %6, i64 6 + %34 = extractelement <16 x float> %8, i64 6 + %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34) + %36 = insertelement <16 x float> %32, 
float %35, i64 6 + %37 = extractelement <16 x float> %6, i64 7 + %38 = extractelement <16 x float> %8, i64 7 + %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38) + %40 = insertelement <16 x float> %36, float %39, i64 7 + %41 = extractelement <16 x float> %6, i64 8 + %42 = extractelement <16 x float> %8, i64 8 + %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42) + %44 = insertelement <16 x float> %40, float %43, i64 8 + %45 = extractelement <16 x float> %6, i64 9 + %46 = extractelement <16 x float> %8, i64 9 + %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46) + %48 = insertelement <16 x float> %44, float %47, i64 9 + %49 = extractelement <16 x float> %6, i64 10 + %50 = extractelement <16 x float> %8, i64 10 + %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50) + %52 = insertelement <16 x float> %48, float %51, i64 10 + %53 = extractelement <16 x float> %6, i64 11 + %54 = extractelement <16 x float> %8, i64 11 + %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54) + %56 = insertelement <16 x float> %52, float %55, i64 11 + %57 = extractelement <16 x float> %6, i64 12 + %58 = extractelement <16 x float> %8, i64 12 + %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58) + %60 = insertelement <16 x float> %56, float %59, i64 12 + %61 = extractelement <16 x float> %6, i64 13 + %62 = extractelement <16 x float> %8, i64 13 + %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62) + %64 = insertelement <16 x float> %60, float %63, i64 13 + %65 = extractelement <16 x float> %6, i64 14 + %66 = extractelement <16 x float> %8, i64 14 + %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66) + %68 = insertelement <16 x float> %64, float %67, i64 14 + %69 = extractelement <16 x float> %6, i64 15 + %70 = extractelement <16 x float> %8, i64 15 + %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70) + %72 = insertelement <16 x float> %68, float %71, i64 15 + %73 = 
getelementptr float, ptr @buf6, i20 %4 + store <16 x float> %72, ptr %73, align 64 + %74 = add nuw nsw i32 %3, 16 + %75 = icmp ult i32 %3, 240 + br i1 %75, label %2, label %76, !llvm.loop !1 + +76: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 53, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: nofree nosync nounwind memory(none) +declare float @llvm.aie2p.inv(float) #3 + +attributes #0 = { nofree noinline nosync nounwind memory(none) } +attributes #1 = { nounwind } +attributes #2 = { noreturn nounwind } +attributes #3 = { nofree nosync nounwind memory(none) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll new file mode 100644 index 0000000..9a0f789 --- /dev/null +++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +; Function Attrs: noinline +define float @__aie2p_scalar_fdiv(float %0, float %1) #0 { + %3 = call float @llvm.aie2p.inv(float %1) + %4 = fmul float %0, %3 + ret float %4 
+} + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_4() { + br label %1 + +1: ; preds = %76, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %75, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %76 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf8, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = getelementptr float, ptr @buf7, i32 %3 + %9 = load <16 x float>, ptr %8 + %10 = extractelement <16 x float> %7, i64 0 + %11 = extractelement <16 x float> %9, i64 0 + %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) + %13 = insertelement <16 x float> poison, float %12, i64 0 + %14 = extractelement <16 x float> %7, i64 1 + %15 = extractelement <16 x float> %9, i64 1 + %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) + %17 = insertelement <16 x float> %13, float %16, i64 1 + %18 = extractelement <16 x float> %7, i64 2 + %19 = extractelement <16 x float> %9, i64 2 + %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) + %21 = insertelement <16 x float> %17, float %20, i64 2 + %22 = extractelement <16 x float> %7, i64 3 + %23 = extractelement <16 x float> %9, i64 3 + %24 = call float @__aie2p_scalar_fdiv(float %22, float %23) + %25 = insertelement <16 x float> %21, float %24, i64 3 + %26 = extractelement <16 x float> %7, i64 
4 + %27 = extractelement <16 x float> %9, i64 4 + %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) + %29 = insertelement <16 x float> %25, float %28, i64 4 + %30 = extractelement <16 x float> %7, i64 5 + %31 = extractelement <16 x float> %9, i64 5 + %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) + %33 = insertelement <16 x float> %29, float %32, i64 5 + %34 = extractelement <16 x float> %7, i64 6 + %35 = extractelement <16 x float> %9, i64 6 + %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) + %37 = insertelement <16 x float> %33, float %36, i64 6 + %38 = extractelement <16 x float> %7, i64 7 + %39 = extractelement <16 x float> %9, i64 7 + %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) + %41 = insertelement <16 x float> %37, float %40, i64 7 + %42 = extractelement <16 x float> %7, i64 8 + %43 = extractelement <16 x float> %9, i64 8 + %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) + %45 = insertelement <16 x float> %41, float %44, i64 8 + %46 = extractelement <16 x float> %7, i64 9 + %47 = extractelement <16 x float> %9, i64 9 + %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) + %49 = insertelement <16 x float> %45, float %48, i64 9 + %50 = extractelement <16 x float> %7, i64 10 + %51 = extractelement <16 x float> %9, i64 10 + %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) + %53 = insertelement <16 x float> %49, float %52, i64 10 + %54 = extractelement <16 x float> %7, i64 11 + %55 = extractelement <16 x float> %9, i64 11 + %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) + %57 = insertelement <16 x float> %53, float %56, i64 11 + %58 = extractelement <16 x float> %7, i64 12 + %59 = extractelement <16 x float> %9, i64 12 + %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) + %61 = insertelement <16 x float> %57, float %60, i64 12 + %62 = extractelement <16 x float> %7, i64 13 + %63 = extractelement <16 x float> %9, i64 13 + %64 = call float 
@__aie2p_scalar_fdiv(float %62, float %63) + %65 = insertelement <16 x float> %61, float %64, i64 13 + %66 = extractelement <16 x float> %7, i64 14 + %67 = extractelement <16 x float> %9, i64 14 + %68 = call float @__aie2p_scalar_fdiv(float %66, float %67) + %69 = insertelement <16 x float> %65, float %68, i64 14 + %70 = extractelement <16 x float> %7, i64 15 + %71 = extractelement <16 x float> %9, i64 15 + %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) + %73 = insertelement <16 x float> %69, float %72, i64 15 + %74 = getelementptr float, ptr @buf6, i32 %3 + store <16 x float> %73, ptr %74 + %75 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +76: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare float @llvm.aie2p.inv(float) + +attributes #0 = { noinline } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.elf new file mode 100755 index 0000000000000000000000000000000000000000..8162c282e6181e1320623e660451ca081a79c550 GIT binary patch literal 4132 zcma)9eQZJArh@=bU@)Ip==(`s7#rFG!LE2ARN?Lk%e0UzS2wF zR#}D&61VNuITy1On(w=wRqwh$D8Mp`%K2}gnK!^!G+WFaqZ30ZyXktWKuVspXx=_JOfORn#-q0 zN6SU&du#Z+{&@agLVqurD@5rW+PhYX?-uH-*N*Qr`a3;}{yswcnpOOzQD3{Zzq9D? 
zEaiI#?N_VcKT!V|^*c9OK7G!N@%DC6an?~(R3*5iV?V)_E+TTsR* z-}a6#^2aJD-*0uV;qR5vis@I(<)WY*{W&L(4t$|MjBn@UcM>!DekuBlUwlvI7q4TR z!*&bXN7&NxT$@R`w20rzqI+{~mD?`eOH^&CNK|M5`OL<|-`_>epEuq4oY?`sYco}Lc<^l;k*a$^7;G?Lpumt%emY$j0j^Dsr85u%%WqVNSK3RQMg`R!!m zXGaxpawa3nRJLF#1@-VvSxLL3SFNpS;rBas2Z8+(HYA_R?7q14pT*6Plo#=|J5A}$ z$u~boZl6hk=zGN`Y;FWX@%@s?zPjoR(;HDF_SJD4a$owD(rkWaKEezr$u2I3)$mrI~ibI&3!52V>c{G0 zOsz0s230gz9b(rtwDpbS4G$c?1ftOA5CxdyTPe2M{9Fb`_1{)((Q>HpQx46>oHX-l z<3)j*9Y5=|Wyj;#q#18ujVS-rNz5T_wi&WD{FS~~bm@>!0kB%c@%6YDe7DTFiW+J8 za_5BX`ssPe3PMsHV$aJ=?7<2Yjxj#J-yyco;*Ldp7^{`73fQPB8tL`GjAyr6^!S#{n77fmK0E) z%kUdlWbRGNeDWQPb<3l}(}VAJZAZ9mw522mjd2gp?{4 zoAx_+)l;kPw~SKCF<(FOeVw;!=S z{%GIv$lz|p?{8H+-Wqoep6^gkuv3xVu!0IF*x0P=S0p4{y!!^m@9xL@%cfdZ1=W!uKrNGEd)% zIF+l81P2hia}QVvj*vbN2M{ZH_T7kCo}QLzAP=8F+>vL03UM+IpF@mKU7aL&6*)Bz zUq(EUhks_6Vo&-0Y6bosVmH3gHIm>8G77eX*or`c53zS+J;t#SzKxh-PMJ0QFXX|Z z1<1e(cM1EJ@&5>`%XkrC3gaUmN19!cq9=A+UC8HK4fAidgH-P?7L&} zL5<7x!O-zQEc*RO48ZR{a-cOD?1_Zp2l~T9JrSt!@7?POc=WywdWU*qUG+7gtiT<3 zM(^wOzDe)v41ve3OFV|D$FT7lnmU6qBz1{Zu)vP6ZDHrEFK(; zLm-Aehzkq{dxj!`)&m4X$05)gjP?bhebG1&2Z2z$e=uf@B6%PqomE(PTOq?9+&eS9Q4+ M|5ekRhTiD^55T1-w*UYD literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script new file mode 100644 index 0000000..51c13db --- /dev/null +++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script @@ -0,0 +1,72 @@ + +MEMORY +{ + program (RX) : ORIGIN = 0, LENGTH = 0x0020000 + data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 +} +ENTRY(__start) +SECTIONS +{ + . = 0x0; + .text : { + /* the __start symbol has to come at address zero. 
*/ + *crt0.o(.text*) + _ctors_start = .; + _init_array_start = .; + KEEP(SORT(*.init_array)) + _ctors_end = .; + _init_array_end = .; + _dtors_start = .; + _dtors_end = .; + *(.text*) + } > program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +. = 0x40400; +buf8 = .; +. += 0x400; +. = 0x44000; +buf7 = .; +. += 0x400; +. = 0x48000; +buf6 = .; +. += 0x400; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +/* No tile with memory exists to the north. */ +. = 0x60000; +. += 0x10000; +. = 0x70400; +buf11 = .; +. += 0x400; +. = 0x74000; +buf10 = .; +. += 0x400; +. = 0x78000; +buf9 = .; +. += 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_5); diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll new file mode 100644 index 0000000..e652b65 --- /dev/null +++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +; Function Attrs: noinline +define float @__aie2p_scalar_fdiv(float %0, float %1) 
#0 { + %3 = call float @llvm.aie2p.inv(float %1) + %4 = fmul float %0, %3 + ret float %4 +} + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_5() { + br label %1 + +1: ; preds = %76, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %75, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %76 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf11, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = getelementptr float, ptr @buf10, i32 %3 + %9 = load <16 x float>, ptr %8, align 4 + %10 = extractelement <16 x float> %7, i64 0 + %11 = extractelement <16 x float> %9, i64 0 + %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) + %13 = insertelement <16 x float> poison, float %12, i64 0 + %14 = extractelement <16 x float> %7, i64 1 + %15 = extractelement <16 x float> %9, i64 1 + %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) + %17 = insertelement <16 x float> %13, float %16, i64 1 + %18 = extractelement <16 x float> %7, i64 2 + %19 = extractelement <16 x float> %9, i64 2 + %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) + %21 = insertelement <16 x float> %17, float %20, i64 2 + %22 = extractelement <16 x float> %7, i64 3 + %23 = extractelement <16 x float> %9, i64 3 + %24 = call float @__aie2p_scalar_fdiv(float %22, 
float %23) + %25 = insertelement <16 x float> %21, float %24, i64 3 + %26 = extractelement <16 x float> %7, i64 4 + %27 = extractelement <16 x float> %9, i64 4 + %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) + %29 = insertelement <16 x float> %25, float %28, i64 4 + %30 = extractelement <16 x float> %7, i64 5 + %31 = extractelement <16 x float> %9, i64 5 + %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) + %33 = insertelement <16 x float> %29, float %32, i64 5 + %34 = extractelement <16 x float> %7, i64 6 + %35 = extractelement <16 x float> %9, i64 6 + %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) + %37 = insertelement <16 x float> %33, float %36, i64 6 + %38 = extractelement <16 x float> %7, i64 7 + %39 = extractelement <16 x float> %9, i64 7 + %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) + %41 = insertelement <16 x float> %37, float %40, i64 7 + %42 = extractelement <16 x float> %7, i64 8 + %43 = extractelement <16 x float> %9, i64 8 + %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) + %45 = insertelement <16 x float> %41, float %44, i64 8 + %46 = extractelement <16 x float> %7, i64 9 + %47 = extractelement <16 x float> %9, i64 9 + %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) + %49 = insertelement <16 x float> %45, float %48, i64 9 + %50 = extractelement <16 x float> %7, i64 10 + %51 = extractelement <16 x float> %9, i64 10 + %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) + %53 = insertelement <16 x float> %49, float %52, i64 10 + %54 = extractelement <16 x float> %7, i64 11 + %55 = extractelement <16 x float> %9, i64 11 + %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) + %57 = insertelement <16 x float> %53, float %56, i64 11 + %58 = extractelement <16 x float> %7, i64 12 + %59 = extractelement <16 x float> %9, i64 12 + %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) + %61 = insertelement <16 x float> %57, float %60, i64 12 + %62 = extractelement 
<16 x float> %7, i64 13 + %63 = extractelement <16 x float> %9, i64 13 + %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) + %65 = insertelement <16 x float> %61, float %64, i64 13 + %66 = extractelement <16 x float> %7, i64 14 + %67 = extractelement <16 x float> %9, i64 14 + %68 = call float @__aie2p_scalar_fdiv(float %66, float %67) + %69 = insertelement <16 x float> %65, float %68, i64 14 + %70 = extractelement <16 x float> %7, i64 15 + %71 = extractelement <16 x float> %9, i64 15 + %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) + %73 = insertelement <16 x float> %69, float %72, i64 15 + %74 = getelementptr float, ptr @buf9, i32 %3 + store <16 x float> %73, ptr %74, align 4 + %75 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +76: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare float @llvm.aie2p.inv(float) + +attributes #0 = { noinline } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.o new file mode 100644 index 0000000000000000000000000000000000000000..78cc76657161c0618a24de8080e832261028cff6 GIT binary patch literal 2052 zcma)7VQ5=b6h1F6Xy+ihh!hDn*aA6*FIMsezE+NG?R+1e)M z=?|?<=bUq_8z@hl(0{T&l7NiRU;QB>GNfWA1NWm!P=6?NFd@QbJmUd)n7nRKPyG ztOC@Mi={c`7pIKVNl6DiSZ4s52LUkX7B)h;}$+8>A2XC8(gB$xVsozb8q!|cr+!6abZgYn<3L7x)G@@j)A9c>t( zF=c>9am~L&{x^PT_747z4q9EGW$Y_B$o9jtF4yn0Y8RGO-^y}8D$Ui=C#B)G2HSU6 zttFoBKeIY{qo)~KKJ}%19%0=uq(56wpL&Fc!)1uVx%cq;@ zT`EQ^m*j{Yh@f6)`IX5B>f!#LD8FVc@$(Y;hh4rAWS3WPRB_zGaT~{Ku$uDeSMK55 z5xQSZbv%CMz7^XYwPL-ytXQ($iv7TsIX3h!wjQwZ2%|W&QpgR(@`;eSf`c zh%Hqf`76@bqJL1YrAnX4Y+g#Zzlt-{63+L7;g7zSB!mF?xn}s~Gi8{6Jxp1|Xr8ZQ z3UGW~hzI=w;D!0sODO~a_;6EwVN*V| 
zDW}vIa6toI7ylv;V%)8`_z{X*-S{JPlHZL>fIC+r$NQxXei~zs|Gt1P3iwX~ zz9!&%eH%Inj6Izh0bdaCZv=c*!1MT=B)Y)S=FxZae!nD~cx1*@HkUE-Su1atW@bFw zH)&3fW^$P+bL`ak3^)}q-k8W|jTchKyQlM+(br*c@Winr&;T z&y1jd>kRpk!r0TI_UOb^*6cA4fN7p7, ptr %5, align 64 + %7 = getelementptr float, ptr @buf10, i20 %4 + %8 = load <16 x float>, ptr %7, align 64 + %9 = extractelement <16 x float> %6, i64 0 + %10 = extractelement <16 x float> %8, i64 0 + %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10) + %12 = insertelement <16 x float> poison, float %11, i64 0 + %13 = extractelement <16 x float> %6, i64 1 + %14 = extractelement <16 x float> %8, i64 1 + %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14) + %16 = insertelement <16 x float> %12, float %15, i64 1 + %17 = extractelement <16 x float> %6, i64 2 + %18 = extractelement <16 x float> %8, i64 2 + %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18) + %20 = insertelement <16 x float> %16, float %19, i64 2 + %21 = extractelement <16 x float> %6, i64 3 + %22 = extractelement <16 x float> %8, i64 3 + %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22) + %24 = insertelement <16 x float> %20, float %23, i64 3 + %25 = extractelement <16 x float> %6, i64 4 + %26 = extractelement <16 x float> %8, i64 4 + %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26) + %28 = insertelement <16 x float> %24, float %27, i64 4 + %29 = extractelement <16 x float> %6, i64 5 + %30 = extractelement <16 x float> %8, i64 5 + %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30) + %32 = insertelement <16 x float> %28, float %31, i64 5 + %33 = extractelement <16 x float> %6, i64 6 + %34 = extractelement <16 x float> %8, i64 6 + %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34) + %36 = insertelement <16 x float> %32, float %35, i64 6 + %37 = extractelement <16 x float> %6, i64 7 + %38 = extractelement <16 x float> %8, i64 7 + %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38) + %40 = 
insertelement <16 x float> %36, float %39, i64 7 + %41 = extractelement <16 x float> %6, i64 8 + %42 = extractelement <16 x float> %8, i64 8 + %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42) + %44 = insertelement <16 x float> %40, float %43, i64 8 + %45 = extractelement <16 x float> %6, i64 9 + %46 = extractelement <16 x float> %8, i64 9 + %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46) + %48 = insertelement <16 x float> %44, float %47, i64 9 + %49 = extractelement <16 x float> %6, i64 10 + %50 = extractelement <16 x float> %8, i64 10 + %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50) + %52 = insertelement <16 x float> %48, float %51, i64 10 + %53 = extractelement <16 x float> %6, i64 11 + %54 = extractelement <16 x float> %8, i64 11 + %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54) + %56 = insertelement <16 x float> %52, float %55, i64 11 + %57 = extractelement <16 x float> %6, i64 12 + %58 = extractelement <16 x float> %8, i64 12 + %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58) + %60 = insertelement <16 x float> %56, float %59, i64 12 + %61 = extractelement <16 x float> %6, i64 13 + %62 = extractelement <16 x float> %8, i64 13 + %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62) + %64 = insertelement <16 x float> %60, float %63, i64 13 + %65 = extractelement <16 x float> %6, i64 14 + %66 = extractelement <16 x float> %8, i64 14 + %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66) + %68 = insertelement <16 x float> %64, float %67, i64 14 + %69 = extractelement <16 x float> %6, i64 15 + %70 = extractelement <16 x float> %8, i64 15 + %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70) + %72 = insertelement <16 x float> %68, float %71, i64 15 + %73 = getelementptr float, ptr @buf9, i20 %4 + store <16 x float> %72, ptr %73, align 64 + %74 = add nuw nsw i32 %3, 16 + %75 = icmp ult i32 %3, 240 + br i1 %75, label %2, label %76, !llvm.loop !1 + 
+76: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 53, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: nofree nosync nounwind memory(none) +declare float @llvm.aie2p.inv(float) #3 + +attributes #0 = { nofree noinline nosync nounwind memory(none) } +attributes #1 = { nounwind } +attributes #2 = { noreturn nounwind } +attributes #3 = { nofree nosync nounwind memory(none) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll new file mode 100644 index 0000000..5ef9373 --- /dev/null +++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll @@ -0,0 +1,158 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +; Function Attrs: noinline +define float @__aie2p_scalar_fdiv(float %0, float %1) #0 { + %3 = call float @llvm.aie2p.inv(float %1) + %4 = fmul float %0, %3 + ret float %4 +} + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare 
{ i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_5() { + br label %1 + +1: ; preds = %76, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %75, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %76 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf11, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = getelementptr float, ptr @buf10, i32 %3 + %9 = load <16 x float>, ptr %8 + %10 = extractelement <16 x float> %7, i64 0 + %11 = extractelement <16 x float> %9, i64 0 + %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) + %13 = insertelement <16 x float> poison, float %12, i64 0 + %14 = extractelement <16 x float> %7, i64 1 + %15 = extractelement <16 x float> %9, i64 1 + %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) + %17 = insertelement <16 x float> %13, float %16, i64 1 + %18 = extractelement <16 x float> %7, i64 2 + %19 = extractelement <16 x float> %9, i64 2 + %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) + %21 = insertelement <16 x float> %17, float %20, i64 2 + %22 = extractelement <16 x float> %7, i64 3 + %23 = extractelement <16 x float> %9, i64 3 + %24 = call float @__aie2p_scalar_fdiv(float %22, float %23) + %25 = insertelement <16 x float> %21, float %24, i64 3 + %26 = extractelement <16 x float> %7, i64 4 + %27 = extractelement <16 x float> %9, i64 4 + %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) + %29 = insertelement <16 x float> %25, float %28, i64 4 + %30 = extractelement 
<16 x float> %7, i64 5 + %31 = extractelement <16 x float> %9, i64 5 + %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) + %33 = insertelement <16 x float> %29, float %32, i64 5 + %34 = extractelement <16 x float> %7, i64 6 + %35 = extractelement <16 x float> %9, i64 6 + %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) + %37 = insertelement <16 x float> %33, float %36, i64 6 + %38 = extractelement <16 x float> %7, i64 7 + %39 = extractelement <16 x float> %9, i64 7 + %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) + %41 = insertelement <16 x float> %37, float %40, i64 7 + %42 = extractelement <16 x float> %7, i64 8 + %43 = extractelement <16 x float> %9, i64 8 + %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) + %45 = insertelement <16 x float> %41, float %44, i64 8 + %46 = extractelement <16 x float> %7, i64 9 + %47 = extractelement <16 x float> %9, i64 9 + %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) + %49 = insertelement <16 x float> %45, float %48, i64 9 + %50 = extractelement <16 x float> %7, i64 10 + %51 = extractelement <16 x float> %9, i64 10 + %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) + %53 = insertelement <16 x float> %49, float %52, i64 10 + %54 = extractelement <16 x float> %7, i64 11 + %55 = extractelement <16 x float> %9, i64 11 + %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) + %57 = insertelement <16 x float> %53, float %56, i64 11 + %58 = extractelement <16 x float> %7, i64 12 + %59 = extractelement <16 x float> %9, i64 12 + %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) + %61 = insertelement <16 x float> %57, float %60, i64 12 + %62 = extractelement <16 x float> %7, i64 13 + %63 = extractelement <16 x float> %9, i64 13 + %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) + %65 = insertelement <16 x float> %61, float %64, i64 13 + %66 = extractelement <16 x float> %7, i64 14 + %67 = extractelement <16 x float> %9, i64 14 + %68 = call 
float @__aie2p_scalar_fdiv(float %66, float %67) + %69 = insertelement <16 x float> %65, float %68, i64 14 + %70 = extractelement <16 x float> %7, i64 15 + %71 = extractelement <16 x float> %9, i64 15 + %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) + %73 = insertelement <16 x float> %69, float %72, i64 15 + %74 = getelementptr float, ptr @buf9, i32 %3 + store <16 x float> %73, ptr %74 + %75 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +76: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare float @llvm.aie2p.inv(float) + +attributes #0 = { noinline } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_design.bif b/examples/elementwise_arith/air_project/div_kernel_0_design.bif new file mode 100644 index 0000000..11c5e21 --- /dev/null +++ b/examples/elementwise_arith/air_project/div_kernel_0_design.bif @@ -0,0 +1,10 @@ +all: +{ + id_code = 0x14ca8093 + extended_id_code = 0x01 + image + { + name=aie_image, id=0x1c000000 + { type=cdo file=air_project/div_kernel_0_aie_cdo_elfs.bin file=air_project/div_kernel_0_aie_cdo_init.bin file=air_project/div_kernel_0_aie_cdo_enable.bin } + } +} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_div_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/div_kernel_0_div_kernel_0_sequence.bin new file mode 100644 index 0000000000000000000000000000000000000000..f2eb383b7eb903e8ad8809b7d44fb0a4d660bfb6 GIT binary patch literal 3248 zcmcJQ-Aw~A5QNuneg4w5;eLtEZyo^(@j}krmd|WcZ&8OQJf_lyu+9e|Yukmso`>#0IXQ!}T>04{- zdWRz5+iJIiZ@khs7OO7qzIV<49O}M3Vd35Et*zr4+n!k}H6?s5ZHHP2b8iNd9f!uq z2%ELeWFG=0l9+H#Mw;SV&vYiNQGA=m?@TyH^<^fmv5ZNwd1Or5@6ws1SwdvOIT;Fy 
zFEe2cjS1(dzRbimmN6++jgb*5y%`xL$~=|+C!AAeiZ3%^4UGxssJ_g^HI^|MPc*|& zYb`xk4J-t?`*yLPCTa8!jZ|yI(P^-T#)NY+)Rg;`nXrb&gmYA1X5t#lnB*HUVGWH5 W=cv75Caj?`;T+YMnYhL+nEU|x-i@&U literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/empty_0.pdi b/examples/elementwise_arith/air_project/empty_0.pdi new file mode 100644 index 0000000000000000000000000000000000000000..a2347424a644d017f5e8ac814673b9061a6becd0 GIT binary patch literal 368 zcmcc1z`)>QtR&b`o)&s$_o|r;3@i+cKrs~{2GJ%!3<8rIPKiKR5Yp2nAV5Jtfx!S- z5XuJWBL`Gy9h?QS4_SX=W@>z9Zen^W!ax}W6O}bv!%~(5CLQVqjndvbXQt@Lv$f001PR1P=fJ literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/empty_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/empty_0_aie_cdo_enable.bin new file mode 100644 index 0000000000000000000000000000000000000000..cba6b8778c42200ab6ec35c68cb3586f8fb4e055 GIT binary patch literal 24 dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/empty_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/empty_0_aie_cdo_init.bin new file mode 100644 index 0000000000000000000000000000000000000000..cba6b8778c42200ab6ec35c68cb3586f8fb4e055 GIT binary patch literal 24 dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/empty_0_design.bif b/examples/elementwise_arith/air_project/empty_0_design.bif new file mode 100644 index 0000000..b22ae3c --- /dev/null +++ b/examples/elementwise_arith/air_project/empty_0_design.bif @@ -0,0 +1,10 @@ +all: +{ + id_code = 0x14ca8093 + extended_id_code = 0x01 + image + { + name=aie_image, id=0x1c000000 + { type=cdo file=air_project/empty_0_aie_cdo_elfs.bin file=air_project/empty_0_aie_cdo_init.bin file=air_project/empty_0_aie_cdo_enable.bin } + } +} diff --git a/examples/elementwise_arith/air_project/full_elf_config.json b/examples/elementwise_arith/air_project/full_elf_config.json new file mode 100644 index 
0000000..eab4fdb --- /dev/null +++ b/examples/elementwise_arith/air_project/full_elf_config.json @@ -0,0 +1,134 @@ +{ + "xrt-kernels": [ + { + "PDIs": [ + { + "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/empty_0.pdi", + "id": 1 + }, + { + "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/square_kernel_0.pdi", + "id": 2 + }, + { + "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/main.pdi", + "id": 3 + } + ], + "arguments": [ + { + "name": "arg_0", + "offset": "0x0", + "type": "char *" + }, + { + "name": "arg_1", + "offset": "0x8", + "type": "char *" + }, + { + "name": "arg_2", + "offset": "0x10", + "type": "char *" + }, + { + "name": "arg_3", + "offset": "0x18", + "type": "char *" + }, + { + "name": "arg_4", + "offset": "0x20", + "type": "char *" + }, + { + "name": "arg_5", + "offset": "0x28", + "type": "char *" + }, + { + "name": "arg_6", + "offset": "0x30", + "type": "char *" + }, + { + "name": "arg_7", + "offset": "0x38", + "type": "char *" + } + ], + "instance": [ + { + "TXN_ctrl_code_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin", + "id": "square_kernel_0_sequence" + } + ], + "name": "square_kernel_0" + }, + { + "PDIs": [ + { + "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/empty_0.pdi", + "id": 1 + }, + { + "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/square_kernel_0.pdi", + "id": 2 + }, + { + "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/main.pdi", + "id": 3 + } + ], + "arguments": [ + { + "name": "arg_0", + "offset": "0x0", + "type": "char *" + }, + { + "name": "arg_1", + "offset": "0x8", + "type": "char *" + }, + { + "name": "arg_2", + "offset": "0x10", + "type": "char *" + }, + { + "name": "arg_3", + "offset": "0x18", + "type": "char *" + }, + { + "name": "arg_4", + "offset": 
"0x20", + "type": "char *" + }, + { + "name": "arg_5", + "offset": "0x28", + "type": "char *" + }, + { + "name": "arg_6", + "offset": "0x30", + "type": "char *" + }, + { + "name": "arg_7", + "offset": "0x38", + "type": "char *" + } + ], + "instance": [ + { + "TXN_ctrl_code_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/main_square_kernel.bin", + "id": "square_kernel" + } + ], + "name": "main" + } + ] +} diff --git a/examples/elementwise_arith/air_project/input_with_addresses.mlir b/examples/elementwise_arith/air_project/input_with_addresses.mlir new file mode 100644 index 0000000..f2c48f0 --- /dev/null +++ b/examples/elementwise_arith/air_project/input_with_addresses.mlir @@ -0,0 +1,328 @@ +#loop_annotation = #llvm.loop_annotation +module { + aie.device(npu2) @square_kernel_0 { + %shim_noc_tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info} + %shim_noc_tile_1_0 = aie.tile(1, 0) {controller_id = #aie.packet_info} + %mem_tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info} + %mem_tile_1_1 = aie.tile(1, 1) {controller_id = #aie.packet_info} + %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info} + %tile_0_3 = aie.tile(0, 3) {controller_id = #aie.packet_info} + %tile_0_4 = aie.tile(0, 4) {controller_id = #aie.packet_info} + %tile_0_5 = aie.tile(0, 5) {controller_id = #aie.packet_info} + %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32} + %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32} + %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32} + %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32} + %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32} + %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32} + %lock_0_3_7 = 
aie.lock(%tile_0_3, 0) {init = 0 : i32} + %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32} + %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32} + %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32} + %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32} + %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32} + %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32} + %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32} + %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32} + %buf9 = aie.buffer(%mem_tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf9"} : memref<1024xi16, 1 : i32> + %buf8 = aie.buffer(%mem_tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf8"} : memref<1024xi16, 1> + %buf7 = aie.buffer(%tile_0_5) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf7"} : memref<256xi16, 2> + %buf6 = aie.buffer(%tile_0_5) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf6"} : memref<256xi16, 2> + %buf5 = aie.buffer(%tile_0_4) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<256xi16, 2> + %buf4 = aie.buffer(%tile_0_4) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf4"} : memref<256xi16, 2> + %buf3 = aie.buffer(%tile_0_3) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<256xi16, 2> + %buf2 = aie.buffer(%tile_0_3) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf2"} : memref<256xi16, 2> + %buf1 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<256xi16, 2> + %buf0 = aie.buffer(%tile_0_2) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf0"} : memref<256xi16, 2> + %mem_0_5 = aie.mem(%tile_0_5) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1) + aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_5_12, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: 
^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1) + aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_5_11, Release, 1) + aie.next_bd ^bb4 + } + %core_0_5 = aie.core(%tile_0_5) { + %c0_i32 = arith.constant 0 : i32 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb4 + aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1) + cf.br ^bb2(%c0 : index) + ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 + %1 = arith.cmpi slt, %0, %c256 : index + cf.cond_br %1, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %2 = vector.load %buf7[%0] : memref<256xi16, 2>, vector<32xi16> + %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> + %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> + vector.store %4, %buf6[%0] : memref<256xi16, 2>, vector<32xi16> + %5 = arith.addi %0, %c32 : index + cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} + ^bb4: // pred: ^bb2 + aie.use_lock(%lock_0_5, Release, 1) + aie.use_lock(%lock_0_5_13, Release, 1) + cf.br ^bb1 + } + %mem_0_4 = aie.mem(%tile_0_4) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_4_9, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_4_8, Release, 1) + aie.next_bd ^bb4 + } + %core_0_4 = 
aie.core(%tile_0_4) { + %c0_i32 = arith.constant 0 : i32 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb4 + aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1) + cf.br ^bb2(%c0 : index) + ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 + %1 = arith.cmpi slt, %0, %c256 : index + cf.cond_br %1, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %2 = vector.load %buf5[%0] : memref<256xi16, 2>, vector<32xi16> + %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> + %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> + vector.store %4, %buf4[%0] : memref<256xi16, 2>, vector<32xi16> + %5 = arith.addi %0, %c32 : index + cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} + ^bb4: // pred: ^bb2 + aie.use_lock(%lock_0_4, Release, 1) + aie.use_lock(%lock_0_4_10, Release, 1) + cf.br ^bb1 + } + %mem_0_3 = aie.mem(%tile_0_3) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_3_6, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_3_5, Release, 1) + aie.next_bd ^bb4 + } + %core_0_3 = aie.core(%tile_0_3) { + %c0_i32 = arith.constant 0 : i32 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb4 + aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1) + cf.br ^bb2(%c0 : index) + ^bb2(%0: 
index): // 2 preds: ^bb1, ^bb3 + %1 = arith.cmpi slt, %0, %c256 : index + cf.cond_br %1, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %2 = vector.load %buf3[%0] : memref<256xi16, 2>, vector<32xi16> + %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> + %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> + vector.store %4, %buf2[%0] : memref<256xi16, 2>, vector<32xi16> + %5 = arith.addi %0, %c32 : index + cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} + ^bb4: // pred: ^bb2 + aie.use_lock(%lock_0_3, Release, 1) + aie.use_lock(%lock_0_3_7, Release, 1) + cf.br ^bb1 + } + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) + aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_2_3, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_2_2, Release, 1) + aie.next_bd ^bb4 + } + %core_0_2 = aie.core(%tile_0_2) { + %c0_i32 = arith.constant 0 : i32 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb4 + aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1) + cf.br ^bb2(%c0 : index) + ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 + %1 = arith.cmpi slt, %0, %c256 : index + cf.cond_br %1, ^bb3, ^bb4 + ^bb3: // pred: ^bb2 + %2 = vector.load %buf1[%0] : memref<256xi16, 2>, vector<32xi16> + %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> + %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> + vector.store 
%4, %buf0[%0] : memref<256xi16, 2>, vector<32xi16> + %5 = arith.addi %0, %c32 : index + cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} + ^bb4: // pred: ^bb2 + aie.use_lock(%lock_0_2, Release, 1) + aie.use_lock(%lock_0_2_4, Release, 1) + cf.br ^bb1 + } + aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0) + aie.flow(%mem_tile_1_1, DMA : 0, %shim_noc_tile_1_0, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0) + aie.flow(%tile_0_2, DMA : 0, %mem_tile_1_1, DMA : 0) + aie.flow(%tile_0_3, DMA : 0, %mem_tile_1_1, DMA : 1) + aie.flow(%tile_0_4, DMA : 0, %mem_tile_1_1, DMA : 2) + aie.flow(%tile_0_5, DMA : 0, %mem_tile_1_1, DMA : 3) + %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1, Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + 
aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb10 + } + %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) + 
aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} + aie.use_lock(%lock_0_1_0, Release, 4) + aie.next_bd ^bb10 + } + aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0) + aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0) + aie.runtime_sequence @square_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { + %0 = aiex.dma_configure_task_for @air_channel_0 { + aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [, ]) + aie.end + } + aiex.dma_start_task(%0) + %1 = aiex.dma_configure_task_for @air_channel_3 { + aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [, ]) + aie.end + } {issue_token = true} + aiex.dma_start_task(%1) + aiex.dma_free_task(%0) + aiex.dma_await_task(%1) + } + aie.packet_flow(15) { + aie.packet_source<%shim_noc_tile_0_0, TileControl : 0> + aie.packet_dest<%shim_noc_tile_0_0, South : 0> + } {keep_pkt_header = true, priority_route = true} + aie.packet_flow(15) { + aie.packet_source<%shim_noc_tile_1_0, TileControl : 0> + aie.packet_dest<%shim_noc_tile_1_0, South : 0> + } {keep_pkt_header = true, priority_route = true} + } {dlti.dl_spec = #dlti.dl_spec} + aie.device(npu2) { + aie.runtime_sequence @square_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { + aiex.configure @square_kernel_0 { + aiex.run @square_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32) + } + } + } +} diff --git a/examples/elementwise_arith/air_project/main.pdi b/examples/elementwise_arith/air_project/main.pdi new file mode 100644 index 0000000000000000000000000000000000000000..a2347424a644d017f5e8ac814673b9061a6becd0 GIT binary patch literal 368 zcmcc1z`)>QtR&b`o)&s$_o|r;3@i+cKrs~{2GJ%!3<8rIPKiKR5Yp2nAV5Jtfx!S- 
z5XuJWBL`Gy9h?QS4_SX=W@>z9Zen^W!ax}W6O}bv!%~(5CLQVqjndvbXQt@Lv$f001PR1P=fJ literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/main_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/main_aie_cdo_enable.bin new file mode 100644 index 0000000000000000000000000000000000000000..cba6b8778c42200ab6ec35c68cb3586f8fb4e055 GIT binary patch literal 24 dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/main_aie_cdo_init.bin b/examples/elementwise_arith/air_project/main_aie_cdo_init.bin new file mode 100644 index 0000000000000000000000000000000000000000..cba6b8778c42200ab6ec35c68cb3586f8fb4e055 GIT binary patch literal 24 dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/main_design.bif b/examples/elementwise_arith/air_project/main_design.bif new file mode 100644 index 0000000..27149ca --- /dev/null +++ b/examples/elementwise_arith/air_project/main_design.bif @@ -0,0 +1,10 @@ +all: +{ + id_code = 0x14ca8093 + extended_id_code = 0x01 + image + { + name=aie_image, id=0x1c000000 + { type=cdo file=air_project/main_aie_cdo_elfs.bin file=air_project/main_aie_cdo_init.bin file=air_project/main_aie_cdo_enable.bin } + } +} diff --git a/examples/elementwise_arith/air_project/main_div_kernel.bin b/examples/elementwise_arith/air_project/main_div_kernel.bin new file mode 100644 index 0000000000000000000000000000000000000000..e44b65c166f6fc2f297dc1df988f5b8b540f6252 GIT binary patch literal 22460 zcmeHPZ)_aJ6@PoT_8qCs?KyV3*bZJN1{-pL3o+C{0(*&DS=5Rg{jAV{f7qXSa4m5RDXR62r#`(}3L zc4v3S-XbFU;YPZ<-QWD?&71d|nSHn3GZvvxU$O0aON?IH{Tg4G$^x~PQ zvrhN3x)i?CSGdkzqYl=n%PQ&u&$dA20szwhV!|lYF37O{>D#5)`P(<89qp^{Tmpuhv7z_i(^Oc z$eq<(YN^^{RPL_hXUUk_v}*5sKX;~d`ND*Y<8v`e0Carlv0q5n3s}Ex;pVYpKH$J~ zW|dt3J`bXA8Q}QgbocSYhr3DF%j?)%N@njKw0BRshjg7peqjy&PGSA__5HgS?cIAA z?Y)WoE7q`A!}=@Nw|76FilObi7}^V;y@BAMQE+aJrkcbfdkYg#*)Cd+6U~ z3tyjE(an!jv5^T%V`Chf)J8{h4|s@1u# 
zer8|giL^ayxYK;%f^E5U{m@nQdaAVq9RPe@+hE}8nc}O*&d%e}EWi|PX=j1sI6e!q zDRB6DI7ThwDVjM)+P`3t_Of-r`R&3r9~^M5tt{5brm}U`?1J)bZ)BWW!FaTFv_{6q z^VfY7tY1M}<%arokF5TCr4K5Z6(^GM~2vuA(!0F)nl zC6j|=GtU?%^!mHU7r~mv-GT0xqA>sNGirUpKNr0>#2CpXFItNh3>gmcjm1g0ghiCQ1 z{(w6~uuD8w?!fxF)jzGQ3V)7%YmEE8uM=(A=@DUS6b-qsro9%|vK<2oPt3ukE984^ z7P&o}FAP|zejpc(=P?#j>7s9sY?`&mcGFs%Tm3#p7kwIq`81TtaBH4ydjYxb9PaSA zLw^=XyPGF1IL2qH)>{4B8L(*l2R54i7<$I(7)C^&wCAF8h2p7 z(D7IA!ZFm+S%Ix%uj^39(fzIi&}x$JY{0qTo=T&Ne(}d^56x!^KR;r$0x`;Yvikvx z?8XSR&sw&3Q=VLM0%t6q--gj*QGqOtieGp_vt5esNM zY5Sp6&>qajm~n;C@iuyl+mq%Kj+fe-+kpOYd-QtBiUZCKb^F!lGWNB#jJ@z8?j^5e z>>s?EvG>1`v3LAAW9R>xiC>Sge;3tk6g;0W<@lrjWuuk%M+%?I7oP8dEe6KZvd_-u z?XwqS>`FBH(ASy^9GuVF zl_PmlMBS}aCvoxf;FM)o_E-e=STs%rV}r3VMRu342GQFAnRiovU7#5ZkH@0F|KaG! zlh5W-)84Psm3!;t+bfwu%5>9oJ)nQshV#YzpnLZ4`UNZZjUtMWrt zoyZSWbs|4h)q%5%hO2W`eI1W&x77!Yu;CF-dBBVjr&;5T(z02 zHt(x~`{~?Q_47QMyTx-=eVxb;RqaH6sHy|!77DXMb4K~T>VAf+b5(sEoYvQ66?L>< zzuer+RhzkLGgocqs?A)r`3ykw8Gz<904yzNJ_GRo`V4@d=TVX3xvIWS-u z;57~IMR4(~hU9?r8W$4eo022@O7_!KXENS%c4M@Oceh z(co1LzNEox8r+NEfo}faG1R&Rf8&Vj7JN^YbyMSxr^bsK+|}R{8hlEFPiye92A|X5 z^BTOO!K)g4NrTrkxEH|#-Td$LYTbhG`*GcZJzLgIjXRziFKTdCgHLGiDGffY!OI$a zPJ_>D@QMbnYVaiuUen;72DhM>NrQt(_Py1(OX+N;XpAGzxPNdXsG+PDF zY4F?#3*en^m=_A3+u*S&EOGz#l*xY)0MNdp)B2SI-+d}s+!}dk) zD)_>0SSlrqgW$Q+*yxEI2f~dBfmCj)UN>7l;0afv}(Aw|>G;K{EO8^`f|VhC7P4g?`@`wlALR7JT~Ymu{2& zJcM0+=#L2E#a$`k*MGk_!1)a5j*}57t-6NI8KVOrxj%y+`} z(?8YP7mbHoyOr13BFCwLaNJA{4jR0T+*u3$~+wPWw=6LhAZ?{@=;$Uj`}Kb)R*B3eU*IFm*Mm+yxvh?C4ZOaf1_*xD2Mr- z%Z+cE5}xJ!&*M+e!+Z$%GX9=#=W3%bC-I?&%{SaJB>O{#k7K2T7dbzQgNE;NKI$de zF*W=G=VOp0epbUD;d~sO#BbB^hrKk5zaq{IkB$rD%y31V1-~lA^F5!hh%@73axVGl z7(K#{gkzkQIL29tW1JbTh_jN9ab`HaN&RA+m3)jd!xeE>@+~jTR=O3j?=JheBF>Dj zh%>{HAB{7^qj4U99>JH!g>h5koR9z1K;rMx@J-IgC#xm?h=!l#{PqZcSi^7Q{8-#H zeAC;Cp}}9lM(o?mK8{tApDyRe*1Lv3#`*E}qP9QE`LTJfv7hC9{6Z=DIjP}~bAD`| zYxt(;-}8?9)3_Hd^*hY@@%X6uyEq^JorYv*U|G%2a(-8Yf2)Q+#`zl~{BaF`ob%)H zQTt+9pu`7_M<_~TV+yuHh6ewOp&@l^B2I6odAHGiD*nveSfvmcL-nm@t$@$stp 
z4(G@A|7A75;O*7;fie`&GD(9<#D!_}}fRGjgfo zsXqq%JcK7jxt^Vw*nX|j`C3p{kjBsoMOZbe&uim$hxYCDKH0 zQ^m$f`emGkCbA7kq>0=nX}_{2+{Ss+qzvIw#ginl3Ae!+AfG1EL~c_>3rYHwHIZ#3 z(?o8Qv|m{hZsWXZLcIz~6%UidCTKyAs`6|Zr2Wd8a2ub5CjSM>GL3!! literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/main_mul_kernel.bin b/examples/elementwise_arith/air_project/main_mul_kernel.bin new file mode 100644 index 0000000000000000000000000000000000000000..48ac55f27234b0be60abf7239927bde8dd3fe95a GIT binary patch literal 14460 zcmeHNO>87r5w7-h>{)PjCy9v$uTgq`gjSIcV{9ZNS)!dak_ASZ5Du0&V7PGMU~;b$RHtn;XdM!!=6S;4xG>^r<_(!fO}6)IjmUfy;raM)q7Rm$11+8_Q{^^ z{_3lG)%8{X&OlKv_6vY1{{M0tpbxkbjGkYwZuvFKSi>!E8$7&WZ43Bm{zKdF0vlXl z>kGCnvzu^^8)X3#fIT?BvSjBsA797GKmBg6f@U2p|9~ZUe*lwN0Ttlig8_^_LC;Sf zKR7HtzH$@by`=|-@bUL*fEW7jzw@_ZI@qwjCKkYO_{OVlZGBX}xcJ}G@BjX#)A2uF z`uFbAPkys;`W1lFKZFGvzFPN(f2^+n?C+khp8o0=U)wt}^X=82F5;{JIK$81Uj9*G z=WyHi-o6@a&vu7D{oP`5)8vnKH;jF<5N*F)t%G6t!RIz8^LO!Z{u(xEKFofx_!91A z{|xu?9>5*rHS7cJGy2z9WdPzoI7ah=ne+57A6#2FZvNhz{QC3LpFLVyUnuJy?H~FO ze+V!z5zQEW>%&eY82;&lr6JrQ>xTuv=a18VGJXsAFZGjT^Zg{*d_PGxs5T932d2X2 z8$8^wxXB~UMYq8Petp4a=5^y6&wG3FEcdqS8KbqvtS6N86uT3qgJ z-3Ax<^#xnk+dp{T+s|}wyPh#kmG-u7^Zg|G`F@gYK=<2H|9V^KXV&d#Zum98`>%Ek z6ueUKk%HF>K34D}1wU5s69vDo;AaXxQShmP&lLPz!J7#8&qDrQQr4}z&-6Xi3e;@0 zt6Vo3uN3}B!D|H{EBKLuA1nBYg5Ov0GXeijJ zxo$FEH8MU@@LIvg3Vx*E#|nO;;P(~$Ou;7#K2`9Uf}bmRqu>KrGm@CM3+y_DcX1Xs zHl~ZcyA%0_YDdI3p{4j1H@1*Jrt^6b;dOMLH-2V}6n>@f2MQk){+P~FX+HAxV5IP? 
z=zQn0#}_jS9~AzWj=waYi_d2s=7Yi?Q(wjWTIsiu!mkwmK;c8gH}l&u zd(*u>EASO+G|skSCeD6;C1l4C)GhdKlt~UK2JYON$(D}pXg^&Ga^Njle`;Fxv=)3TL>f7=2^zHb0`gZ(0 zeH}kvU-3R1^|iR5uf+v@MLy~);;65PqrMgw^cDH2uf@$rc)X*&B7diGPn0hJoP_+9 zdHbZv@ILXsgI}G8eCYTb|IEetX7SoJ#)s>+-|&pV;@>G8jz)%$h#%vi@N43uUMx@0pgceqez%NOcw zapX^=5FdYX!{XeY%ltm^@uxeC|3ihpPyAOR{(-_jAbwgOIiIq*Yx^hF z*Wq#hY!M%SOvCbDo6Gz@@zZ+B{C(o5^^y4p#83BE<{uJ2Io_50%jPYmJ`RuT(g+&d2V**WKwUKOPQ`mVhFZ*20{OG%V6WRQ8 z+a@9oPi^j;D*L(fRcJHocHqxnM`pz&!^adi-9F&*UEHud&VdsBFB&T!b#`-)*lx`V z_sse9^V;0ncu^8Kug@K#dEL9Vr=Qnq*9NC~+n)HyX$#oY@S4-lr&f9!9e*ciaA{z) zq6ll#X;OP{yPV>upE_TY+WNL1S(dh8HAUCj?i$a1a_8lm@ECcTaG%_Hxh53j(lj~g>W78Q%@L`= z;4$(v;XYH0mea3X6CNW^6Yi5cFV}=(T$(1+FF+F>BTp0VlUp0E36GJd3HQmJmuo^X Iz6eeJ2joH%+5i9m literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/main_square_kernel.bin b/examples/elementwise_arith/air_project/main_square_kernel.bin new file mode 100644 index 0000000000000000000000000000000000000000..8ba56366c72a88bc1322e4d03447a2fe49afc7c7 GIT binary patch literal 11048 zcmeHNzi%8x6n=ZRu{U6n4LYKYW0Z9gp$J@@6C(vF5_`l*78v0qO(Y5o4Go1!MaM;0 z0U?B38YCn)^e2iGsa8sAbV`-Vx&;1!b*dB*=FQBT+xhj(#)^wb8OgV^-}~mxoA&0$7C6;Ym5aczFvafAmp1hiVQU0;F&rnV&^DKn}N47z`uqz{A};*P{oAPXl~E zdFMJj__zQ#l-&OIkEl%NLN=|J zkPY$+0+tO!vGkd4&r4jy;m5+;V2^Q0z$WHZ?fgsa%0#@V+EtnmOW^bpvT419Y+5fN z8|0S-ESs<0p2Wl2LArfK;$iLbHrQia60p_Uz4}tSGUG3*c9kZ?5`69QHm#S?IIWkE4d}cZ(xm%dT4(&#N%rW$=Q*R~USi!PgkP&)^#jzQy1J1}_g*>>H+cwU9^E`t{gzQW+E48F$TeFooP@GS-(FnGz}Lk1r)c*WqU zJdMe3|JLV}I;E+t(5ef}h!TpY1ukFW!e;Wjv_D4?ZhxhB0 z;e)dusb~PYq<7zcc>hQlJ~;22+P~7X??0^blG{(;e5k*HLsw7^~H{t(Jeho((~)lF@x*p44PlZJKHc`&HKN2 zXLTr!9u%(U*QA)9=eoFlrl)anRc*S}6&&< z!bMOfKV|qcgr9KunZvIWF%7c55@71P+cWD`^O<@T1|RiM^IKs(Sp2rZXX<70Q4h+$ z)F-2dr^Py^LpH3h?ppcm67JUb65(#WE;9T@1}_+VnQ*sWOAH_NCgXSYfO?a-7k{Nr zh=&0BrhYm*C&d2{pfveV^KJaG^7C$V>ZrwslQJGul;!_iq;ND^c$fGt4h+8_ zKCajD6Epld;$z+{e!}oC5+7G*@uwJmTD9fAfj_la{9;Gp?z%9~k{`&k#F1ZG`FyPS zZl2G=NzJ$B#k_@a;^TqB;$LL=G4Z{1G5m!1uQ~n~82%LTk2?HI3_q^UU}^yHXFV(a 
zY)9c}vhafVo(>uQ67l`_OW6M+@$sieYu=pkzf62jw+#O}@xA;o{J2ujR{VIz&s?lH z7l`lYC(OS@eEccX@^f}4%wHzHzuqvvLwtX|VSZd)mU{5(rEpgd3&h7CNv-&2cEbE+ z;`{N3`5of>@rU_wb&ZK%;comL;`{N0`B`;F?gOtn>z&(Cxa&VBzW07-_#NWE?)YC~ z_%p=!_8*3yRTr4}74FBca6kSqKdUY?{uQqMx3MEzg_##koHwTI-%{c#lD=?mc_uM` zXVK59#c1(o+myD$Hl>@eP1W4AiN0+34;Ed~5y9B4#9JAsxZcDU$-}GJVjRC$>)fyH z;&WuW}6d27^=PQxYtRwviV5NKs*+w%OM~GhEzHO8ndrXNQn#^nGhI|}Y zZbVddE3w-o#(IS4mG*6;+}LBpU6Xn3+>nn0%T4`G!|>!Po5U!%MzeQ-M!DJFymoHL z$ARUhu+~-Lf}cG%wvUGU&A#Tfb3^kUSZ@0JiqX&vc5NTca$}EaS`#}r T*gl%&#vZf3+>npwk(>Vj59vP! literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/main_sub_kernel.bin b/examples/elementwise_arith/air_project/main_sub_kernel.bin new file mode 100644 index 0000000000000000000000000000000000000000..32bcee0ccb77c74fa42e783e1359dfb1a7806953 GIT binary patch literal 14396 zcmeHN&ub*t5w4zTOXFm`J9d;X*;R0_cL|HpVPq+wfWR%U0wO3dj&oSZ!JAy{Lk_|x zALX!+5JK2XKnaBC639{Hki(uD>~k3W2Q0Fa+*>~Rutcf%UcK&D?^S7^7wfX@gPHFB z>Z^Ly^;Q3zfudX*6#xx>wzmLAfGfcm`t|CjU$cx=-14@`qZ`&Xz;E*jZPN>EaDi@8jg3|9DtIy9S>DOyCXNv!53g!0GuvCvbv(xLyn% zPGI&K#`yK}!@I@%do{odqlX{0>eipLxo$FEwK6_a@LIv=3Vu() z4;1`R!5=92v4Wo{c%$G81wU2rR>3E*Y9ujl3)poE@7gSGY)lt>Hz)E9)sBd7LQC;2 zZfqfcPUrI?!t3ZfZ~V-dDf~*|PZT~V{5hSc(tPCW!A#*-(fQ70k1tvjJ}CS-9e-&) z7oX2!eia?ZeLgNu%m;-(r@o5$wbE}hg)TE_q(n z@#=^L@b7oaE1p+(Jln#I``^Eo^u`Tfg`PnSA?#=Wyc5@_Bz}zh`_vd82hTWRe1Gri z`DSUyVkff>o>!A&zuD^+C`}avPJ`UbR@;bPAJ>M+tSlr00gXh)c z*e`e;Jnu<$xX}d&^Yp(p_&hzSq~8l-+`l7^@p16`LKxq_H}-t9bmLk%Gb{G$gD=sA zQ9s_f@A(xF-vZYJHo(q#{gLOxFyI%z>)_Ba?)0Abxq1qTHm6K-OJaYhP%g>Z9o5%MPrf0gj>M*L~S?+bAatiBdd^gZ7R_3HVG zUUk67e(3oFxgQvR81NOnLO%9`i*Hp^`r&bXAIo6}>g!ioJyBna$NFv)9_zKG@OKrw zR`7koW4-niKK7fo3&V;_-fAoh>fZ+4CD#&o3NbsI%qc%{j}nDz5uD497Z)IM!LjvCbA3>MZiH z&KAcnHZRs${7r>FCVo;kgG#15kIYu%-<(|y1z32F7cD&UCF;}-%;x0@VGuB;-~v7^H+$U<}dSS?M-{W6^?iD zJiJBxG#?iy&VQTuX+OyPP2#7=m&~6N|C>?&tewgHyTr#oh_Qa#Q}}D`hEgAg$NPJW z_-Q}Od^|tc_-TD){+#&f{>uD{_{sVIOy;k(ca(e_9`Emz_-Q^eACDIsKe^tW$^6y! 
zroGONBfVy44o~xOxS+3iyu>Qie~0_{ zLtL?=#WRB$#tVxY{-*@m!fGSg%C@lcU|#mQn)%Uh`zEsaaN7nV4o_|FoGSad^Hpdw z>vrIuR%d3#B*RC8n{FR)`4Mhd9_K)5{w#aJSoyTKn|s7=Yf-po&ZnQ(=GMiFa=>|g z?hwuE-nBjbyjHs|IL+Jj#HUVMz@~;*oqj&G(%b0xJ4Ay^1EUp1Sessx+I!n&gWrDY zd`)WW+kIte!hITa&Y9P8I!$;C$kK%SJVu@-+$VQlt_j7sG)+ua{b+DCr#8KZS!?)amL}ZCY~{?$HQ_PxG~qtE^Kwln z#-(Y}pV0tEZjOcf5{q-OQZ&C%Ar`{FT7 zZ8}qQt?jSz+$VQlt_hEkrwRATotJAuF)mG$!@ho4$lM%}8VnvIPZRFbV6>cm<(lvq td75ya+7DIJ042kN406>6<}6eA>Btx)wQyxiPco%Md%H$e1Z z*Xk_&@?rwd;kAq3cx#nWwO1b!MJEnxG*S)rcmBQn(x=_a&7Zq}uXmlfG`jpa(eme6 znlOwGCt>tOvX5x0zS+C{{yRq|=lHzIevp(*e%NE|Bccj>po*O$Syocw;&WV-noLJe;iIpj?Wew4DH>zhY^Ou)tBQJ+S&d=M^59jTHoHyNn zJ3o=>zm@HLSlP~pl__o%=B-|sx4cXnc{vnve%?+zoVN#Z-gN)%{6wbzR<`qDWjh~M zrg)|>Zzl`$mX~QGFNZ?T&)X{x=k0-3A`roy1<(Prvg`~-{RJP~d4cCo=azRlQhy+LhRUXinfDA%#I;Zr zd@T5i;7RaJ%$D<$zXmKHi&?A)dlGyTJ+$_T=xtT-vEVC$C&4#SE9WQQQ_IJ)p2D6C z-_7?iv+VcH4lmTqI(&L$1ZzdK`E1ZB_gVW`*(c7>px;gh2X?Ygw13qnx#^FugTvhO z9^A8e@vK)2ws{4R8LR7{pesY>tXRkQbzW?{J-_-M^JDvj-R0Kq`6>VR$Paofou9@Y zWX?Un=Oe~Czr3z&KRc`ZoNjx5qkGJc?Gv1uU%Th0{NE!#=&^Ku8oT`TwWRaQ>&o`K zsLIdjw&!3fdOFRv@xXFMuDr*rw?;e=Yf zfx6L=l?`p)lDqZAZXDRb4>Pc0f<0uSUjDOf)whxlq&=04z4x)&RoqatPpp6cSkY9; zMf=$LFJ0NyKi6uk6LDa1t*Qp|231GMj8z>Ax-w)uYf-wepzDEb?*Wy9t{?IfW?v+9 zJJg@&3u~A9Z`F(S)b_j7Gbz}kU)tW^xnJAtOU8bedKT@`FZCFUXqSH7O%_f4IZ0c8 zg>C&OI(y01cpR9Q)lEXSy-5nXX~?$5Qw3cB+4hEnZio8&d#t~Yvl0{=TSxPKWyYOqgpbr<+Ji)Iqes zbJ5ercOHX7@XrygF;AM;_;!h&@t!=Wcla3BC+Ii%sxBI>^7Ps5qncAVM?ogU|Xd_$lzNFV21vd{%Q|AA-jt zc;$aq@YCS4nhQQiYr;=0i)ReHi^uU3;PDfM@;}sedy zI6pz!5cxJlJPrIFqy6`{9UsEp#p(Dd@Gc(5PlI>oI(`;>)_37QNM}SmGg&+#cz3Sj zd%!#Yj<2SJzUX^4!{gwcpTUfu05AHU@nhgc-!r}m{!vq(k+$lR&vjY0>mMR!jRXJy literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_elfs.bin new file mode 100644 index 0000000000000000000000000000000000000000..9f902fee90c41225fe0fdf079a639fbbc0bb5606 GIT binary patch literal 2656 
zcmeH_KT8}z7>9qex8ClBnCuCWK@!-UBv@SGX=S&kSBMglm_|sG*w~~n79m}V2wH?R zlCTznjs23sz*PY)ti;mlP{{|dvndHhs^2 zC*7(5JmFG=RyYpYrpvOR^Him$pJlRup@8{}RfX<6BtDr5HMjx^(877eGYokC7`R+0 zg}ACyn`u(>eouT&|>kruev?*7 zzAWz1sr@~=R))~&MGn^ruzI`OyFX%K!bF9J0*<+oLN^mv#xN4aIV0|W`U1+hEAUOjAcH~4t(reAGHJA zDQO(w%(}vvb%is8Zwh>TUEpJ$p_6AmW0{Xj10Va=N9_Q2N*V_^v#xMvT~X`??RlZC literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_enable.bin new file mode 100644 index 0000000000000000000000000000000000000000..7cc1818bce4d6ce1226fc5fda519967a4842b99e GIT binary patch literal 104 zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>)) B1+xGE literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_init.bin new file mode 100644 index 0000000000000000000000000000000000000000..d4549ba181167c6e6d7475f23335e34fea2c3376 GIT binary patch literal 6032 zcmb7{&rak<5XQ@%hINw3u1Uy>EJ*QcI7AT#M?)kINHH2BaX`vMPg)mN>ymUudP@3&^d;#l($}PKNY6;mN#BybCv6+u_35mqPjyW9=@;sg z)`iu&Bwdl7lD;5)N&1TPHR&7DGtzU?x1{e$Thh7gr0z}C-Y@95Kj1=K_e*vRasT2A z`KGw%NtC~$_drR!Af6K!;#1rYonOBXO5z3Gubj9LpW;_N?bzRq9a$@;UpUZ4Mwdf~w*)RN!T zTGw0O+2D~L&k!+PZVspqp^>qxyLCZUYYv{=)g)M&yk)HtOi1`aSl`pz$Z~{VHC> z{wm~8zhkS{>@Pz8!5=I0ho0@PV=wrGo|TPu3xmfX=5+8#k7tP3R^-WgJe!Ecz0Z3* zJBWQ9<=Kinq2IpGzOohbc{t*JyYq{${(e{F$2@C)+CR@F|Df@&=r{W@PpQw>ig|vl z-i6*{KG=BId(6k{cs_@S#l1i1@f;%-&#>z8oFW$Y{-no)v!Lz|&O*DNXg+CuAD%p) z+W++V)c&W>r}jU6KIt>&-QrcKx@>wrfeNOYm3=@d_NZ zw|+x>4;)3bJR^PpjtW{nB))G4Db?$G()v+vO7%KEZ|nOoNU7cwf84d}i)35nL%m9) zUZqj5n{~S$^(u~f-3(ifdKE{#Db=fZZU-ih)>n&xd&hVAz^Pux=TW^Tu6I|q)q2#U zHTbJ2{~>V${xaeR#53^EBfd|32(IcOZtO_bqxD;h5z6W6t-#g!Bz^*}&xc-r46f!) 
z>of4z(euuUpMtA-6E}8D`NwMi``~*2njeCH5^6!G{ z`D%{yOZ(S+3a%8O|3dRUJE8m&wZAQJonLdDk5s*%7Mkzakr_t)DHkL4{@Vhl z{wcl*uJh~l1-L$+%*XWnG36iY_v1oy)KAZ!`1S4I2o|Ro1qz~3EajN7i78>jEaZQL C^4EL- literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.elf new file mode 100755 index 0000000000000000000000000000000000000000..c7c58092079ae00dffa136a058d3c0d77b6c8d5b GIT binary patch literal 1672 zcma)6O^6$17=9)jldSlYRxE=*FsvY}tl3Eut?6lYNu}5*= z9yVZ6LGh-_4LK~Lh&&{Fz>(o59Wt1TGRIqg#%>my<-)8;?Yx2! z(8TTe;w1XwbHMSNa~kd}F>-@Fua+@l2MzokmZ2Z9+tJD4hH`6u0r57aSK}=uNl?!iCG~4# z9=N!2`y{C+OB>GDD#EibVqS{{+lm%Ho;|U}_%Kb~9 z)|Mby86NH}-iPXw`Pkn;E$zqE7nDZ`{5UuI^xkm$pW;$3Z_K6pV@;Z`CRZwxLTy~1UJ}%At`$`*%7qix72w_p(DH*_ zFFdccT1~CmsOS|PaN1U|t?hYXu|!c*-awy^N`TTjGPM zepfW1aY(%_f^WJ&>OUfHa_$~k;-8Q;4uR;;$(jhFOg$uPA_&nxkTns6=%2`%2txEP zWK9Gi`ZuyBf)M??82RKME9*HHZA|rFq6brrO6={&kZbIC0gdQ#C$?sdS|WaTW!yY*Sc=M?V-|IUDeHM%4f-D>q6k_aKgA+aU43f`hjEG-nJi9 z;dDdK)XlT7`&;8jRV*E*8O2r@!;E@~W$sz+zGpVqL^``LJC+}qe&ELt3#Joy!zil~ zziEY`bvg4SVJ~noE=q%A?ao#vLi}9EtMaJJqc*-yLyqnLKw*-M z#)w-_^?DBYAhm_Ec_g2q32smf7?f|e%)u`N@16f~4*A|^xX8e{OpvoBEYP0KJ7C`5 U|B?4{S~EyvyKE>uvw1Rp0HPHLbpQYW literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script new file mode 100644 index 0000000..fc4f0cf --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script @@ -0,0 +1,72 @@ + +MEMORY +{ + program (RX) : ORIGIN = 0, LENGTH = 0x0020000 + data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 +} +ENTRY(__start) +SECTIONS +{ + . = 0x0; + .text : { + /* the __start symbol has to come at address zero. 
*/ + *crt0.o(.text*) + _ctors_start = .; + _init_array_start = .; + KEEP(SORT(*.init_array)) + _ctors_end = .; + _init_array_end = .; + _dtors_start = .; + _dtors_end = .; + *(.text*) + } > program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +/* No tile with memory exists to the south. */ +. = 0x40000; +. += 0x10000; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +. = 0x60400; +buf5 = .; +. += 0x400; +. = 0x64000; +buf4 = .; +. += 0x400; +. = 0x68000; +buf3 = .; +. += 0x400; +. = 0x70400; +buf2 = .; +. += 0x400; +. = 0x74000; +buf1 = .; +. += 0x400; +. = 0x78000; +buf0 = .; +. += 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_2); diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll new file mode 100644 index 0000000..19c8134 --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void 
@llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_2() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %17, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %16, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %17 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf2, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = getelementptr float, ptr @buf1, i32 %3 + %9 = load <16 x float>, ptr %8, align 4 + %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) + %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> + %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> + %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) + %15 = getelementptr float, ptr @buf0, i32 %3 + store <16 x float> %14, ptr %15, align 4 + %16 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +17: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown 
intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.o new file mode 100644 index 0000000000000000000000000000000000000000..43b8281695feaa671e97fd88c5e79951f78aeaed GIT binary patch literal 1000 zcmaJ5L(5Psh7b~lwOX~7xET}-o3PZjJz zM5GrbM=#!c>S0g)1=3@W75oAE2dvNJtLYvbGMRa1=ACEWNw!v;J4z{JqM#KJa4-dw zn7k1eTt*cqwGwJ$pYJ^#yEHX<#yTvi`FT97K@;C*%6#o>J$nAY|-zpH({QN!;g z)`%EJYa#KLtn%5)->CC>E_(Q5?pL$werQDJf#^$-)L5`JMq6(#wQ)p3 zzAIz}$lwRF&s5Bj;O*dC0UtDt{-=sslo^;$MDn{+6L`MHDf3A=>nUzIc5A=0ffPW&Anr5nFkzgh%vu3fLE4Wq}gzrLN7`Jipw{ zEJdqRpHAQx6IfuKdlV6~T#a4!0MbnkQF4TxCI~BQ2ovf2@ xkV#f|ivd}eohEehuF{wOGLPhDsj~LMal9*nLasrdEpm~%9U>{8rb<8I{s-KramN4v literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll new file mode 100644 index 0000000..0eee48f --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll @@ -0,0 +1,72 @@ +; ModuleID = 'air_project/mul_kernel_0_core_0_2.peanohack.ll' +source_filename = "LLVMDialectModule" +target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" +target triple = "aie2p" + +@buf0 = external local_unnamed_addr global [256 x float] +@buf1 = external local_unnamed_addr global [256 x float] +@buf2 = external local_unnamed_addr global [256 x float] + +; Function Attrs: nounwind +declare void @llvm.aie2p.acquire(i32, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.aie2p.release(i32, i32) #0 + +; Function Attrs: nounwind memory(inaccessiblemem: write) +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1 + +; Function Attrs: noreturn 
nounwind +define void @core_0_2() local_unnamed_addr #2 { + tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %17, %0 + tail call void @llvm.aie2p.acquire(i32 49, i32 -1) + tail call void @llvm.aie2p.acquire(i32 50, i32 -1) + tail call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %1, %2 + %3 = phi i32 [ 0, %1 ], [ %15, %2 ] + %4 = trunc nuw i32 %3 to i20 + %5 = getelementptr float, ptr @buf2, i20 %4 + %6 = load <16 x float>, ptr %5, align 64 + %7 = getelementptr float, ptr @buf1, i20 %4 + %8 = load <16 x float>, ptr %7, align 64 + %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) + %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8) + %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> + %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> + %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60) + %14 = getelementptr float, ptr @buf0, i20 %4 + store <16 x float> %13, ptr %14, align 64 + %15 = add nuw nsw i32 %3, 16 + %16 = icmp ult i32 %3, 240 + br i1 %16, label %2, label %17, !llvm.loop !1 + +17: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 53, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: nofree nounwind memory(inaccessiblemem: read) +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind memory(inaccessiblemem: write) } +attributes #2 = { noreturn nounwind } +attributes #3 = { nofree nounwind 
memory(inaccessiblemem: read) } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll new file mode 100644 index 0000000..7de74b2 --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_2() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 
0) + br label %1 + +1: ; preds = %17, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %16, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %17 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf2, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = getelementptr float, ptr @buf1, i32 %3 + %9 = load <16 x float>, ptr %8 + %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) + %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> + %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> + %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) + %15 = getelementptr float, ptr @buf0, i32 %3 + store <16 x float> %14, ptr %15 + %16 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +17: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.elf new file mode 100755 index 0000000000000000000000000000000000000000..f1be4eefad7b89482558fe9d6e292826c0748801 GIT binary patch literal 1736 zcma)7O^6$17=9)jldSlYRxE=*FsvYJS+g@u>^42kE~$vk!g^2^JZQrV_9w>RX<+chXiISt)Kg|b 
z{nDHTUR%EV%;567&(}7jU(Gz^1`YsrXoioyql|jQ{7U%56u-5sHa^QKbFzNMV@Q0M z`4>N_EkLqTJltEn57o!>vA>CO(vPdpDUSkb;T!Jq3V20?T0pXA)|=p+A<5tGFr1N| zgRieYnd#jBerEXbgTdBc`Gri@oJsb_nj~M1uT&<5TD>v3B&c6q&#Rc13ny+Uz=LC; z?gu+wcwVd5Yno9l=_MU-ns%_I?RsI?ZwHB2o8OM2PWSxc;+7w6_ZlUq-CA7tn|`pj z*lg~$UbcNtYm^;Vue#;3XS-#+dbV=T^_(TQ;+Bm@xlwV9s^?ZKo7I=%HB3t`XN)Y4 zj-Mi18+#NyOM5uQU!bjxixf=mL{9JMSbhr|Q}Gqa=P>Cz2?ZBvBO4!25xKaRIpJ?g zyg%V@3nw%Vsc#73OBYD}2hk>$G~WIcysm=pfH@cx9~6P}HK;gWwO+??<~g!d;LrO4Zh zAlKgV0vg>*t;lXrZ+D|Gw#M3acC4=dzSrgZs=u{b^X;bRM5}GL*Yr@TudL{nk?1qj zR{cWY>TtryC^-%to4vrX8s3&47;xI5XX#cMjo#)fA}S)vu&i!mhY_r9C$_9zyV>)s z+Nw}z2Ug4W1IrKm2x7r program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +. = 0x40400; +buf2 = .; +. += 0x400; +. = 0x44000; +buf1 = .; +. += 0x400; +. = 0x48000; +buf0 = .; +. += 0x400; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +. = 0x60400; +buf8 = .; +. += 0x400; +. = 0x64000; +buf7 = .; +. += 0x400; +. = 0x68000; +buf6 = .; +. += 0x400; +. = 0x70400; +buf5 = .; +. += 0x400; +. = 0x74000; +buf4 = .; +. += 0x400; +. = 0x78000; +buf3 = .; +. 
+= 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_3); diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll new file mode 100644 index 0000000..79b2ca7 --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_3() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %17, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ 
%16, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %17 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf5, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = getelementptr float, ptr @buf4, i32 %3 + %9 = load <16 x float>, ptr %8, align 4 + %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) + %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> + %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> + %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) + %15 = getelementptr float, ptr @buf3, i32 %3 + store <16 x float> %14, ptr %15, align 4 + %16 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +17: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.o new file mode 100644 index 0000000000000000000000000000000000000000..4343d406be95fbbfffcd92a0f42bf02db06255c2 GIT binary patch literal 1000 zcmaJ9W#}>r3!R-d>se(O- zi1ecD(Tn$NW?5%scbU%sbD#v)Ng5?klB`ih`Cyz|j;? 
zU~)AoxQ;T;Y9-XfAw1o!29yJjpK(lFs&VIe^-Zkt%Bc6 ztPwGc#zO2ZS;dQ`zhUe1T=@9M+^>4M`=J(I2Es3SQe(l^jIZ_#uwFm88t%Ng*2D=3 zd6viukiqk3pR0%?#=HKd96qWWy-y{zC^Il0i{$sGCh&ZPQ)ZKL)^pr)@!b8mANg&p zBz&B{q%QEL_KmOHjk$=Byg3PE=1S%PMf^D(5L;QSfEV=la@Z4JrGWyTB(BZaJipwX zG)1dZ?@r)*6IfuCdlV6~T#j7!0&Z%tFR!S|pMvs{S}jH{OmZszUm`IBcRPT&ISgF$ z!K0_kTS2$~3Xb!v)qL6Y+h_4Fe3~l#g!><@G;zKF literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll new file mode 100644 index 0000000..ce97114 --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll @@ -0,0 +1,72 @@ +; ModuleID = 'air_project/mul_kernel_0_core_0_3.peanohack.ll' +source_filename = "LLVMDialectModule" +target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" +target triple = "aie2p" + +@buf3 = external local_unnamed_addr global [256 x float] +@buf4 = external local_unnamed_addr global [256 x float] +@buf5 = external local_unnamed_addr global [256 x float] + +; Function Attrs: nounwind +declare void @llvm.aie2p.acquire(i32, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.aie2p.release(i32, i32) #0 + +; Function Attrs: nounwind memory(inaccessiblemem: write) +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1 + +; Function Attrs: noreturn nounwind +define void @core_0_3() local_unnamed_addr #2 { + tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %17, %0 + tail call void @llvm.aie2p.acquire(i32 49, i32 -1) + tail call void @llvm.aie2p.acquire(i32 50, i32 -1) + tail call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %1, %2 + %3 = phi i32 [ 0, %1 ], [ %15, %2 ] + %4 = trunc nuw i32 %3 to i20 + %5 = getelementptr float, ptr @buf5, i20 %4 + %6 = load <16 x float>, ptr %5, align 64 + %7 = getelementptr float, ptr @buf4, i20 %4 + %8 
= load <16 x float>, ptr %7, align 64 + %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) + %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8) + %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> + %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> + %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60) + %14 = getelementptr float, ptr @buf3, i20 %4 + store <16 x float> %13, ptr %14, align 64 + %15 = add nuw nsw i32 %3, 16 + %16 = icmp ult i32 %3, 240 + br i1 %16, label %2, label %17, !llvm.loop !1 + +17: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 53, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: nofree nounwind memory(inaccessiblemem: read) +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind memory(inaccessiblemem: write) } +attributes #2 = { noreturn nounwind } +attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll new file mode 100644 index 0000000..c86e34d --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll @@ -0,0 +1,95 @@ +; ModuleID = 
'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_3() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %17, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %16, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %17 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf5, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = getelementptr float, ptr @buf4, i32 %3 + %9 = load <16 x float>, ptr %8 + %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %11 = call <16 x bfloat> 
@llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) + %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> + %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> + %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) + %15 = getelementptr float, ptr @buf3, i32 %3 + store <16 x float> %14, ptr %15 + %16 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +17: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.elf new file mode 100755 index 0000000000000000000000000000000000000000..2158287344d4e58ea0bb86865df912b73324cb82 GIT binary patch literal 1740 zcma)7O=u%!7=9*=NhRM5e%Yx=t5Xt=vT&vn+kQpAuIH1Kb~2>pQ74vzQNm0PoOzSyLG@bdEQCkB_^db+kD^=jrJH*f&3LNk2mO=VOg=9j}qrudCzwefLInUntMk0JhH z=3o4zwg8Draer;G-j^S*$NDD9Nj-L-Qyv7=!k66TW$=m!wSZ*L%s0V1LlVE=p*tfr z2VY!$G}F2J-OTWVqrujn`Gri@oJsb_oFrb2uT&O=TD>;8B&eTX&8wJ~3n#8Az|k>K z_k$fTJg3#`HO;7&^pXxZO*`1qcD=Ccw}Zs0&2L9hr+aR3am$akdySIQZY{3+O+VOM zY&LgW&)dGIHOh{wSKV^iv)!^@JySXBdd`wtamz-d+^9H4)pM(r&FXV;57RT-86%5( z$B&V%jV%hEq&}SD&rsLKNeU)+BByunSbhr|Q~ni+=P+qI2?ZCZBO4!25xKaQIpME~ zzdzw`2q!cS$!`eZOBYD~yObMTyF)4d_bD|Ff$)zgH6cWid_buQA%uTTsR<#3|3Ik; zA%y=#sR<#3-=x%p5W;^ECChh1NI!?7m=pe+@cxAVE<79m!X^F>;pT)N3GYuhN|Co0 zL9V^!1ys5hTan$M-0ntUtc|7Z>{wm@9k0vxReyc8=G#rriB{Wguj!#wUs=&DBatsq zTJ`gRtHTK+qvSX=Z1w`jYIs|IV8CgIo~2tAGp$X7CprDZ_YDh5&2*hq{sTMI5=sC7 literal 0 
HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script new file mode 100644 index 0000000..ddda3c2 --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script @@ -0,0 +1,78 @@ + +MEMORY +{ + program (RX) : ORIGIN = 0, LENGTH = 0x0020000 + data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 +} +ENTRY(__start) +SECTIONS +{ + . = 0x0; + .text : { + /* the __start symbol has to come at address zero. */ + *crt0.o(.text*) + _ctors_start = .; + _init_array_start = .; + KEEP(SORT(*.init_array)) + _ctors_end = .; + _init_array_end = .; + _dtors_start = .; + _dtors_end = .; + *(.text*) + } > program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +. = 0x40400; +buf5 = .; +. += 0x400; +. = 0x44000; +buf4 = .; +. += 0x400; +. = 0x48000; +buf3 = .; +. += 0x400; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +. = 0x60400; +buf11 = .; +. += 0x400; +. = 0x64000; +buf10 = .; +. += 0x400; +. = 0x68000; +buf9 = .; +. += 0x400; +. = 0x70400; +buf8 = .; +. += 0x400; +. = 0x74000; +buf7 = .; +. += 0x400; +. = 0x78000; +buf6 = .; +. 
+= 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_4); diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll new file mode 100644 index 0000000..2552e6c --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_4() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %17, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ 
%16, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %17 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf8, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = getelementptr float, ptr @buf7, i32 %3 + %9 = load <16 x float>, ptr %8, align 4 + %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) + %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> + %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> + %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) + %15 = getelementptr float, ptr @buf6, i32 %3 + store <16 x float> %14, ptr %15, align 4 + %16 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +17: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.o new file mode 100644 index 0000000000000000000000000000000000000000..2fea81b77544b9d8acd307af0a8527ae8aff2af3 GIT binary patch literal 1000 zcmaJse(O- zi1ecD(Tn$NW?5%scbU%sbD#v)Nj;?A0`JS?4h|n!z_hlv`CaW->Q($+ zVvUHdHy2`W$t<2N{SDin=fX!n=6*HG&WCz<9tgkWN%aLw)4y6Xz*^(zVz~9@QVT~U zR3_db=>qRha2ERx@yn!xi_PMJ;0Sx<4x#Z&j=e&n}s zJ>lc@C3S{3wQqdocFaYD%+h` z?mv98yb(D4SFr7;?bb`jbNfMiI2d^@jE%q#oF47l*J6947q{fNDaQ>MzUw(rFQY&7 z-LBbPfnhhBX7^fR>$F{ab$xKMreuWwnJY{tPl(DooO_hJkgim$XtEMmxf*e0ry_0Y 
zWSxA2RMNUN24r1UlBkeZV$xsck=!g*)}FhLcX^P@)#, ptr %5, align 64 + %7 = getelementptr float, ptr @buf7, i20 %4 + %8 = load <16 x float>, ptr %7, align 64 + %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) + %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8) + %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> + %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> + %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60) + %14 = getelementptr float, ptr @buf6, i20 %4 + store <16 x float> %13, ptr %14, align 64 + %15 = add nuw nsw i32 %3, 16 + %16 = icmp ult i32 %3, 240 + br i1 %16, label %2, label %17, !llvm.loop !1 + +17: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 53, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: nofree nounwind memory(inaccessiblemem: read) +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind memory(inaccessiblemem: write) } +attributes #2 = { noreturn nounwind } +attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll new file mode 100644 index 0000000..bfe891f --- /dev/null +++ 
b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_4() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %17, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %16, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %17 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf8, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = getelementptr float, ptr @buf7, i32 %3 + %9 = load <16 x float>, ptr %8 + %10 
= call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) + %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> + %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> + %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) + %15 = getelementptr float, ptr @buf6, i32 %3 + store <16 x float> %14, ptr %15 + %16 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +17: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.elf new file mode 100755 index 0000000000000000000000000000000000000000..680e4695811b0567bb930a499c2383fd1f5d7bf8 GIT binary patch literal 1676 zcma)6O^6$17=9)jldSlYR$2ysU|2y`S+g@HcAK7NmsE<)!uFyI7G@_iv%AA4lXNm! 
zw}NcIf<34=B?^KUZyth&3VRY-uqWxkLvMCVJ$NbYtq1A$c{4LfDjs~8?|a|p{rTSS zn|J2hKU;rKQ52-epfF|?8lz``8OjPYWH66BvXJOLM~YiC$Y3an92fqI+|)NMbJHZz zvkHbl4R>bqWA7WU0(ak^(ePk_o|~+Bt%xBjXy9{Dgnr0sherqN%I(=X;IsU}I&Pmg zxr4&By&sidi^eE7UQkt6PR81To_#*|c(D4_vBBE!$L=lX?W@M%1Tgq|bfh>4>PfSp zeq+u8=a%ohJlMbVT5Ut>)y%s*z#+g2&G5O8lu?bCUkP8B;T&luLQ0b4|<1te!?z6tgWN&G>F?u^tN ze1HALOy|MRGsDjx4YvNtFJ!XjOmaTvB=Kt8Qdtyg^~R)0P`|yNS1~UQCvGUfqa&d1 z2RmMPMyuCrno%w3B^_{@cCe-GdSTaZ2Z>di-;SbA_srtrmLF~R8YQRQT3q*=ez3RL zZ0@$+wtY`)lpR;Ey5+KGyJfw4s&d-(oF%v7mW@WaQE`l_=TikOwZiT7+E|# zdWmdpY*FwE_2CqMow_zoQZVU>yuF7<@-J*m`Bx;K!=&vv6r87yZ2WqP$l-!;bHYCq z-kH-^5Y+7;{TNP4X)iu{8uS84uQmfNvR1Tisau>YC;I%KT&Ez2;sj_YC;I% zzfo#J2;qNFYC;I%e~OajcSFc|?ulYf_}{|&6OK~k?M09_w!DB!_i`(;8wn^P`MuOXSgrYX({rNLw%coZDAiY1bjwKOJCs)aY~bo}!pJB&4h@^Vz_A+M zmLC{!+M#FZ*1Kr*HVtDm>oQkiS>4DEBUs%|tXaEuv*%g0RiVxftd{KumLK>LM1$o- z?Xa6>iQTfp(7v2nVz(E#7$qgn5qGCEVqtCZW85U^(<%41tRdZ$82`Q+*?UbKk>C#P->>@r1A+>rU?lE literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script new file mode 100644 index 0000000..51c13db --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script @@ -0,0 +1,72 @@ + +MEMORY +{ + program (RX) : ORIGIN = 0, LENGTH = 0x0020000 + data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 +} +ENTRY(__start) +SECTIONS +{ + . = 0x0; + .text : { + /* the __start symbol has to come at address zero. */ + *crt0.o(.text*) + _ctors_start = .; + _init_array_start = .; + KEEP(SORT(*.init_array)) + _ctors_end = .; + _init_array_end = .; + _dtors_start = .; + _dtors_end = .; + *(.text*) + } > program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +. 
= 0x40400; +buf8 = .; +. += 0x400; +. = 0x44000; +buf7 = .; +. += 0x400; +. = 0x48000; +buf6 = .; +. += 0x400; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +/* No tile with memory exists to the north. */ +. = 0x60000; +. += 0x10000; +. = 0x70400; +buf11 = .; +. += 0x400; +. = 0x74000; +buf10 = .; +. += 0x400; +. = 0x78000; +buf9 = .; +. += 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_5); diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll new file mode 100644 index 0000000..4ed7251 --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, 
i32) + +define void @core_0_5() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %17, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %16, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %17 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf11, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = getelementptr float, ptr @buf10, i32 %3 + %9 = load <16 x float>, ptr %8, align 4 + %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) + %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> + %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> + %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) + %15 = getelementptr float, ptr @buf9, i32 %3 + store <16 x float> %14, ptr %15, align 4 + %16 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +17: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.o new file mode 100644 index 0000000000000000000000000000000000000000..e70224c9ad756c9bb1acf0ca65fa018ff13a4066 GIT binary 
patch literal 1000 zcmZ`%O=}Zj5Pse!O}0{HTd*#I5PH(sF1xlKdXY^`wc~_4Dt!23>Q}AY`Cx|^f$&S7)R=KJlto2c; zVY>CY)xMEfTMBKPsO>JVty&R0T~so}|IIa~k}E_-EzbQWSDvm^tZ1?lIJp{eWv3#y zsiSrB4N^(#ZZRP1a*~8j-WB@NU*?hABvsa)JC1jGkjvTh*&-9E+a;3najNta?tf@J Bay|e6 literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll new file mode 100644 index 0000000..80307e8 --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll @@ -0,0 +1,72 @@ +; ModuleID = 'air_project/mul_kernel_0_core_0_5.peanohack.ll' +source_filename = "LLVMDialectModule" +target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" +target triple = "aie2p" + +@buf9 = external local_unnamed_addr global [256 x float] +@buf10 = external local_unnamed_addr global [256 x float] +@buf11 = external local_unnamed_addr global [256 x float] + +; Function Attrs: nounwind +declare void @llvm.aie2p.acquire(i32, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.aie2p.release(i32, i32) #0 + +; Function Attrs: nounwind memory(inaccessiblemem: write) +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1 + +; Function Attrs: noreturn nounwind +define void @core_0_5() local_unnamed_addr #2 { + tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %17, %0 + tail call void @llvm.aie2p.acquire(i32 49, i32 -1) + tail call void @llvm.aie2p.acquire(i32 50, i32 -1) + tail call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %1, %2 + %3 = phi i32 [ 0, %1 ], [ %15, %2 ] + %4 = trunc nuw i32 %3 to i20 + %5 = getelementptr float, ptr @buf11, i20 %4 + %6 = load <16 x float>, ptr %5, align 64 + %7 = getelementptr float, ptr @buf10, i20 %4 + %8 = load <16 x float>, ptr %7, align 64 + %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x 
float> %6) + %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8) + %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> + %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> + %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60) + %14 = getelementptr float, ptr @buf9, i20 %4 + store <16 x float> %13, ptr %14, align 64 + %15 = add nuw nsw i32 %3, 16 + %16 = icmp ult i32 %3, 240 + br i1 %16, label %2, label %17, !llvm.loop !1 + +17: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 53, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: nofree nounwind memory(inaccessiblemem: read) +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind memory(inaccessiblemem: write) } +attributes #2 = { noreturn nounwind } +attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll new file mode 100644 index 0000000..5a9b5b8 --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x 
float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_5() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %17, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %16, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %17 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf11, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = getelementptr float, ptr @buf10, i32 %3 + %9 = load <16 x float>, ptr %8 + %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) + %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> 
+ %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> + %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) + %15 = getelementptr float, ptr @buf9, i32 %3 + store <16 x float> %14, ptr %15 + %16 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +17: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_design.bif b/examples/elementwise_arith/air_project/mul_kernel_0_design.bif new file mode 100644 index 0000000..86ba205 --- /dev/null +++ b/examples/elementwise_arith/air_project/mul_kernel_0_design.bif @@ -0,0 +1,10 @@ +all: +{ + id_code = 0x14ca8093 + extended_id_code = 0x01 + image + { + name=aie_image, id=0x1c000000 + { type=cdo file=air_project/mul_kernel_0_aie_cdo_elfs.bin file=air_project/mul_kernel_0_aie_cdo_init.bin file=air_project/mul_kernel_0_aie_cdo_enable.bin } + } +} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_mul_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/mul_kernel_0_mul_kernel_0_sequence.bin new file mode 100644 index 0000000000000000000000000000000000000000..f2eb383b7eb903e8ad8809b7d44fb0a4d660bfb6 GIT binary patch literal 3248 zcmcJQ-Aw~A5QNuneg4w5;eLtEZyo^(@j}krmd|WcZ&8OQJf_lyu+9e|Yukmso`>#0IXQ!}T>04{- zdWRz5+iJIiZ@khs7OO7qzIV<49O}M3Vd35Et*zr4+n!k}H6?s5ZHHP2b8iNd9f!uq z2%ELeWFG=0l9+H#Mw;SV&vYiNQGA=m?@TyH^<^fmv5ZNwd1Or5@6ws1SwdvOIT;Fy zFEe2cjS1(dzRbimmN6++jgb*5y%`xL$~=|+C!AAeiZ3%^4UGxssJ_g^HI^|MPc*|& 
zYb`xk4J-t?`*yLPCTa8!jZ|yI(P^-T#)NY+)Rg;`nXrb&gmYA1X5t#lnB*HUVGWH5 W=cv75Caj?`;T+YMnYhL+nEU|x-i@&U literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/npu.asm_air_output.mlir b/examples/elementwise_arith/air_project/npu.asm_air_output.mlir new file mode 100644 index 0000000..a66ce9e --- /dev/null +++ b/examples/elementwise_arith/air_project/npu.asm_air_output.mlir @@ -0,0 +1,300 @@ +#loop_annotation = #llvm.loop_annotation +module { + aie.device(npu2) @square_kernel_0 { + %shim_noc_tile_0_0 = aie.tile(0, 0) + %shim_noc_tile_1_0 = aie.tile(1, 0) + %mem_tile_0_1 = aie.tile(0, 1) + %mem_tile_1_1 = aie.tile(1, 1) + %tile_0_2 = aie.tile(0, 2) + %tile_0_3 = aie.tile(0, 3) + %tile_0_4 = aie.tile(0, 4) + %tile_0_5 = aie.tile(0, 5) + %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32} + %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32} + %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32} + %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32} + %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32} + %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32} + %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32} + %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32} + %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32} + %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32} + %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32} + %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32} + %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32} + %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32} + %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32} + %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32} + %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32} + %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32} + %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32} + %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32} + %buf9 = aie.buffer(%mem_tile_0_1) {sym_name = "buf9"} : memref<1024xi16, 1 : i32> + %buf8 = 
aie.buffer(%mem_tile_1_1) {sym_name = "buf8"} : memref<1024xi16, 1> + %buf7 = aie.buffer(%tile_0_5) {sym_name = "buf7"} : memref<256xi16, 2> + %buf6 = aie.buffer(%tile_0_5) {sym_name = "buf6"} : memref<256xi16, 2> + %buf5 = aie.buffer(%tile_0_4) {sym_name = "buf5"} : memref<256xi16, 2> + %buf4 = aie.buffer(%tile_0_4) {sym_name = "buf4"} : memref<256xi16, 2> + %buf3 = aie.buffer(%tile_0_3) {sym_name = "buf3"} : memref<256xi16, 2> + %buf2 = aie.buffer(%tile_0_3) {sym_name = "buf2"} : memref<256xi16, 2> + %buf1 = aie.buffer(%tile_0_2) {sym_name = "buf1"} : memref<256xi16, 2> + %buf0 = aie.buffer(%tile_0_2) {sym_name = "buf0"} : memref<256xi16, 2> + %mem_0_5 = aie.mem(%tile_0_5) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1) + aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_5_12, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1) + aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_5_11, Release, 1) + aie.next_bd ^bb4 + } + %core_0_5 = aie.core(%tile_0_5) { + %0 = ub.poison : i16 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1) + scf.for %arg0 = %c0 to %c256 step %c32 { + %subview = memref.subview %buf7[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_14 = memref.subview %buf6[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %2 = arith.muli %1, %1 : 
vector<32xi16> + vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } {loop_annotation = #loop_annotation} + aie.use_lock(%lock_0_5, Release, 1) + aie.use_lock(%lock_0_5_13, Release, 1) + cf.br ^bb1 + } + %mem_0_4 = aie.mem(%tile_0_4) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1) + aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_4_9, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1) + aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_4_8, Release, 1) + aie.next_bd ^bb4 + } + %core_0_4 = aie.core(%tile_0_4) { + %0 = ub.poison : i16 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1) + scf.for %arg0 = %c0 to %c256 step %c32 { + %subview = memref.subview %buf5[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_14 = memref.subview %buf4[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %2 = arith.muli %1, %1 : vector<32xi16> + vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } {loop_annotation = #loop_annotation} + aie.use_lock(%lock_0_4, Release, 1) + aie.use_lock(%lock_0_4_10, Release, 1) + cf.br ^bb1 + } + %mem_0_3 = aie.mem(%tile_0_3) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: 
^bb0, ^bb1 + aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1) + aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_3_6, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_3_5, Release, 1) + aie.next_bd ^bb4 + } + %core_0_3 = aie.core(%tile_0_3) { + %0 = ub.poison : i16 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1) + scf.for %arg0 = %c0 to %c256 step %c32 { + %subview = memref.subview %buf3[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_14 = memref.subview %buf2[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %2 = arith.muli %1, %1 : vector<32xi16> + vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } {loop_annotation = #loop_annotation} + aie.use_lock(%lock_0_3, Release, 1) + aie.use_lock(%lock_0_3_7, Release, 1) + cf.br ^bb1 + } + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) + aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_2_3, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb3 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) + 
aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_2_2, Release, 1) + aie.next_bd ^bb4 + } + %core_0_2 = aie.core(%tile_0_2) { + %0 = ub.poison : i16 + %c256 = arith.constant 256 : index + %c32 = arith.constant 32 : index + %c0 = arith.constant 0 : index + cf.br ^bb1 + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) + aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1) + scf.for %arg0 = %c0 to %c256 step %c32 { + %subview = memref.subview %buf1[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %subview_14 = memref.subview %buf0[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + %2 = arith.muli %1, %1 : vector<32xi16> + vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } {loop_annotation = #loop_annotation} + aie.use_lock(%lock_0_2, Release, 1) + aie.use_lock(%lock_0_2_4, Release, 1) + cf.br ^bb1 + } + aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0) + aie.flow(%mem_tile_1_1, DMA : 0, %shim_noc_tile_1_0, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0) + aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0) + aie.flow(%tile_0_2, DMA : 0, %mem_tile_1_1, DMA : 0) + aie.flow(%tile_0_3, DMA : 0, %mem_tile_1_1, DMA : 1) + aie.flow(%tile_0_4, DMA : 0, %mem_tile_1_1, DMA : 2) + aie.flow(%tile_0_5, DMA : 0, %mem_tile_1_1, DMA : 3) + %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {task_id = 0 : i32} + aie.use_lock(%lock_1_1, 
Release, 4) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) + aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {task_id = 0 : i32} + aie.use_lock(%lock_1_1_1, Release, 1) + aie.next_bd ^bb10 + } + %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) { + %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) + ^bb1: // 2 preds: ^bb0, ^bb1 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb1 + ^bb2: // pred: ^bb9 + aie.end + ^bb3: // pred: ^bb0 + %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) + ^bb4: // 2 preds: ^bb3, ^bb4 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb4 + ^bb5: // pred: ^bb3 + %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) + ^bb6: // 2 preds: ^bb5, ^bb6 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 
256) {task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb6 + ^bb7: // pred: ^bb5 + %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) + ^bb8: // 2 preds: ^bb7, ^bb8 + aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32} + aie.use_lock(%lock_0_1, Release, 1) + aie.next_bd ^bb8 + ^bb9: // pred: ^bb7 + %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) + ^bb10: // 2 preds: ^bb9, ^bb10 + aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) + aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32} + aie.use_lock(%lock_0_1_0, Release, 4) + aie.next_bd ^bb10 + } + aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0) + aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0) + aie.runtime_sequence @square_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { + %0 = aiex.dma_configure_task_for @air_channel_0 { + aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [, ]) + aie.end + } + aiex.dma_start_task(%0) + %1 = aiex.dma_configure_task_for @air_channel_3 { + aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [, ]) + aie.end + } {issue_token = true} + aiex.dma_start_task(%1) + aiex.dma_free_task(%0) + aiex.dma_await_task(%1) + } + } {dlti.dl_spec = #dlti.dl_spec} + aie.device(npu2) { + aie.runtime_sequence @square_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { + aiex.configure @square_kernel_0 { + aiex.run @square_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32) + } + } + } +} diff --git a/examples/elementwise_arith/air_project/placed.asm_air_output.mlir b/examples/elementwise_arith/air_project/placed.asm_air_output.mlir new file mode 100644 index 0000000..aa82d2e --- /dev/null +++ 
b/examples/elementwise_arith/air_project/placed.asm_air_output.mlir @@ -0,0 +1,86 @@ +module { + air.channel @channel_0 [] + air.channel @channel_1 [4, 1] + air.channel @channel_2 [4, 1] + air.channel @channel_3 [] + func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { + %c1 = arith.constant 1 : index + %0 = air.launch async (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> attributes {id = 1 : i32} { + %c1024 = arith.constant 1024 : index + %c1_0 = arith.constant 1 : index + %1 = arith.muli %arg8, %c1024 : index + %2 = air.channel.put async @channel_0[] (%arg14[%1] [%c1024] [%c1_0]) {id = 1 : i32} : (memref<*xi16>) + %3 = air.channel.get async @channel_3[] (%arg15[%1] [%c1024] [%c1_0]) {id = 2 : i32} : (memref<*xi16>) + %4 = air.segment @square_kernel_0 async attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 8 : i64, y_loc = 2 : i64, y_size = 6 : i64} { + %c4 = arith.constant 4 : index + %c768 = arith.constant 768 : index + %c3 = arith.constant 3 : index + %c512 = arith.constant 512 : index + %c2 = arith.constant 2 : index + %c256 = arith.constant 256 : index + %c0 = arith.constant 0 : index + %c1_1 = arith.constant 1 : index + %async_token, %results = air.execute -> (memref<1024xi16, 1 : i32>) { + %alloc = memref.alloc() : memref<1024xi16, 1 : i32> + air.execute_terminator %alloc : memref<1024xi16, 1 : i32> + } + %5 = air.channel.get async [%async_token] @channel_0[] (%results[] [] []) {id = 3 : i32} : (memref<1024xi16, 1 : i32>) + %async_token_2, %results_3 = air.execute -> (memref<1024xi16, 1>) { + %alloc = memref.alloc() : memref<1024xi16, 1> + air.execute_terminator %alloc : memref<1024xi16, 1> + } + %6 = air.channel.put async [%5] @channel_1[%c0, %c0] (%results[%c0] [%c256] [%c1_1]) {id = 4 : i32} : (memref<1024xi16, 1 : i32>) + %7 = 
air.channel.put async [%5] @channel_1[%c1_1, %c0] (%results[%c256] [%c256] [%c1_1]) {id = 5 : i32} : (memref<1024xi16, 1 : i32>) + %8 = air.channel.put async [%5] @channel_1[%c2, %c0] (%results[%c512] [%c256] [%c1_1]) {id = 6 : i32} : (memref<1024xi16, 1 : i32>) + %9 = air.channel.put async [%5] @channel_1[%c3, %c0] (%results[%c768] [%c256] [%c1_1]) {id = 7 : i32} : (memref<1024xi16, 1 : i32>) + %10 = air.channel.get async [%async_token_2] @channel_2[%c0, %c0] (%results_3[%c0] [%c256] [%c1_1]) {id = 8 : i32} : (memref<1024xi16, 1>) + %11 = air.channel.get async [%async_token_2] @channel_2[%c1_1, %c0] (%results_3[%c256] [%c256] [%c1_1]) {id = 9 : i32} : (memref<1024xi16, 1>) + %12 = air.channel.get async [%async_token_2] @channel_2[%c2, %c0] (%results_3[%c512] [%c256] [%c1_1]) {id = 10 : i32} : (memref<1024xi16, 1>) + %13 = air.channel.get async [%async_token_2] @channel_2[%c3, %c0] (%results_3[%c768] [%c256] [%c1_1]) {id = 11 : i32} : (memref<1024xi16, 1>) + %14 = air.herd @herd_0 async [%5, %async_token_2] tile (%arg16, %arg17) in (%arg18=%c1_1, %arg19=%c4) attributes {id = 3 : i32, x_loc = 0 : i64, y_loc = 2 : i64} { + %c32 = arith.constant 32 : index + %c256_5 = arith.constant 256 : index + %c0_6 = arith.constant 0 : index + %16 = ub.poison : i16 + %async_token_7, %results_8 = air.execute -> (memref<256xi16, 2>) { + %alloc = memref.alloc() : memref<256xi16, 2> + air.execute_terminator %alloc : memref<256xi16, 2> + } + %17 = air.channel.get async [%async_token_7] @channel_1[%arg17, %c0_6] (%results_8[] [] []) {id = 12 : i32} : (memref<256xi16, 2>) + %async_token_9, %results_10 = air.execute -> (memref<256xi16, 2>) { + %alloc = memref.alloc() : memref<256xi16, 2> + air.execute_terminator %alloc : memref<256xi16, 2> + } + %18 = air.wait_all async [%17, %async_token_9] + %19 = scf.for %arg20 = %c0_6 to %c256_5 step %c32 iter_args(%arg21 = %18) -> (!air.async.token) { + %subview = memref.subview %results_8[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, 
strided<[1], offset: ?>, 2> + %subview_13 = memref.subview %results_10[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> + %async_token_14, %results_15 = air.execute [%arg21] -> (vector<32xi16>) { + %23 = vector.transfer_read %subview[%c0_6], %16 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> + air.execute_terminator %23 : vector<32xi16> + } + %21 = arith.muli %results_15, %results_15 : vector<32xi16> + %async_token_16 = air.execute [%arg21] { + vector.transfer_write %21, %subview_13[%c0_6] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> + } + %22 = air.wait_all async [%async_token_14, %async_token_16] + scf.yield %22 : !air.async.token + } + %20 = air.channel.put async [%async_token_9] @channel_2[%arg17, %c0_6] (%results_10[] [] []) {id = 13 : i32} : (memref<256xi16, 2>) + %async_token_11 = air.execute [%17] { + memref.dealloc %results_8 : memref<256xi16, 2> + } + %async_token_12 = air.execute [%20] { + memref.dealloc %results_10 : memref<256xi16, 2> + } + } + %15 = air.channel.put async [%14] @channel_3[] (%results_3[] [] []) {id = 14 : i32} : (memref<1024xi16, 1>) + %async_token_4 = air.execute [%15] { + memref.dealloc %results_3 : memref<1024xi16, 1> + } + air.wait_all [%6, %7, %8, %9, %10, %11, %12, %13, %async_token_4] {air.segment_end} + } + } + return + } +} diff --git a/examples/elementwise_arith/air_project/square_kernel_0.pdi b/examples/elementwise_arith/air_project/square_kernel_0.pdi new file mode 100644 index 0000000000000000000000000000000000000000..1a6b4e2869f47c37579486ca1fbb299d49a2e6c4 GIT binary patch literal 6272 zcmeHLO=w(I6h3d}C7Eh$-^f^A+Z11tBB5mQ(uvSQz&FkmqDGt=g@TKwLM;WA*p1y8 zG1MX=0XNf%kc}HxgF+W|GtzF9BD3tqFs-iTrMT##I-YZX-u>RQSyWerUYMEh`|i2t zo_o%jzxyi@wdzyVOCP@T=FK0zy-ehj$J!x&$m={mWPG{yb4?@t{8Fpkj)I6{6b+Vj z|7Bb_{Nokb57(bQcjnZ&_fDTZ;~Y*nm`7jD96NY(gy<)J$SaPa8E>#SN>su<(C+`> 
z<9`hQ^4USN^&0O$bc18~^ZV|w%A1wEGEp?0olSVKH7REL!8|N*D)0*Ms;5sNzDK>>)quKvj{)5tUOVaC+BHS= z$=J1%bnBIbJy)(?_{Qr6b80QAF&nRskN(|x^^0n!{Y&-t#@OjgbDd}OoP7@WCKaMZ zZ?LcP(FX@x$!(6Z72V;)^ZxAd-SgwJzmA9b`uW|!`;GAJ&o9!Xnb%HbzFU&@j`R9! znlZA&?JY{tKCGrzP`Svw3>cb?tXf& zbN2q&Y{^f`<~)>G`%v<0MtJOoD*}a=uaAW^V`gt~M(z!DU)Vm7WzrY64=lHRV7cuB z%PFcC`f9GwS9wlbd0qjY@24Y=`-vXKe$w@|{Slh_T5juaxvjtD6g3O|)F|{*p3_#I zSAggH>BYzW^hoxTuCMKn(A3v*TYt-K{Vk_xxzJB13jLJlw3X)-;Q4-f{c%4%lKrIX zYx^TK^|jpA-*Q`j;ZlG5TMBzGxjWwp zIG@8&>TnD=0o(?>0(ceh8sH7Un}EB3dw~0Z2Y^$+0ZquSWc^-aY|q7d33S!6@_vlD zw>@JYLC*)E6X-GbinDh;9e_^ITQTR}*}EPIKqshG%x7`-F6RO0WX@Za&l597JpN-N zqt9zRl>*Jp^WX)4^g;HKvQKihLVi2t@0R@3_%l$3^iE)~jn8K;SoIamRatcLG#)Y@ zu}@^n+;H*mJP#d@#vXHt)#vYj!8#r{myd^Q_E7l|`-Hvbhl_{jdFXgFcJb(5(eb#s zd_0p68IRZ}YEOJ}@$ftk9goHybDbaE>pC7cr{j4-ts~mci4Xx#|E1Mjr7%}xE>2bM3v(09_nEm6=InjRh-dT7c~p^G&Wm6q!$vWm+J1+8 z7VWuS+P?JQdi~43Z0vW)XVISPrLHN6cBt3=WZmRnoyz22VVnPj(fwp|e3Y>aJJ(`v zYm*e_mYCZdFBax}=C(Fem|IB8#8+Z#5?5`T@52nPuzsAVCpu|eg-g|bB(&>P_ zQ`FiS??ARTX5IQB&=b}^~IDLV2zMfP2 z9@=*L66JEC1^{0*fBIpyWXZ-_unAX969sJk9zth9C3HD88uRe=Y pj((hv!{1NB{nRVPOgYBf|1Y+ZbL!7;a|h|TzC&_c-vJ26@egb?K~ literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_elfs.bin new file mode 100644 index 0000000000000000000000000000000000000000..29b57b909b220d0abba36bf749886d055504d85a GIT binary patch literal 2528 zcmeH_ze~eF6vw}pR+B=}cCehF5GzQB4z^Bisa6n0{IwKZ)Wt=l*2T@iq7LF52f@X? 
zI4E=~*jYL{6)XM$jt*kIYwpOy^oQP-Xxzuyda=lvgB0F1+5C6JrH%WKn zHThXE_pWm8n75{XdCIYXW%fSqo*hpXosMXzg-Y Q>I_#ircAp)nRY?4KU-3TT>t<8 literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_enable.bin new file mode 100644 index 0000000000000000000000000000000000000000..7cc1818bce4d6ce1226fc5fda519967a4842b99e GIT binary patch literal 104 zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>)) B1+xGE literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_init.bin new file mode 100644 index 0000000000000000000000000000000000000000..ace360fc11f90c98660ab2cdf87c15ccc1146121 GIT binary patch literal 4300 zcmb7`y>1gh6ov2pBp3*=K`u;UiAAPBC_;-Iq!f`D%Rx$EiJ&xeBnG9QU zs%kD6pNrU!l#}M1^n%zBpAerBpA%1sFNiORXT)>j1@V&D*1XPX+>%pO(>aYOC(XIl zTo4=L6XG-CbK)uS1@R^EjCfAGAYKyNn%6m1A3S*c+Sygb5o&%9 zDQb?rM*@%T4`=_N{G8ulX8q{?;KAe9&aNuf{Z)6+dnEAa{@y&WKj$~)`r$v_A3S*c z+Sygb5o+uFs&~wLB=A)IJt^l9*@>&$=eaJ|ALeHCEA;2i?-r6BZLDc@Ykr1)Nos%e zyBx)X*l+y&@q4zG%>LZZpZvLZexJ$yCVIi+Gbw6TxuNFRd!)s)g<3E0bXq)J)ZAiz zo~*^wN9|SM=>?vR$e#ytE_db`1A@ogjyUhie^1`*$9Y!%xcxlu_>oKMN&`5dAaW`EG)IYKQwL(}4!pcZC-+~P@53(pXD;ch=Rkqwcl z-$Qp*{x)y81O1E0-0M>5Yran8Q}p^pqsy+~q?_^YqFuLU!v*w^g>(ZQdvE+H>3h(z zr$+CPegGXiX!I@8Gn>R&pqo|VB5M5DZ%q4j`n=iSbrOgBeJi_aJocj*`txA^1JYCI zsvpuj(4Ph4_etM^{vyy1Nl)!SHUqu48mRmYbah^&A4AvoLytd#j$Z;zemRYwKv(BU z`giE6f6`Msr1^)c|9$AXf2|)ve;oApcBS1;N=hu4A4&tXle|xKey1zMebw5e(LVp>IKPJ5oUEP1u pdv-|c4{7}&t*`Z-9n<)6Io^K0T<@{sIXlnetpM3{;ac?}PM z2Ar8;ylOTZ%BWZLiViq!J6Ka&UKsh^ zAoZ%b^*G*)&Mz#i`SE(MRdKqVg%!W;2iptn_EzUJ+xJwf>bQE{tyVqTt?Kp9YhSpY zbIz@~RijmH)f}Vlx%Jv={gY%5lTU7Eggjmzze{#?V5&GVNhAXs@PmZUA>0~W)%)gO1p(v97oe=&l0?FT^HrY$YBk`Z86-6L? 
zms$xSO5}H`l@LPsLuw_25I&$*LI~me)QTby{yVi2LI^()BcJ?%vY$VNn`4ej>}|)8 zF?GCvMs%YS+b!z#C=L^Aq-|%ziu@nFh~Go=+vSFDw>>9b?z+9Uhe~s4Nw^53YkpwB>4u)ATZ?G*Rtv4HRQ_>x1Mr62mFxQqOv}c&(H{uNUk#|cdhi`pU?U7ZlK6LouSCU ixr|U}TR2O5GVcQOj{YO>)3oMR8au-UWzDQlrvCwN-uX%Z literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script new file mode 100644 index 0000000..13a60c2 --- /dev/null +++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script @@ -0,0 +1,66 @@ + +MEMORY +{ + program (RX) : ORIGIN = 0, LENGTH = 0x0020000 + data (!RX) : ORIGIN = 0x74400, LENGTH = 0xBC00 +} +ENTRY(__start) +SECTIONS +{ + . = 0x0; + .text : { + /* the __start symbol has to come at address zero. */ + *crt0.o(.text*) + _ctors_start = .; + _init_array_start = .; + KEEP(SORT(*.init_array)) + _ctors_end = .; + _init_array_end = .; + _dtors_start = .; + _dtors_end = .; + *(.text*) + } > program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +/* No tile with memory exists to the south. */ +. = 0x40000; +. += 0x10000; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +. = 0x60400; +buf3 = .; +. += 0x400; +. = 0x64000; +buf2 = .; +. += 0x400; +. = 0x70400; +buf1 = .; +. += 0x400; +. = 0x74000; +buf0 = .; +. 
+= 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_2); diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll new file mode 100644 index 0000000..d193819 --- /dev/null +++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll @@ -0,0 +1,84 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [1024 x float] +@buf9 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_2() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %13, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %12, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %13 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf1, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = call <16 x bfloat> 
@llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> + %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) + %11 = getelementptr float, ptr @buf0, i32 %3 + store <16 x float> %10, ptr %11, align 4 + %12 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +13: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.o new file mode 100644 index 0000000000000000000000000000000000000000..57437bb20b770876576b8753e9fb67450fce9b78 GIT binary patch literal 932 zcmZ`$O-mb56g|(cj1VLWO;czgx~if>M%)Neq9kphlj6oAbrC0VT1r#Xjw9G^5@?Vv zLf7*Lx_9LwtNw!0rOOEZ0Q(2D=f0V7hJY9DyZ5|v-hKDo*Qwgil~RaJ!AK#XKL(7G zT+@Q5$RN?m)+LZbiOHTS0!5tXa7M)Z@^-MRE~XcN@0r0aE?$*^N#p$Fm-?Nvinv~; zOl-N-%TC<{wV$)W_Mh3S&5ZNI3Lb=9ycY6h19+*1kAlvZClw6Y1lK8jKP$GptlZC< z?1#AL&7|<(%|fGpius5Siwb`>Ci(>P{2)Yk-2utnLXGMR#(o|MPBBfTfqXAKfA9J* z#Z2Je?OS3QUuff~G3iD~CjXxKbqZIayvPa6k>9!FrgieuTUBy4wX8P`l5A(3?yXK46J1a-N)BFhAep9P_a9VEDueA@3TQ1Ba-}9XY z, ptr %5, align 64 + %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) + %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> + %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60) + %10 = getelementptr float, ptr @buf0, i20 %4 + store <16 x float> %9, ptr %10, align 64 + %11 = add nuw nsw i32 %3, 16 + %12 = 
icmp ult i32 %3, 240 + br i1 %12, label %2, label %13, !llvm.loop !1 + +13: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: nofree nounwind memory(inaccessiblemem: read) +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind memory(inaccessiblemem: write) } +attributes #2 = { noreturn nounwind } +attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll new file mode 100644 index 0000000..055e011 --- /dev/null +++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll @@ -0,0 +1,84 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [1024 x float] +@buf9 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown 
intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_2() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %13, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %12, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %13 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf1, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> + %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) + %11 = getelementptr float, ptr @buf0, i32 %3 + store <16 x float> %10, ptr %11 + %12 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +13: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.elf new file mode 100755 index 
0000000000000000000000000000000000000000..a8d3607d6115f3e937af1d24e81c3abb59403ece GIT binary patch literal 1640 zcma)6J!~UI6#mvRu_K+p2O=^8q~VZIK*8CK4TrDVi+l(XM+yZf5<<&*cayAQufy&- z~cP;43G)R{SL_=}cU4aPS+aKGa;*(~+_r3RK=IzYv z+`7?zSJO1)D4;x64LaAK0%ph!C={@O5{i)Q1GXF^I;1fb6}Aulh}}Fk&9l=Yvx^#T z11&t9D~+SCy#$=UKcnzy5oa>x@`;9zH3cZ#tzh&(qk*%F;|=X}?h5c#>39RDt0pTb z-`&5Xom7luED|j0hnHsm8LoYIdAR=T<=?ND>~D?RIVNk1=fC;Y z@~88_@Cb82wT$^g@DQfJd=osAWPfD-T?PwM$?(T}&lGka{X8@J`pIzX@6uwSXwGEc z!!cR?)%3`c8LQ3vlS4rDhkK zBflSHUbV0t$Gg$BrKK%D-X3&nPQSOb;dlLDf2rHu>%D3Fp6b*cS8uxYx@Ws}z4>CUt+1PBpp4Kqkvz#%Cczp33)%w_@;YHfFr}V3|E8{^LCi^eD z_xM8Yx;YhJp;pr9dlnj2X``5KPX)8-TINK5C~-ouN&Y7y__qioe~)Z(E;$~Fe@Rws z0?|igC4#6>e@|8-2+?=QN(3Q#L{=gQ(I;fZCJ_A#S&1M-pNdiBJt3r?-^4H{`doA| z{X-@GXA+-i)M9Txh8$zh3ur_)d$HXiZ%1*MT4T1I9V_y`@FL!q_D5?i-|l)&yw-OI zT@SVP%8G6o884Hq_Km>R;e@eKa~wK02Z3XCye&U4;PgY!(ycl=gUzH{hGj*u9mcSt z-PE%7?C!v`T5BSm9aufv4=g|MV~7RIiTh!cj}pIShoOBl_oQJjaFGnkf{AvgN2Wsj zEXI&dm+md;8p-jRvTgqd8k3}@JDk3!@`mQ9WQN=zaqF4Z3&7`@EtJnA`3#Nm6U708 z@+Qk1a?+B+|2=unNeLDhxS9)cUcwdH^6uxbnD program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +. = 0x40400; +buf1 = .; +. += 0x400; +. = 0x44000; +buf0 = .; +. += 0x400; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +. = 0x60400; +buf5 = .; +. += 0x400; +. = 0x64000; +buf4 = .; +. += 0x400; +. = 0x70400; +buf3 = .; +. += 0x400; +. = 0x74000; +buf2 = .; +. 
+= 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_3); diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll new file mode 100644 index 0000000..9d2e115 --- /dev/null +++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll @@ -0,0 +1,84 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [1024 x float] +@buf9 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_3() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %13, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %12, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %13 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf3, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = call <16 x bfloat> 
@llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> + %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) + %11 = getelementptr float, ptr @buf2, i32 %3 + store <16 x float> %10, ptr %11, align 4 + %12 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +13: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.o new file mode 100644 index 0000000000000000000000000000000000000000..6b3d34570ca51b7ee784f20a1a0a4907b4e35d5c GIT binary patch literal 932 zcmZ`$O-mb56g|(cj1VLWO(V3Bc2z}(q;^vfiB{W6C#48}fQxYwr$tPSjw5I{3JubQ zbUlBddsi;9>MxKkT}JQ+*gsHv?wc8B2zcSXd(S)P-FM%8#|ze5r4(XQFj5FO>j8R6 zzS4ph$RN?m)-{kriOISu0!3Wpa8AVh`mVjEt_Gh2-!ttsTz#kj{l>-NFZDZDF5-5A zGO>wLD?9KnsQw%dK3xvq%x0V)<=|1s#cLs7F@X1~Ga4N4J)K8~O>iC1_p@Td%gX(% z$$p4i-cSnvnJv`LPVp?_!=l2U_lQ2hJl_t{eRn`|w@}^sg0YbYf>R7qX&~PU&p)_6 zPBDG>duJ2N_(JQ&DwC54$>hHuP zSlT!Gc%M#@V_n5R*Jk?ep$~JX>AU8;kIR$$zEj_ZWvx`_x15Gs_p8m#gN6%p-}ii{ z#(4BqV(sAb6im-;IJ$yaZ+fm>vZsW23DcUJE7@@@6Wi9p&So-&==KN07@4Sbt$D(J zcT;LUL7PPJ==rGV-DHt@h`phA>tw#M#kv`8$e1b-(V=FVxzv|_gv-)p#Hr)hNP`qt TW*(24sn(q^ioBmL_2lmVgu7{C literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll new file mode 100644 index 0000000..f2c89be --- /dev/null +++ 
b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll @@ -0,0 +1,65 @@ +; ModuleID = 'air_project/square_kernel_0_core_0_3.peanohack.ll' +source_filename = "LLVMDialectModule" +target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" +target triple = "aie2p" + +@buf2 = external local_unnamed_addr global [256 x float] +@buf3 = external local_unnamed_addr global [256 x float] + +; Function Attrs: nounwind +declare void @llvm.aie2p.acquire(i32, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.aie2p.release(i32, i32) #0 + +; Function Attrs: nounwind memory(inaccessiblemem: write) +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1 + +; Function Attrs: noreturn nounwind +define void @core_0_3() local_unnamed_addr #2 { + tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %13, %0 + tail call void @llvm.aie2p.acquire(i32 49, i32 -1) + tail call void @llvm.aie2p.acquire(i32 50, i32 -1) + br label %2 + +2: ; preds = %1, %2 + %3 = phi i32 [ 0, %1 ], [ %11, %2 ] + %4 = trunc nuw i32 %3 to i20 + %5 = getelementptr float, ptr @buf3, i20 %4 + %6 = load <16 x float>, ptr %5, align 64 + %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) + %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> + %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60) + %10 = getelementptr float, ptr @buf2, i20 %4 + store <16 x float> %9, ptr %10, align 64 + %11 = add nuw nsw i32 %3, 16 + %12 = icmp ult i32 %3, 240 + br i1 %12, label %2, label %13, !llvm.loop !1 + +13: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: nofree nounwind memory(inaccessiblemem: read) +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) 
#3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind memory(inaccessiblemem: write) } +attributes #2 = { noreturn nounwind } +attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll new file mode 100644 index 0000000..ed78c15 --- /dev/null +++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll @@ -0,0 +1,84 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [1024 x float] +@buf9 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; 
Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_3() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %13, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %12, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %13 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf3, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> + %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) + %11 = getelementptr float, ptr @buf2, i32 %3 + store <16 x float> %10, ptr %11 + %12 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +13: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.elf new file mode 100755 index 0000000000000000000000000000000000000000..b06bbc2edc2ab1b34e0e4e5e00fa2e23d127e11e GIT binary patch literal 1640 zcma)6O>7%Q6#mvRu_IOBf{2U?(oiH6P;l0#(~{ zQ!Wi63LFp@F6MyLOC^pLhaPf@)Lub7=7Q8jATB+$w;mATd;4QsocN@f?|tvRnRz=i zJ9lohKF~A`8FDC&RGrSX7l0{p9SS+jp@2Ljd!H@CkPc}~MTza>KO-lLP4e`lNbS6a zyFe3A5AGdoyVz6xp=JMrltUe`y~t?X*6(pezdNg%v=V(E*!1nWW`_w 
z#ru2rwBu4W9*Y?B`oV?ie+H}HUmUFccJYtv1?$`D;MF9S_2pTk2z;nLer<5)i#M9a zDZi$v&R8(l=Zu};f~=p^LwyjuIH4b1FSoxt#G7e6nN;G#oQxl9jz3BitqkL}#q;0% za`E$7U~q^Tpj^c40eA=#V7>vKNxVNY{|oh!|@@Y{NsZ{8FO-=#6t~ueh#!ef7=bNsaC70s*Q?X(E+<-`I~Cj z4MMNyr(QL;6-7JYwS|REFWTz2D|WBDu{-qTc|1FRg=%f&(Qt|O-3k2`?b2wFhVlN( z?mat~yKYRxm#LLB`d)^H723!r+f%}HvX(K{pGcfgY?A+(2>vYs$=@X#oJ)>J;$M*! zn?Up-S&1M@)IX4w2txEdvJydv9+H&^Li90Nu?a-~N>(BW(I;Z$c~1zb=XWuTu|5-> zPySGe{}so_8kNZ1iy+6?b$uG)?QUeX$y;F*B-V&6d)o}XFWr#$rS<7*)3Z9R9j*49 ze#b?nwY;pG)sz>>X6uIU=&*yRTCr_9Hu}D8w%tw7ufpyHuBn@KwEG+HOIL?!hLIIS zFvFe1GIy;`-!+@7BJFLMUCZ-L&-WsT1=EgtL70sazi9=5bvyGUVb^yM4@!fvb|yzA zLj0_wCf%ZROVVA5kJpfG`#;c_BrVzD program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +. = 0x40400; +buf3 = .; +. += 0x400; +. = 0x44000; +buf2 = .; +. += 0x400; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +. = 0x60400; +buf7 = .; +. += 0x400; +. = 0x64000; +buf6 = .; +. += 0x400; +. = 0x70400; +buf5 = .; +. += 0x400; +. = 0x74000; +buf4 = .; +. 
+= 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_4); diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll new file mode 100644 index 0000000..cfa104c --- /dev/null +++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll @@ -0,0 +1,84 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [1024 x float] +@buf9 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_4() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %13, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %12, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %13 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf5, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = call <16 x bfloat> 
@llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> + %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) + %11 = getelementptr float, ptr @buf4, i32 %3 + store <16 x float> %10, ptr %11, align 4 + %12 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +13: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.o new file mode 100644 index 0000000000000000000000000000000000000000..6afdc2e5495148f08a97bed4dbec3859aee95823 GIT binary patch literal 932 zcmZ`$O-mb56g|(cj1WpJ7*nv2c2z}(MC?KkiC?YIiF8vHx)>*MTEx_};|S_Tp+Q`T z>-huSyK<3Le?jTeWrY3!{R7%_-^@5ezzg@?d)_(kzWeSwUbfyTr4XBfkwU!IOCshs5A6+gJ@^{atR$a!F532&x#E%EBCV| z`yp<5&rgPD z#q{Cs!6ugRh1QF8CMOY+$$w>jo5HJ6UgQLxlb?CypH7iuUBy4wX8P`d4|BKayXO1VPm_DTQ{RDQtyPz{orYWYtIe(bh6{7g_k5?u zc=%0XZU4(#S*BrnZo|=K%zD#v?UKzsYhj^ePerlJXj{v>Tgeon+n)?$WTMu!W|aMY zPpSC?Z4$+!=cA%`lSSqswnFdL$$Vprb&K4PF;ya>L(MF6sW1Hqm!->yQ^&EH1}U!0 TJRUVutvg{9c|Tq1$=&|}aJy+@ literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll new file mode 100644 index 0000000..a653490 --- /dev/null +++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll @@ -0,0 +1,65 @@ +; ModuleID = 'air_project/square_kernel_0_core_0_4.peanohack.ll' +source_filename = 
"LLVMDialectModule" +target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" +target triple = "aie2p" + +@buf4 = external local_unnamed_addr global [256 x float] +@buf5 = external local_unnamed_addr global [256 x float] + +; Function Attrs: nounwind +declare void @llvm.aie2p.acquire(i32, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.aie2p.release(i32, i32) #0 + +; Function Attrs: nounwind memory(inaccessiblemem: write) +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1 + +; Function Attrs: noreturn nounwind +define void @core_0_4() local_unnamed_addr #2 { + tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %13, %0 + tail call void @llvm.aie2p.acquire(i32 49, i32 -1) + tail call void @llvm.aie2p.acquire(i32 50, i32 -1) + br label %2 + +2: ; preds = %1, %2 + %3 = phi i32 [ 0, %1 ], [ %11, %2 ] + %4 = trunc nuw i32 %3 to i20 + %5 = getelementptr float, ptr @buf5, i20 %4 + %6 = load <16 x float>, ptr %5, align 64 + %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) + %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> + %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60) + %10 = getelementptr float, ptr @buf4, i20 %4 + store <16 x float> %9, ptr %10, align 64 + %11 = add nuw nsw i32 %3, 16 + %12 = icmp ult i32 %3, 240 + br i1 %12, label %2, label %13, !llvm.loop !1 + +13: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: nofree nounwind memory(inaccessiblemem: read) +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) +declare <16 x float> 
@llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind memory(inaccessiblemem: write) } +attributes #2 = { noreturn nounwind } +attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll new file mode 100644 index 0000000..520a891 --- /dev/null +++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll @@ -0,0 +1,84 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [1024 x float] +@buf9 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_4() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, 
i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %13, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %12, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %13 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf5, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> + %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) + %11 = getelementptr float, ptr @buf4, i32 %3 + store <16 x float> %10, ptr %11 + %12 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +13: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.elf new file mode 100755 index 0000000000000000000000000000000000000000..9b231c6aa9ecc51f66997deeb1a18cef3b26ae8b GIT binary patch literal 1600 zcma)6L2MgE6n*QM*pVu5sfdgKX<8%{P;l0@X>c!WBNrj+NaX;EgwS@qyGd5D*I~U5 zDVGKj1rCS{7jr=Br4mPrLk~GcYOkOkb3tk%xVN_+5aIpZ9oy=OC(XY9{{J)ccmD3& zx!(HFFbrtqP#mi|oeM7jGvqoHa#%nCc}VsihsKBw8B9fqNM3%gKWZ-i{0Sfm^7(Fs*;NwZ zHcfTPg1J9e>_q2e|Fja%aSF_jfayap#1ZLLKzFvP~ss^2( zkT!MwkVgM@H+I_O&3+uE)|ef4%kBqX`+a^7teGs~K+usPnD%@V=+ot^<+Jp6Fd0c^Q_hToDVfVLF%ieK11K)10 zigdSNcby=#gD{987Hl`}MSVR>{I(NC&TZ{U!+z)?nUn<+?M=^2h4@)VQc*o=nk9$z 
z|6njphQ90eXv^KGQOXSYGQ@3W+ROo8WwwxBNAejO<5!AX49ZZ9 k8CceWtV_5=Tkdm>514nV1zG34li7P&>>@XmJ#}33e}uC6SpWb4 literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script new file mode 100644 index 0000000..818260c --- /dev/null +++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script @@ -0,0 +1,66 @@ + +MEMORY +{ + program (RX) : ORIGIN = 0, LENGTH = 0x0020000 + data (!RX) : ORIGIN = 0x74400, LENGTH = 0xBC00 +} +ENTRY(__start) +SECTIONS +{ + . = 0x0; + .text : { + /* the __start symbol has to come at address zero. */ + *crt0.o(.text*) + _ctors_start = .; + _init_array_start = .; + KEEP(SORT(*.init_array)) + _ctors_end = .; + _init_array_end = .; + _dtors_start = .; + _dtors_end = .; + *(.text*) + } > program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +. = 0x40400; +buf5 = .; +. += 0x400; +. = 0x44000; +buf4 = .; +. += 0x400; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +/* No tile with memory exists to the north. */ +. = 0x60000; +. += 0x10000; +. = 0x70400; +buf7 = .; +. += 0x400; +. = 0x74000; +buf6 = .; +. 
+= 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_5); diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll new file mode 100644 index 0000000..3e15d3e --- /dev/null +++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll @@ -0,0 +1,84 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [1024 x float] +@buf9 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_5() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %13, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %12, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %13 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf7, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = call <16 x bfloat> 
@llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> + %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) + %11 = getelementptr float, ptr @buf6, i32 %3 + store <16 x float> %10, ptr %11, align 4 + %12 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +13: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.o new file mode 100644 index 0000000000000000000000000000000000000000..cb309eb89ecceee56448345389d9d7a76543e35e GIT binary patch literal 932 zcmZ`$O-mb56g|(cj1Wo`OjBqfx>C_0DQ!h4C0cC-C(=z@!NoX9rlmABI*wqwQD~4Z z#P$3E_pV%I)n8Ctx{S~tVE;h%+&44M5b(l%_nvpoyYIgH4wkJqN-4yqV5AUm+z0fN ze5M7H$RN?m))kOLiOGg40!5tXa7M)Z>ZY@|nSpmDx`s($CnMO@EN zCN@)QXNUd=)svCn-Jg-a3mNBEIT#DMcrD~B2C$;KkAj1pCrjwE39du>epYOHS-GDz z*$;8sdyv8x3x(S85ym4vEGqn2pXd|J^PLdgbq6GO3)QPH7@K(@IKnWM2J-Fj{JrbL z6f=N-H#V`1FSLGqWO5iGnfzzw*D1UfGc1Cm#X4>cIl1I43dTXEQp5&oC_? 
zOZ!2e9?&UrtgHCv+DzZw_hD|eeAj%t_I`HPcj{kZSs$uPU!0~}_p7bOUekrS>wCUa zWBmS2Vr}o!oGdS3dT!IvWz2fZbM2D-9HzCnShAl+vCL>&%Ug|P3eoNNhAA>p>ss@e z{cfk!e1bNK;?eU_(Ywha^AKC1_v&Q6vBkRA+>kL1m> VT$y=1YNlFu$SCqby3~`qZvl4{X=eZc literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll new file mode 100644 index 0000000..bccc4ff --- /dev/null +++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll @@ -0,0 +1,65 @@ +; ModuleID = 'air_project/square_kernel_0_core_0_5.peanohack.ll' +source_filename = "LLVMDialectModule" +target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" +target triple = "aie2p" + +@buf6 = external local_unnamed_addr global [256 x float] +@buf7 = external local_unnamed_addr global [256 x float] + +; Function Attrs: nounwind +declare void @llvm.aie2p.acquire(i32, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.aie2p.release(i32, i32) #0 + +; Function Attrs: nounwind memory(inaccessiblemem: write) +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1 + +; Function Attrs: noreturn nounwind +define void @core_0_5() local_unnamed_addr #2 { + tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %13, %0 + tail call void @llvm.aie2p.acquire(i32 49, i32 -1) + tail call void @llvm.aie2p.acquire(i32 50, i32 -1) + br label %2 + +2: ; preds = %1, %2 + %3 = phi i32 [ 0, %1 ], [ %11, %2 ] + %4 = trunc nuw i32 %3 to i20 + %5 = getelementptr float, ptr @buf7, i20 %4 + %6 = load <16 x float>, ptr %5, align 64 + %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) + %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> + %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60) + %10 = getelementptr 
float, ptr @buf6, i20 %4 + store <16 x float> %9, ptr %10, align 64 + %11 = add nuw nsw i32 %3, 16 + %12 = icmp ult i32 %3, 240 + br i1 %12, label %2, label %13, !llvm.loop !1 + +13: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: nofree nounwind memory(inaccessiblemem: read) +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 + +attributes #0 = { nounwind } +attributes #1 = { nounwind memory(inaccessiblemem: write) } +attributes #2 = { noreturn nounwind } +attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } +attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll new file mode 100644 index 0000000..d8f77fa --- /dev/null +++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll @@ -0,0 +1,84 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [1024 x float] +@buf9 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare 
void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_5() { + call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) + call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) + br label %1 + +1: ; preds = %13, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %12, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %13 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf7, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) + %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> + %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) + %11 = getelementptr float, ptr @buf6, i32 %3 + store <16 x float> %10, ptr %11 + %12 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +13: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) + +; Unknown intrinsic +declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_design.bif 
b/examples/elementwise_arith/air_project/square_kernel_0_design.bif new file mode 100644 index 0000000..6e94022 --- /dev/null +++ b/examples/elementwise_arith/air_project/square_kernel_0_design.bif @@ -0,0 +1,10 @@ +all: +{ + id_code = 0x14ca8093 + extended_id_code = 0x01 + image + { + name=aie_image, id=0x1c000000 + { type=cdo file=air_project/square_kernel_0_aie_cdo_elfs.bin file=air_project/square_kernel_0_aie_cdo_init.bin file=air_project/square_kernel_0_aie_cdo_enable.bin } + } +} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin new file mode 100644 index 0000000000000000000000000000000000000000..97e175b81722f66b4d8fb9099cc8f7fbe11452c6 GIT binary patch literal 2288 zcmcJP!41MN3`Lz%A#QNszyXOB7@?zu%{)TIB#gjM>*D?kas#o1VyE%<&jTu{-yf35 zAR@1W2+}#mB=?e?toB8bc2;%|B-1(DSe73B-{SoA=NBCORAcIXoI82@=$VP#V&<7< z??-IhM6Sy|)_!>=l8dK|&wPETrL^|wYz7?iP^8=03@%Wz5F74Mv_~Hx|ia@+u5*2@O@J?m)Y3dW^8hm$JK8JDQm>oa8L9*VK%1G UjE(+*4Qs^Ma8Df@)A$FQHy}r3Z2$lO literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/sub_kernel_0.pdi b/examples/elementwise_arith/air_project/sub_kernel_0.pdi new file mode 100644 index 0000000000000000000000000000000000000000..cad10284470b3236f231f480e9d2396b90ba8f55 GIT binary patch literal 7792 zcmeHM&ubiY6n`^2Nw&IeH#L^ANpac)2_=V3H$n=5&UT}$5^-xP1UxhqYAq;)UfN@n zfJHz{nynrmyngkaH-Gy6+Z#jydEmzI;`b@Mczw3~k)-B1waJ z-+x)xPW*6_&y)KvT)uez^7|JqU33md9L%CG#!nq>3=qAH7r)E3ZPq!E{Y1VzL)!fx z-1&3+uK*6}v#;X>q8ehjdaJcPh{JrZKopIpxv^Z{JjQ1F1v$*gl#`1>9`tku?7Ppq zV!}R7fk)qoeL}5xv(-tRoeo!zq~oAXDs!Xl_{q$3OPGpC!fsMV=-U4|8`zN({C%=d9bpb z2P;!l&&^jYH(yzqHnVb3$l3Wi^JKmri}}+1w(}C2ep}hjgO%+(Sec^v+0(S)75;!qf=U2^KGki~w&2LiV=WvubToE`HxGC_W zz)J!z3%n}uy1*@g+X8n4-V!)5Sm#&I<)>xWznaL;;V5yqB5*8lQ{Y8`mjqrGcvawa zfm;H%1?~vEC2(S}&Tl@KpO#&IXGMMvM~TA~fn$N20xt@@B=EAps{*eJ+!DAga7W-R zffIqtG~)AjC3UR<+q}T@-L~ajjx-Voo}sd7V&*-AV{t811Rn{$EO-)p6SL*~WUm3s 
zM`9Ms!kz@*L=UZfEP7iJd?fg?;7RaJ)XMqE_SEu`tf#Oi!w>T3m|6DwVxL#kOb0$a zGJ>_jH^S3Fzuaf-BV`{uLxX-h9~{`tKGyzKpX8=Lz6uU=&xi1t&5LKfY_QEMfXrB3 z2RU5{GH1m){#@t9wmb8y?J+;LkJ(*j_0CWEe@K4NW9j@f_8@ca0q%zw>-@61lD#)p z`8nOr{3iF9AKS+`HM@G}r~E%8Kj^V^ej2;{^tGh(%j!z@yQRv{>2~IKY>)Y|eXL@V ze|qPq{68c==&^Ku8hem+|MX{$&M&Jg*=IB=Kc{o~;bDVXM*{UhM^-kpd5a&^Rt9lk zA3vPHiU{_QiH`7}9jmq-e<1CtVC=n*)vo-8ynSr_`zP|IO3vFy)_>t#0NM72gl?Dm`+Kavwtw{cYx_s9zqWt$`s0I~EB3GTSO0{~{3~p0 zjQabc{uv$Wubxr!KGZ*>L;ZbG|BMdx_n9!)R7N+G6sV7Andf4ti|5e>7s0Nrm?kGYt@q-Zwf_LXSeh9qt@AyhG>WjXoGdvC6 z`58_58StX-DL(~X^gZR9;GZ`2ndmw{3*hksjjCHi@DoW@#8XZEjDdG`c0A@m#N+rT zcz3SjBk<|`cO5^G)PLlLgPU!aEGtY;YTd>QC( z#Koe8`bAN-58uo2A`t(u53av<7vf=Kr535GwXmOu>Q5EFH0M=+eIME% zHXpNnry@TdPbXV0|C632+|1+^}`q6#!Vh)fSzv*3v zz^XbWH(*{#`4K>~H2LiYg&ox+{&oJuX$=15THkKsPq$vlaig4hADuDzQvKg%$yXQ2 zP#}MvdlIWAdLe+DVd9N-L>(P>gfr_1XVwwU5U$&?Jk$%i<&lHR#iU1?AtTRcEb(t` z>fglpr#;{fNi%cKtQ(wJH#kFB&G4_1;a{4en`S;^iGQ0@|0c#iJ#+4mG&ASSy1|)s qgENHhGW>g+;a{4en`S;^iGRCO|0c#iJ#+4mG&ASSy1|)sL$P~=DwMGR literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_enable.bin new file mode 100644 index 0000000000000000000000000000000000000000..7cc1818bce4d6ce1226fc5fda519967a4842b99e GIT binary patch literal 104 zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>)) B1+xGE literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_init.bin new file mode 100644 index 0000000000000000000000000000000000000000..d4549ba181167c6e6d7475f23335e34fea2c3376 GIT binary patch literal 6032 zcmb7{&rak<5XQ@%hINw3u1Uy>EJ*QcI7AT#M?)kINHH2BaX`vMPg)mN>ymUudP@3&^d;#l($}PKNY6;mN#BybCv6+u_35mqPjyW9=@;sg z)`iu&Bwdl7lD;5)N&1TPHR&7DGtzU?x1{e$Thh7gr0z}C-Y@95Kj1=K_e*vRasT2A z`KGw%NtC~$_drR!Af6K!;#1rYonOBXO5z3Gubj9LpW;_N?bzRq9a$@;UpUZ4Mwdf~w*)RN!T 
zTGw0O+2D~L&k!+PZVspqp^>qxyLCZUYYv{=)g)M&yk)HtOi1`aSl`pz$Z~{VHC> z{wm~8zhkS{>@Pz8!5=I0ho0@PV=wrGo|TPu3xmfX=5+8#k7tP3R^-WgJe!Ecz0Z3* zJBWQ9<=Kinq2IpGzOohbc{t*JyYq{${(e{F$2@C)+CR@F|Df@&=r{W@PpQw>ig|vl z-i6*{KG=BId(6k{cs_@S#l1i1@f;%-&#>z8oFW$Y{-no)v!Lz|&O*DNXg+CuAD%p) z+W++V)c&W>r}jU6KIt>&-QrcKx@>wrfeNOYm3=@d_NZ zw|+x>4;)3bJR^PpjtW{nB))G4Db?$G()v+vO7%KEZ|nOoNU7cwf84d}i)35nL%m9) zUZqj5n{~S$^(u~f-3(ifdKE{#Db=fZZU-ih)>n&xd&hVAz^Pux=TW^Tu6I|q)q2#U zHTbJ2{~>V${xaeR#53^EBfd|32(IcOZtO_bqxD;h5z6W6t-#g!Bz^*}&xc-r46f!) z>of4z(euuUpMtA-6E}8D`NwMi``~*2njeCH5^6!G{ z`D%{yOZ(S+3a%8O|3dRUJE8m&wZAQJonLdDk5s*%7Mkzakr_t)DHkL4{@Vhl z{wcl*uJh~l1-L$+%*XWnG36iY_v1oy)KAZ!`1S4I2o|Ro1qz~3EajN7i78>jEaZQL C^4EL- literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.elf new file mode 100755 index 0000000000000000000000000000000000000000..fbf768003c7ae9be77deae967dc2a77b06391f68 GIT binary patch literal 1656 zcma)6PiPcp6o0eX?xv;HHDX!N!iR!HiJO`1+GM?S5;hX*2GWBP>R~$nk__3MU1ny} zSPw2mYC-9(ussyK2;wOe6v-)&crBs#LK0fMct~#^O4Hw)Kf9xN@CV<#_j~Vqe`enI z&7Il$RY{U4L57C=tODaV9}o>eR!C5WMk!BO;@w@W1Y0oh!d5C^ef4)>CBC%GrAe%3 zC3;L$qrDIFeed}*L^CJrw0HEZKpVRfL6xn zqJ!bh^+%FgP!JiRI6U)dq>UA_pXC1Q&Hr?wxA6Okzozs0k4oW7gkLqtp<+vXD%}E~-6{jBJz3vi50u-kxeoO3E2{Q`67cy_Q;m&Z4-mlxoGws9Q zhBofK?k)eDAIoIbq4@m}6X!2RBjlu1tnLhkLd9RU^FLS z{b7oqgI?$dNj!%>IG)e`Ldrh_F0f?SMsQ%RL8ol=c?y(^YN-Q$o&DVb|C%`5Y!9)#jO+tE4q`$JHFy35Ye)%%J7@Nfg^P-ShNYi6wH| zu0>%`> Lf?PAnlgR%8Kh^|F literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script new file mode 100644 index 0000000..fc4f0cf --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script @@ -0,0 +1,72 @@ + +MEMORY +{ + program (RX) : ORIGIN = 0, LENGTH = 0x0020000 + data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 +} +ENTRY(__start) +SECTIONS +{ + 
. = 0x0; + .text : { + /* the __start symbol has to come at address zero. */ + *crt0.o(.text*) + _ctors_start = .; + _init_array_start = .; + KEEP(SORT(*.init_array)) + _ctors_end = .; + _init_array_end = .; + _dtors_start = .; + _dtors_end = .; + *(.text*) + } > program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +/* No tile with memory exists to the south. */ +. = 0x40000; +. += 0x10000; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +. = 0x60400; +buf5 = .; +. += 0x400; +. = 0x64000; +buf4 = .; +. += 0x400; +. = 0x68000; +buf3 = .; +. += 0x400; +. = 0x70400; +buf2 = .; +. += 0x400; +. = 0x74000; +buf1 = .; +. += 0x400; +. = 0x78000; +buf0 = .; +. += 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_2); diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll new file mode 100644 index 0000000..906e39c --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare 
void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_2() { + br label %1 + +1: ; preds = %22, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %21, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %22 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf2, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = getelementptr float, ptr @buf1, i32 %3 + %9 = load <16 x float>, ptr %8, align 4 + %10 = bitcast <16 x float> %7 to <8 x i64> + %11 = bitcast <16 x float> %9 to <8 x i64> + %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> + %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> + %14 = bitcast <32 x i64> %12 to <64 x float> + %15 = bitcast <32 x i64> %13 to <64 x float> + %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) + %17 = bitcast <64 x float> %16 to <32 x i64> + %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> + %19 = bitcast <8 x i64> %18 to <16 x float> + %20 = getelementptr float, ptr @buf0, i32 %3 + store <16 x float> %19, ptr %20, align 4 + %21 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +22: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br 
label %1 +} + +; Unknown intrinsic +declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.o new file mode 100644 index 0000000000000000000000000000000000000000..c01f0bbb18a2b8aef10a542b3c15d504b366c5fe GIT binary patch literal 984 zcmaJnfSQGt;;NrR#&%IRn#NWmOhg2w)J2#i(<+@dWjcZCMv*RD zh`14E>8AUxTx8W>ApV33{($xm7|(t4l1#Jk!kv50J9qBA@6NnmcOEIF&^Lu#f&r%~ zV2t9jPMAR!!(2J~4dhWGc%`O+v)+$9PRNK=s!K((KW4id>UFgYjOV(C$EuvS&lh+u zR?Jmy#AqR(zMABo69>I(0* z@}2iOGR2E4Ft;=ZgK%?zfHW?8b#hr}Y-4y%@WC+o57G0ozZ=lJC0X9eY z_6Qd`%RP!o{$f@`++vu|#W-Y`lb?Y0jy{*x5L4`__+#bUiwB|G z+=1h~tgXFqeXkkT+O3Z7!8!BAoXP)(I)7!_Wlfn*iVv#W)ri-z$EW0nWT@?rJl_A4}x%RasU7T literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll new file mode 100644 index 0000000..1f9925e --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll @@ -0,0 +1,64 @@ +; ModuleID = 'air_project/sub_kernel_0_core_0_2.peanohack.ll' +source_filename = "LLVMDialectModule" +target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" +target triple = "aie2p" + +@buf0 = external local_unnamed_addr global [256 x float] +@buf1 = external local_unnamed_addr global [256 x float] +@buf2 = external local_unnamed_addr global [256 x float] + +; Function Attrs: nounwind +declare void @llvm.aie2p.acquire(i32, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.aie2p.release(i32, i32) #0 + +; Function Attrs: noreturn nounwind +define void @core_0_2() local_unnamed_addr #1 { + br label %1 + +1: ; preds = %19, %0 + tail call void @llvm.aie2p.acquire(i32 49, 
i32 -1) + tail call void @llvm.aie2p.acquire(i32 50, i32 -1) + tail call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %1, %2 + %3 = phi i32 [ 0, %1 ], [ %17, %2 ] + %4 = trunc nuw i32 %3 to i20 + %5 = getelementptr float, ptr @buf2, i20 %4 + %6 = load <8 x i64>, ptr %5, align 64 + %7 = getelementptr float, ptr @buf1, i20 %4 + %8 = load <8 x i64>, ptr %7, align 64 + %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> + %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> + %11 = bitcast <32 x i64> %9 to <64 x float> + %12 = bitcast <32 x i64> %10 to <64 x float> + %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60) + %14 = bitcast <64 x float> %13 to <32 x i64> + %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> + %16 = getelementptr float, ptr @buf0, i20 %4 + store <8 x i64> %15, ptr %16, align 64 + %17 = add nuw nsw i32 %3, 16 + %18 = icmp ult i32 %3, 240 + br i1 %18, label %2, label %19, !llvm.loop !1 + +19: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 53, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) +declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2 + +attributes #0 = { nounwind } +attributes #1 = { noreturn nounwind } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll new file mode 100644 index 0000000..d91a003 --- /dev/null +++ 
b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_2() { + br label %1 + +1: ; preds = %22, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %21, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %22 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf2, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = getelementptr float, ptr @buf1, i32 %3 + %9 = load <16 x float>, ptr %8 + %10 = bitcast <16 x float> %7 to <8 x i64> + %11 = bitcast <16 x float> %9 to <8 x i64> + %12 = 
shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> + %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> + %14 = bitcast <32 x i64> %12 to <64 x float> + %15 = bitcast <32 x i64> %13 to <64 x float> + %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) + %17 = bitcast <64 x float> %16 to <32 x i64> + %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> + %19 = bitcast <8 x i64> %18 to <16 x float> + %20 = getelementptr float, ptr @buf0, i32 %3 + store <16 x float> %19, ptr %20 + %21 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +22: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.elf new file mode 100755 index 0000000000000000000000000000000000000000..e4e226b260243a140d2d00b735618366285dcca0 GIT binary patch literal 1720 zcma)7O^6$17=9-klU4jlS1bV)J}Ss6Yj$Rnb~n8^T~aBUh4rA6_ApFlW;eqolQ5aA zTRj-0sGxWgq9E=?5KoJs$etEjuTs1V?V@<;p}j~!aebcoNunno_`dgj-k^KcevyVR==t#k=rNVm*n7viTf_5hLC*|lr$T5G0FQ|WUtcO11 zURj%g|Me&JkF;u95B(OjM?}hN`*^PFf+GRSYmz=C;!VLB#ruVo5BqTESY!OfjVE&b zd*99-eDGkr{a0}*mpA9q=R;1qemOZpE-K~v&FP^?`Li3vvMlm|ft#AhgAPQ3sS3EY)4UlaDI7t+mG7AR>kRcmoNJrKiFIDbauNh z*}kV*RmauqZnf&!ZdI?JtF5`7v+CB|s?n;pYK~F&+X*`Dsn9sYvIpbdkR~R~M3%D?^L6>~8JtZk5wah7hgZ|N!Um*t+ z2J>4ac$*03zXQp7JTCg*hg28@`A3k71So-zAr%Qg{tcue0m#3DR3rfTkC2K4ApaRs zkpSesP|4%o0PN?OiaF)KlaHqSE_ptAqZxmn+??`1$VXEymB`zRgvZ|X0w{xP-No6X!34Q=MfRo>N-df%S#^8Gq{~sV? 
nwBcCI2xQj68R-0vWcU(se`W+Si+eHEN2%{~6lBfJpUHm!Ajl1> literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script new file mode 100644 index 0000000..6120a88 --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script @@ -0,0 +1,78 @@ + +MEMORY +{ + program (RX) : ORIGIN = 0, LENGTH = 0x0020000 + data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 +} +ENTRY(__start) +SECTIONS +{ + . = 0x0; + .text : { + /* the __start symbol has to come at address zero. */ + *crt0.o(.text*) + _ctors_start = .; + _init_array_start = .; + KEEP(SORT(*.init_array)) + _ctors_end = .; + _init_array_end = .; + _dtors_start = .; + _dtors_end = .; + *(.text*) + } > program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +. = 0x40400; +buf2 = .; +. += 0x400; +. = 0x44000; +buf1 = .; +. += 0x400; +. = 0x48000; +buf0 = .; +. += 0x400; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +. = 0x60400; +buf8 = .; +. += 0x400; +. = 0x64000; +buf7 = .; +. += 0x400; +. = 0x68000; +buf6 = .; +. += 0x400; +. = 0x70400; +buf5 = .; +. += 0x400; +. = 0x74000; +buf4 = .; +. += 0x400; +. = 0x78000; +buf3 = .; +. 
+= 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_3); diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll new file mode 100644 index 0000000..ba863ab --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_3() { + br label %1 + +1: ; preds = %22, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %21, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %22 + +5: ; preds = %2 + 
%6 = getelementptr float, ptr @buf5, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = getelementptr float, ptr @buf4, i32 %3 + %9 = load <16 x float>, ptr %8, align 4 + %10 = bitcast <16 x float> %7 to <8 x i64> + %11 = bitcast <16 x float> %9 to <8 x i64> + %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> + %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> + %14 = bitcast <32 x i64> %12 to <64 x float> + %15 = bitcast <32 x i64> %13 to <64 x float> + %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) + %17 = bitcast <64 x float> %16 to <32 x i64> + %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> + %19 = bitcast <8 x i64> %18 to <16 x float> + %20 = getelementptr float, ptr @buf3, i32 %3 + store <16 x float> %19, ptr %20, align 4 + %21 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +22: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.o new file mode 100644 index 0000000000000000000000000000000000000000..d5d447390e239adbd0677385ea8f16d9e12c9d10 GIT binary patch literal 984 zcmaJ<&ubG=5T2K&NsuaOK%0X~@zkG7Vk@*4O=GJ$Y($WjfyJoQ3IK1_q{v10XI9R@fK{S{2(v$@JDX~|QW6=>bPi5cDO^%mIl7kBX!&lfP=n7d zmkYJW#ui~~_S$Z9uqHFYO=cQFa|NAN!QAcG2e^tBTB=!zl(ctna7UCXbU05xb{&m& uNmzk$T_va%^resWIS*rFP_Df<1EB21Ni0?bMv1;K?>iXNN1?388Lt3R{cdRh literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll 
b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll new file mode 100644 index 0000000..ddb3226 --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll @@ -0,0 +1,64 @@ +; ModuleID = 'air_project/sub_kernel_0_core_0_3.peanohack.ll' +source_filename = "LLVMDialectModule" +target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" +target triple = "aie2p" + +@buf3 = external local_unnamed_addr global [256 x float] +@buf4 = external local_unnamed_addr global [256 x float] +@buf5 = external local_unnamed_addr global [256 x float] + +; Function Attrs: nounwind +declare void @llvm.aie2p.acquire(i32, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.aie2p.release(i32, i32) #0 + +; Function Attrs: noreturn nounwind +define void @core_0_3() local_unnamed_addr #1 { + br label %1 + +1: ; preds = %19, %0 + tail call void @llvm.aie2p.acquire(i32 49, i32 -1) + tail call void @llvm.aie2p.acquire(i32 50, i32 -1) + tail call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %1, %2 + %3 = phi i32 [ 0, %1 ], [ %17, %2 ] + %4 = trunc nuw i32 %3 to i20 + %5 = getelementptr float, ptr @buf5, i20 %4 + %6 = load <8 x i64>, ptr %5, align 64 + %7 = getelementptr float, ptr @buf4, i20 %4 + %8 = load <8 x i64>, ptr %7, align 64 + %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> + %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> + %11 = bitcast <32 x i64> %9 to <64 x float> + %12 = bitcast <32 x i64> %10 to <64 x float> + %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60) + %14 = bitcast <64 x float> %13 to <32 x i64> + %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> + %16 = getelementptr float, ptr @buf3, i20 %4 + store <8 x i64> %15, ptr %16, align 64 + %17 = add nuw nsw i32 %3, 16 + %18 = icmp ult i32 %3, 240 + br i1 %18, label %2, label %19, !llvm.loop !1 + +19: 
; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 53, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) +declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2 + +attributes #0 = { nounwind } +attributes #1 = { noreturn nounwind } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll new file mode 100644 index 0000000..8b8d6a6 --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void 
@llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_3() { + br label %1 + +1: ; preds = %22, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %21, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %22 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf5, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = getelementptr float, ptr @buf4, i32 %3 + %9 = load <16 x float>, ptr %8 + %10 = bitcast <16 x float> %7 to <8 x i64> + %11 = bitcast <16 x float> %9 to <8 x i64> + %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> + %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> + %14 = bitcast <32 x i64> %12 to <64 x float> + %15 = bitcast <32 x i64> %13 to <64 x float> + %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) + %17 = bitcast <64 x float> %16 to <32 x i64> + %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> + %19 = bitcast <8 x i64> %18 to <16 x float> + %20 = getelementptr float, ptr @buf3, i32 %3 + store <16 x float> %19, ptr %20 + %21 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +22: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git 
a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.elf new file mode 100755 index 0000000000000000000000000000000000000000..cfc7551935cf7c8534f551f4b481c91b57b4768e GIT binary patch literal 1724 zcma)7O=uKn7=C9n-Hp_GM9|I@J?14(VD$@jU;;4z9a1m>nEweWm;lWGffP&t<_{qS z6M*@DtjM@G0Qa-aiZ8!Y(N>*JJ$3yqtS8n>H<@1mGq|Jdb;bm zA+bif6}J2!&0@Q5_`Y!|wZv}6vnfhSoDsK&Gh@Pb^l_4>sJ9Ypsv6A-&z0ytXuOZ_ z2?=pL;$6q~L*HUUg5=eWZXKKPsz%TCjCMY;~h!_~*%ARBPOV literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script new file mode 100644 index 0000000..ddda3c2 --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script @@ -0,0 +1,78 @@ + +MEMORY +{ + program (RX) : ORIGIN = 0, LENGTH = 0x0020000 + data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 +} +ENTRY(__start) +SECTIONS +{ + . = 0x0; + .text : { + /* the __start symbol has to come at address zero. */ + *crt0.o(.text*) + _ctors_start = .; + _init_array_start = .; + KEEP(SORT(*.init_array)) + _ctors_end = .; + _init_array_end = .; + _dtors_start = .; + _dtors_end = .; + *(.text*) + } > program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +. = 0x40400; +buf5 = .; +. += 0x400; +. = 0x44000; +buf4 = .; +. += 0x400; +. = 0x48000; +buf3 = .; +. += 0x400; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +. = 0x60400; +buf11 = .; +. += 0x400; +. = 0x64000; +buf10 = .; +. += 0x400; +. = 0x68000; +buf9 = .; +. += 0x400; +. = 0x70400; +buf8 = .; +. += 0x400; +. = 0x74000; +buf7 = .; +. += 0x400; +. = 0x78000; +buf6 = .; +. 
+= 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_4); diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll new file mode 100644 index 0000000..54f47e7 --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_4() { + br label %1 + +1: ; preds = %22, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %21, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %22 + +5: ; preds = %2 + 
%6 = getelementptr float, ptr @buf8, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = getelementptr float, ptr @buf7, i32 %3 + %9 = load <16 x float>, ptr %8, align 4 + %10 = bitcast <16 x float> %7 to <8 x i64> + %11 = bitcast <16 x float> %9 to <8 x i64> + %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> + %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> + %14 = bitcast <32 x i64> %12 to <64 x float> + %15 = bitcast <32 x i64> %13 to <64 x float> + %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) + %17 = bitcast <64 x float> %16 to <32 x i64> + %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> + %19 = bitcast <8 x i64> %18 to <16 x float> + %20 = getelementptr float, ptr @buf6, i32 %3 + store <16 x float> %19, ptr %20, align 4 + %21 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +22: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.o new file mode 100644 index 0000000000000000000000000000000000000000..01ddce691877b0d149c944b2a2bff263cbb20063 GIT binary patch literal 984 zcmaJnfSQF$an;Wuu|?yirm?MAOr%hhfOjOV_2Nv2u!!hQFgckkS5>#QOLv?aFPJV z=v>kP(@0|&D_4JjED8j#)D&>m`<2Bp8L>)q>5=G9>F$<#Q!NAIneM?)Rn8XA=Xo!Z z&s1(jXg-^~p86NozD|Z4r;~qI($42%coPV}_e)lvBy(YJ@9rwjn3iWs*Jno0pOE^@ z%6`a?{2MX;dnMoal2Z4LKPo5wn;ov1}qkNu|9HO^J~ z!MTo1@_Y$qN^>zVw+9GF;-c3gmvu%mhSvli4U<3N5ueBSropo49Q}_6KKU`g@(6!A z!iCOok0KJkkk$})Xy!5z4rpfO5l}u*XOkM@9&0MT4=q;U9t5y<+ktC6d;Vf!KX95m 
zuR4uG4camTF5#Iz=*52=d5PNYF^A(AiVjvRtjZDJFsgh56{10FHZgT(t literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll new file mode 100644 index 0000000..de0f954 --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll @@ -0,0 +1,64 @@ +; ModuleID = 'air_project/sub_kernel_0_core_0_4.peanohack.ll' +source_filename = "LLVMDialectModule" +target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" +target triple = "aie2p" + +@buf6 = external local_unnamed_addr global [256 x float] +@buf7 = external local_unnamed_addr global [256 x float] +@buf8 = external local_unnamed_addr global [256 x float] + +; Function Attrs: nounwind +declare void @llvm.aie2p.acquire(i32, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.aie2p.release(i32, i32) #0 + +; Function Attrs: noreturn nounwind +define void @core_0_4() local_unnamed_addr #1 { + br label %1 + +1: ; preds = %19, %0 + tail call void @llvm.aie2p.acquire(i32 49, i32 -1) + tail call void @llvm.aie2p.acquire(i32 50, i32 -1) + tail call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %1, %2 + %3 = phi i32 [ 0, %1 ], [ %17, %2 ] + %4 = trunc nuw i32 %3 to i20 + %5 = getelementptr float, ptr @buf8, i20 %4 + %6 = load <8 x i64>, ptr %5, align 64 + %7 = getelementptr float, ptr @buf7, i20 %4 + %8 = load <8 x i64>, ptr %7, align 64 + %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> + %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> + %11 = bitcast <32 x i64> %9 to <64 x float> + %12 = bitcast <32 x i64> %10 to <64 x float> + %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60) + %14 = bitcast <64 x float> %13 to <32 x i64> + %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> + %16 = getelementptr float, ptr @buf6, 
i20 %4 + store <8 x i64> %15, ptr %16, align 64 + %17 = add nuw nsw i32 %3, 16 + %18 = icmp ult i32 %3, 240 + br i1 %18, label %2, label %19, !llvm.loop !1 + +19: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 53, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) +declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2 + +attributes #0 = { nounwind } +attributes #1 = { noreturn nounwind } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll new file mode 100644 index 0000000..56c3882 --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic 
+declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_4() { + br label %1 + +1: ; preds = %22, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %21, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %22 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf8, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = getelementptr float, ptr @buf7, i32 %3 + %9 = load <16 x float>, ptr %8 + %10 = bitcast <16 x float> %7 to <8 x i64> + %11 = bitcast <16 x float> %9 to <8 x i64> + %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> + %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> + %14 = bitcast <32 x i64> %12 to <64 x float> + %15 = bitcast <32 x i64> %13 to <64 x float> + %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) + %17 = bitcast <64 x float> %16 to <32 x i64> + %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> + %19 = bitcast <8 x i64> %18 to <16 x float> + %20 = getelementptr float, ptr @buf6, i32 %3 + store <16 x float> %19, ptr %20 + %21 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +22: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) + 
+!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.elf new file mode 100755 index 0000000000000000000000000000000000000000..4588246a4ddfdd6db12c4f239462db7892609dba GIT binary patch literal 1660 zcma)7O=uHQ5S~qAQpKNGQ35JF6r@T`cTH>)FRmquXbSbB1(jv9yKR>yo3Oj7tsn*w z^q}5^C5WPDqkO zF%mS`Wd#_`bwmS@1rn5?VMN>`}$?7pG106qQ^vK`mjFT z_3qn6^y$=qpzkB}ky9Q*_Vbb;`mpvaL(6X^f-WUiG9-UQ-D~NUX=(1HiVg->79UA! zM#0Gt*};h&p*E69Zc6>`Oy1ernR>JJ^RcvkQ|W9c>f8-3grJj0lS7@g=I*k}SaZ1ExPb)7I0AX}v9mWWpgGYWn$WL{~)o!}as2QRlIn%|!eEZ_Lqnf;X> zNhH;Q=zcg8<z3L5xviDx5voy>D23|=-Ki7yVwlipO@5t!rtnJUWsyU-kADy;qwzD`|t1Z+I>b51SdBc=TW1jGF~BuT=BZf}xZwvs9QV?GJnC-_cGNN&2z24Q#4wk!UCMhkbl6^i21V z#B=C^&-1fC)#sl87g#cELwI0LL#Jf;^JFMRXPB!!evWy&$1gAk1QwqUUyz>v3hZ2u z$Ho4ekOGUr{5GUu0%XANLkcDU^Cysk3Bde0q+kLte+?;^0Ly`t>TdMnd6>`q=-B9aF-I&)r`?BTXd8wS8EZcg`GW^Mg z*{WHTt4vJDni9!FkXq%qW6ET>zLGNx7-m|Ip;fI}+fm49xRxeshpF0{QIuepc`uQs zdA{!YqbmV{wqyIm8fk{#aJ~2x+cn*F^`+Pnx-G|~;85fY zxY>U)By2|?Mc&4F8w&=i@(=fakPyZz{!X@`ud*RR^y=Xo9LC{Q3!o;5E=4+vI1k6e z)x`^tHk|yO<{Ugtj=?)0eB&Vk6yX?;31r;D9_ajE5Tk*(Z!v+4;|@f6JMvvbL9QA5 GWBCW<3 program + .data : { + *(.data*) + *(.rodata*) + } > data + .comment : { + *(.comment*) + } + .symtab : { + *(.symtab) + } + .shstrtab : { + *(.shstrtab) + } + .strtab : { + *(.strtab) + } + .stack_sizes : { + *(.stack_sizes) + } + +. = 0x70000; +_sp_start_value_DM_stack = .; +. += 0x400; /* stack */ +. = 0x40400; +buf8 = .; +. += 0x400; +. = 0x44000; +buf7 = .; +. += 0x400; +. = 0x48000; +buf6 = .; +. += 0x400; +/* No tile with memory exists to the west. */ +. = 0x50000; +. += 0x10000; +/* No tile with memory exists to the north. */ +. = 0x60000; +. += 0x10000; +. = 0x70400; +buf11 = .; +. += 0x400; +. = 0x74000; +buf10 = .; +. += 0x400; +. = 0x78000; +buf9 = .; +. 
+= 0x400; + .bss : { *(.bss*) } > data +} +PROVIDE(main = core_0_5); diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll new file mode 100644 index 0000000..8972a4d --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_5() { + br label %1 + +1: ; preds = %22, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %21, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %22 + +5: ; preds = %2 + 
%6 = getelementptr float, ptr @buf11, i32 %3 + %7 = load <16 x float>, ptr %6, align 4 + %8 = getelementptr float, ptr @buf10, i32 %3 + %9 = load <16 x float>, ptr %8, align 4 + %10 = bitcast <16 x float> %7 to <8 x i64> + %11 = bitcast <16 x float> %9 to <8 x i64> + %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> + %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> + %14 = bitcast <32 x i64> %12 to <64 x float> + %15 = bitcast <32 x i64> %13 to <64 x float> + %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) + %17 = bitcast <64 x float> %16 to <32 x i64> + %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> + %19 = bitcast <8 x i64> %18 to <16 x float> + %20 = getelementptr float, ptr @buf9, i32 %3 + store <16 x float> %19, ptr %20, align 4 + %21 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +22: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.o new file mode 100644 index 0000000000000000000000000000000000000000..ca78f75bdee0299a5c24ffa00da8f22d235605ba GIT binary patch literal 984 zcmZ`%O-~b16g{t$wlPRb6SWI9#1$Y+I({Uu(NaN5GBw5oVsOz+%NUH6;xI&XqcJR8 z5aPyUmTcIwa?usPfbbI_;RnDUu$=qmwH=yxlRNjEcTeuU@6P)^@4Qq>p<@b}7z2(H zzyOm^v|tQr^t5tx24qnnc&A2z)Arvi4#|jBqQ#7;f2Lb+)mpg-3}#x}2dbF0FD7{| zmd})KM`$veyqWqJR`y21*T*C0vuXF29o_=MKb@B4=gC~y-nci1Q`7Zz}pUBtv3VDT6ncIwHdhe zk8qsj%G?LH;njmmb9JlX!P*S`z^yTU{Uour^?nAH?=@Uq!KycXuUe=+h2@k=h3X>{ 
zZ4ufwS=E_eUoB|bTa+2%|6!ada|JCbbM7xOIj^EkqPSTJiu$zoksXP>p!e(aW7pYe wmja*BWnD$OHufzONPU?{azk`kduIkhY{W?{XA`qT*F^W5Oyq-fsV6gD0S*Oj=>Px# literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll new file mode 100644 index 0000000..d08aa8f --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll @@ -0,0 +1,64 @@ +; ModuleID = 'air_project/sub_kernel_0_core_0_5.peanohack.ll' +source_filename = "LLVMDialectModule" +target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" +target triple = "aie2p" + +@buf9 = external local_unnamed_addr global [256 x float] +@buf10 = external local_unnamed_addr global [256 x float] +@buf11 = external local_unnamed_addr global [256 x float] + +; Function Attrs: nounwind +declare void @llvm.aie2p.acquire(i32, i32) #0 + +; Function Attrs: nounwind +declare void @llvm.aie2p.release(i32, i32) #0 + +; Function Attrs: noreturn nounwind +define void @core_0_5() local_unnamed_addr #1 { + br label %1 + +1: ; preds = %19, %0 + tail call void @llvm.aie2p.acquire(i32 49, i32 -1) + tail call void @llvm.aie2p.acquire(i32 50, i32 -1) + tail call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %1, %2 + %3 = phi i32 [ 0, %1 ], [ %17, %2 ] + %4 = trunc nuw i32 %3 to i20 + %5 = getelementptr float, ptr @buf11, i20 %4 + %6 = load <8 x i64>, ptr %5, align 64 + %7 = getelementptr float, ptr @buf10, i20 %4 + %8 = load <8 x i64>, ptr %7, align 64 + %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> + %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> + %11 = bitcast <32 x i64> %9 to <64 x float> + %12 = bitcast <32 x i64> %10 to <64 x float> + %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60) + %14 = bitcast <64 x float> %13 to <32 x i64> + %15 = shufflevector <32 x 
i64> %14, <32 x i64> poison, <8 x i32> + %16 = getelementptr float, ptr @buf9, i20 %4 + store <8 x i64> %15, ptr %16, align 64 + %17 = add nuw nsw i32 %3, 16 + %18 = icmp ult i32 %3, 240 + br i1 %18, label %2, label %19, !llvm.loop !1 + +19: ; preds = %2 + tail call void @llvm.aie2p.release(i32 51, i32 1) + tail call void @llvm.aie2p.release(i32 53, i32 1) + tail call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) +declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2 + +attributes #0 = { nounwind } +attributes #1 = { noreturn nounwind } +attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll new file mode 100644 index 0000000..69f695d --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll @@ -0,0 +1,95 @@ +; ModuleID = 'LLVMDialectModule' +source_filename = "LLVMDialectModule" +target triple = "aie2p" + +@buf0 = external global [256 x float] +@buf1 = external global [256 x float] +@buf2 = external global [256 x float] +@buf3 = external global [256 x float] +@buf4 = external global [256 x float] +@buf5 = external global [256 x float] +@buf6 = external global [256 x float] +@buf7 = external global [256 x float] +@buf8 = external global [256 x float] +@buf9 = external global [256 x float] +@buf10 = external global [256 x float] +@buf11 = external global [256 x float] +@buf12 = external global [1024 x float] +@buf13 = external global [1024 x float] +@buf14 = external global [1024 x float] + +declare void @debug_i32(i32) + +; 
Unknown intrinsic +declare void @llvm.aie2p.event(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.put.ms(i32, i32) + +; Unknown intrinsic +declare { i32, i32 } @llvm.aie2p.get.ss() + +; Unknown intrinsic +declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) + +; Unknown intrinsic +declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) + +; Unknown intrinsic +declare void @llvm.aie2p.acquire(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.release(i32, i32) + +; Unknown intrinsic +declare void @llvm.aie2p.set.ctrl.reg(i32, i32) + +define void @core_0_5() { + br label %1 + +1: ; preds = %22, %0 + call void @llvm.aie2p.acquire(i32 49, i32 -1) + call void @llvm.aie2p.acquire(i32 50, i32 -1) + call void @llvm.aie2p.acquire(i32 52, i32 -1) + br label %2 + +2: ; preds = %5, %1 + %3 = phi i32 [ %21, %5 ], [ 0, %1 ] + %4 = icmp slt i32 %3, 256 + br i1 %4, label %5, label %22 + +5: ; preds = %2 + %6 = getelementptr float, ptr @buf11, i32 %3 + %7 = load <16 x float>, ptr %6 + %8 = getelementptr float, ptr @buf10, i32 %3 + %9 = load <16 x float>, ptr %8 + %10 = bitcast <16 x float> %7 to <8 x i64> + %11 = bitcast <16 x float> %9 to <8 x i64> + %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> + %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> + %14 = bitcast <32 x i64> %12 to <64 x float> + %15 = bitcast <32 x i64> %13 to <64 x float> + %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) + %17 = bitcast <64 x float> %16 to <32 x i64> + %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> + %19 = bitcast <8 x i64> %18 to <16 x float> + %20 = getelementptr float, ptr @buf9, i32 %3 + store <16 x float> %19, ptr %20 + %21 = add i32 %3, 16 + br label %2, !llvm.loop !1 + +22: ; preds = %2 + call void @llvm.aie2p.release(i32 51, i32 1) + call void @llvm.aie2p.release(i32 53, i32 1) + call void @llvm.aie2p.release(i32 48, i32 1) + br label %1 +} + +; Unknown intrinsic +declare <64 x 
float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) + +!llvm.module.flags = !{!0} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_design.bif b/examples/elementwise_arith/air_project/sub_kernel_0_design.bif new file mode 100644 index 0000000..bbeec41 --- /dev/null +++ b/examples/elementwise_arith/air_project/sub_kernel_0_design.bif @@ -0,0 +1,10 @@ +all: +{ + id_code = 0x14ca8093 + extended_id_code = 0x01 + image + { + name=aie_image, id=0x1c000000 + { type=cdo file=air_project/sub_kernel_0_aie_cdo_elfs.bin file=air_project/sub_kernel_0_aie_cdo_init.bin file=air_project/sub_kernel_0_aie_cdo_enable.bin } + } +} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_sub_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/sub_kernel_0_sub_kernel_0_sequence.bin new file mode 100644 index 0000000000000000000000000000000000000000..f2eb383b7eb903e8ad8809b7d44fb0a4d660bfb6 GIT binary patch literal 3248 zcmcJQ-Aw~A5QNuneg4w5;eLtEZyo^(@j}krmd|WcZ&8OQJf_lyu+9e|Yukmso`>#0IXQ!}T>04{- zdWRz5+iJIiZ@khs7OO7qzIV<49O}M3Vd35Et*zr4+n!k}H6?s5ZHHP2b8iNd9f!uq z2%ELeWFG=0l9+H#Mw;SV&vYiNQGA=m?@TyH^<^fmv5ZNwd1Or5@6ws1SwdvOIT;Fy zFEe2cjS1(dzRbimmN6++jgb*5y%`xL$~=|+C!AAeiZ3%^4UGxssJ_g^HI^|MPc*|& zYb`xk4J-t?`*yLPCTa8!jZ|yI(P^-T#)NY+)Rg;`nXrb&gmYA1X5t#lnB*HUVGWH5 W=cv75Caj?`;T+YMnYhL+nEU|x-i@&U literal 0 HcmV?d00001 diff --git a/examples/elementwise_arith/air_project/tt.mlir b/examples/elementwise_arith/air_project/tt.mlir new file mode 100644 index 0000000..cfdc62d --- /dev/null +++ b/examples/elementwise_arith/air_project/tt.mlir @@ -0,0 +1,35 @@ +#loc = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1) +#loc10 = loc("X"(#loc)) +#loc11 = loc("OUT"(#loc)) +module { + tt.func public @square_kernel(%X: !tt.ptr {tt.divisibility = 16 : i32} loc("X"(#loc)), %OUT: !tt.ptr {tt.divisibility = 16 
: i32} loc("OUT"(#loc))) attributes {noinline = false} { + %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) + %pid = tt.get_program_id x : i32 loc(#loc12) + %offsets = arith.muli %pid, %c1024_i32 : i32 loc(#loc13) + %offsets_0 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc14) + %offsets_1 = tt.splat %offsets : i32 -> tensor<1024xi32> loc(#loc13) + %offsets_2 = arith.addi %offsets_1, %offsets_0 : tensor<1024xi32> loc(#loc13) + %x = tt.splat %X : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc15) + %x_3 = tt.addptr %x, %offsets_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc15) + %x_4 = tt.load %x_3 : tensor<1024x!tt.ptr> loc(#loc16) + %0 = tt.splat %OUT : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc7) + %1 = tt.addptr %0, %offsets_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc7) + %2 = arith.muli %x_4, %x_4 : tensor<1024xi16> loc(#loc8) + tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc9) + tt.return loc(#loc) + } loc(#loc) +} loc(#loc) +#loc1 = loc(unknown) +#loc2 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":87:11) +#loc3 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:15) +#loc4 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:34) +#loc5 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:17) +#loc6 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:9) +#loc7 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:14) +#loc8 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:32) +#loc9 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:5) +#loc12 = loc("pid"(#loc2)) +#loc13 = loc("offsets"(#loc3)) +#loc14 = loc("offsets"(#loc4)) +#loc15 = loc("x"(#loc5)) +#loc16 = loc("x"(#loc6)) diff --git 
a/examples/elementwise_arith/elementwise_arith.py b/examples/elementwise_arith/elementwise_arith.py new file mode 100644 index 0000000..04d4844 --- /dev/null +++ b/examples/elementwise_arith/elementwise_arith.py @@ -0,0 +1,189 @@ +# Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT + +# Elementwise arithmetic benchmark: sub, mul, div, square. +# Supports bf16 (default) and f32 (via bf16-emulation). +# Not all ops support all dtypes: +# sub: bf16, f32 +# mul: bf16, f32 +# div: f32 only (hardware constraint: arith.divf is f32-only on AIE2P) +# square: bf16, f32 (implemented as x * x) + +import argparse +import torch +import triton +import triton.language as tl +import sys +import os + +sys.path.append(os.path.abspath("..")) +import benchmark + +DTYPE_CONFIG = { + "bf16": { + "torch_dtype": torch.bfloat16, + "atol": 1e-2, + "rtol": 1e-2, + "bf16_emulation": False, + }, + "f32": { + "torch_dtype": torch.float32, + "atol": 1e-1, + "rtol": 5e-2, + "bf16_emulation": True, + }, +} + +# Which dtypes each op supports. +# Integer types (i16) fail at aircc for subi/muli on AIE2P (only addi works). 
+OP_DTYPES = { + "sub": ["bf16", "f32"], + "mul": ["bf16", "f32"], + "div": ["f32"], # arith.divf is f32-only on AIE2P; bf16 divf not supported + "square": ["bf16", "f32"], +} + + +# --- Triton kernels --- + + +@triton.jit +def sub_kernel(X, Y, OUT, n_elements: tl.constexpr, BLOCK_SIZE: tl.constexpr): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x = tl.load(X + offsets[:]) + y = tl.load(Y + offsets[:]) + tl.store(OUT + offsets[:], x - y) + + +@triton.jit +def mul_kernel(X, Y, OUT, n_elements: tl.constexpr, BLOCK_SIZE: tl.constexpr): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x = tl.load(X + offsets[:]) + y = tl.load(Y + offsets[:]) + tl.store(OUT + offsets[:], x * y) + + +@triton.jit +def div_kernel(X, Y, OUT, n_elements: tl.constexpr, BLOCK_SIZE: tl.constexpr): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x = tl.load(X + offsets[:]) + y = tl.load(Y + offsets[:]) + tl.store(OUT + offsets[:], x / y) + + +@triton.jit +def square_kernel(X, OUT, n_elements: tl.constexpr, BLOCK_SIZE: tl.constexpr): + pid = tl.program_id(0) + offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE) + x = tl.load(X + offsets[:]) + tl.store(OUT + offsets[:], x * x) + + +# --- Kernel dispatch table --- + +KERNELS = { + "sub": sub_kernel, + "mul": mul_kernel, + "div": div_kernel, + "square": square_kernel, +} + +# --- Torch reference functions --- + +TORCH_REF = { + "sub": lambda x, y: x - y, + "mul": lambda x, y: x * y, + "div": lambda x, y: x / y, + "square": lambda x, y: x * x, +} + + +def bench_op(op, N, provider, cfg): + device = "cpu" + torch_dtype = cfg["torch_dtype"] + is_unary = op == "square" + + x = torch.randn(N, device=device, dtype=torch_dtype) + if not is_unary: + if op == "div": + # Avoid division by zero; use values in [0.5, 1.5] + y = 0.5 + torch.rand(N, device=device, dtype=torch_dtype) + else: + y = torch.randn(N, device=device, dtype=torch_dtype) + + out = 
torch.empty(N, device=device, dtype=torch_dtype) + + if provider == "torch" or provider == "test": + out_ref = TORCH_REF[op](x, y if not is_unary else None) + + if provider == "triton" or provider == "test": + grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),) + kernel = KERNELS[op] + if is_unary: + compiled_kernel = kernel[grid](x, out, N, BLOCK_SIZE=1024) + else: + compiled_kernel = kernel[grid](x, y, out, N, BLOCK_SIZE=1024) + with open("tt.shared.mlir", "w") as f: + f.write(str(compiled_kernel.asm["ttsharedir"])) + if provider == "test": + torch.testing.assert_close(out, out_ref, atol=cfg["atol"], rtol=cfg["rtol"]) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Elementwise arithmetic benchmark for AMD NPU" + ) + parser.add_argument( + "--op", + type=str, + choices=list(KERNELS.keys()), + required=True, + help="Operation to benchmark", + ) + parser.add_argument( + "--dtype", + type=str, + choices=list(DTYPE_CONFIG.keys()), + default="bf16", + help="Element data type (default: bf16)", + ) + parser.add_argument( + "--bf16-emulation", + dest="bf16_emulation", + default=False, + action="store_true", + help="Use f32 data type with bf16 emulation on AIE cores", + ) + args = parser.parse_args() + + if args.bf16_emulation: + args.dtype = "f32" + + # Validate op + dtype combination + if args.dtype not in OP_DTYPES[args.op]: + supported = ", ".join(OP_DTYPES[args.op]) + print(f"Error: --op {args.op} does not support --dtype {args.dtype}.") + print(f"Supported dtypes for {args.op}: {supported}") + sys.exit(1) + + cfg = DTYPE_CONFIG[args.dtype] + + if cfg["bf16_emulation"]: + os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1" + + # Select the right transform script based on op arity. + # If AIR_TRANSFORM_TILING_SCRIPT is already set, respect it. 
+ if not os.environ.get("AIR_TRANSFORM_TILING_SCRIPT"): + is_unary = args.op == "square" + script_dir = os.path.dirname(os.path.abspath(__file__)) + arity = "unary" if is_unary else "binary" + os.environ["AIR_TRANSFORM_TILING_SCRIPT"] = os.path.join( + script_dir, f"transform_{arity}_aie2p.mlir" + ) + + benchmark.select_npu_backend() + for N in [2**i for i in range(10, 16, 1)]: + bench_op(args.op, N, "test", cfg) diff --git a/examples/elementwise_arith/transform_binary_aie2p.mlir b/examples/elementwise_arith/transform_binary_aie2p.mlir new file mode 100644 index 0000000..2d76c54 --- /dev/null +++ b/examples/elementwise_arith/transform_binary_aie2p.mlir @@ -0,0 +1,40 @@ +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT + +//////////////////////////////////////////////////////////////////////////////// +// Transform Script for Binary Elementwise Ops (AIE2P): sub, mul, div +// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders. +// Uses shared library sequences from transform_library.mlir (auto-injected). 
+//////////////////////////////////////////////////////////////////////////////// + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg1: !transform.any_op {transform.readonly}) { + + transform.include @canonicalize_with_fold_dims failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @fuse_elementwise_and_canonicalize failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @flatten_tile_forall failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @canonicalize_with_cse failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @canonicalize_with_cse failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @one_shot_bufferize failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @post_bufferize_cleanup failures(propagate) + (%arg1) : (!transform.any_op) -> () + + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) + (%arg1) : (!transform.any_op) -> () + %vh = transform.include @air_herd_mapping_and_vectorize + failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op + transform.include @cast_bf16_only_ops failures(propagate) + (%vh) : (!transform.any_op) -> () + + transform.yield + } +} diff --git a/examples/elementwise_arith/transform_unary_aie2p.mlir b/examples/elementwise_arith/transform_unary_aie2p.mlir new file mode 100644 index 0000000..14bfd4c --- /dev/null +++ b/examples/elementwise_arith/transform_unary_aie2p.mlir @@ -0,0 +1,40 @@ +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. 
+// SPDX-License-Identifier: MIT + +//////////////////////////////////////////////////////////////////////////////// +// Transform Script for Unary Elementwise Ops (AIE2P): square +// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders. +// Uses shared library sequences from transform_library.mlir (auto-injected). +//////////////////////////////////////////////////////////////////////////////// + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg1: !transform.any_op {transform.readonly}) { + + transform.include @canonicalize_with_fold_dims failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @fuse_elementwise_and_canonicalize failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @flatten_tile_forall failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @canonicalize_with_cse failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @canonicalize_with_cse failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @one_shot_bufferize failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @post_bufferize_cleanup failures(propagate) + (%arg1) : (!transform.any_op) -> () + + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) + (%arg1) : (!transform.any_op) -> () + %vh = transform.include @air_herd_mapping_and_vectorize + failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op + transform.include @cast_bf16_only_ops failures(propagate) + (%vh) : (!transform.any_op) -> () + + transform.yield + } +} diff --git a/examples/elementwise_arith/tt.shared.mlir b/examples/elementwise_arith/tt.shared.mlir new file mode 100644 index 0000000..dc6929b --- /dev/null +++ b/examples/elementwise_arith/tt.shared.mlir @@ -0,0 +1 @@ +b'#loc = 
loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)\n#loc5 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:9)\n#map = affine_map<(d0) -> (d0)>\n#loc8 = loc("X"(#loc))\n#loc9 = loc("OUT"(#loc))\n#loc12 = loc("x"(#loc5))\nmodule {\n func.func @square_kernel(%arg0: memref<*xf32> {tt.divisibility = 16 : i32} loc("X"(#loc)), %arg1: memref<*xf32> {tt.divisibility = 16 : i32} loc("OUT"(#loc)), %arg2: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg3: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg4: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg5: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg6: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg7: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)) {\n %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)\n %0 = arith.muli %arg5, %c1024_i32 : i32 loc(#loc10)\n %1 = arith.index_cast %0 : i32 to index loc(#loc3)\n %reinterpret_cast = memref.reinterpret_cast %arg0 to offset: [%1], sizes: [1024], strides: [1] : memref<*xf32> to memref<1024xf32, strided<[1], offset: ?>> loc(#loc11)\n %alloc = memref.alloc() : memref<1024xf32> loc(#loc12)\n memref.copy %reinterpret_cast, %alloc : memref<1024xf32, strided<[1], offset: ?>> to memref<1024xf32> loc(#loc12)\n %2 = bufferization.to_tensor %alloc restrict writable : memref<1024xf32> to tensor<1024xf32> loc(#loc12)\n %reinterpret_cast_0 = memref.reinterpret_cast %arg1 to offset: [%1], sizes: [1024], strides: [1] : memref<*xf32> to memref<1024xf32, strided<[1], offset: ?>> loc(#loc3)\n %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%2, %2 : tensor<1024xf32>, 
tensor<1024xf32>) outs(%2 : tensor<1024xf32>) {\n ^bb0(%in: f32 loc("x"(#loc5)), %in_1: f32 loc("x"(#loc5)), %out: f32 loc("x"(#loc5))):\n %4 = arith.mulf %in, %in_1 : f32 loc(#loc6)\n linalg.yield %4 : f32 loc(#loc6)\n } -> tensor<1024xf32> loc(#loc6)\n bufferization.materialize_in_destination %3 in writable %reinterpret_cast_0 : (tensor<1024xf32>, memref<1024xf32, strided<[1], offset: ?>>) -> () loc(#loc7)\n return loc(#loc)\n } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:15)\n#loc3 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:14)\n#loc4 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:17)\n#loc6 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:32)\n#loc7 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:5)\n#loc10 = loc("offsets"(#loc2))\n#loc11 = loc("x"(#loc4))\n\n' \ No newline at end of file diff --git a/examples/generate_readme.py b/examples/generate_readme.py index 75dd998..bc75808 100644 --- a/examples/generate_readme.py +++ b/examples/generate_readme.py @@ -146,6 +146,12 @@ "path": "multi_drivers", "datatypes": "bf16", }, + { + "category": "Element-wise", + "name": "Elementwise Arith (sub, mul, div, square)", + "path": "elementwise_arith", + "datatypes": "bf16, f32", + }, ] # Directories to ignore when verifying registry completeness From d38be1d2e60f2a7ac6c7d4606634967274ac3d6a Mon Sep 17 00:00:00 2001 From: erweiw Date: Thu, 9 Apr 2026 22:01:07 -0700 Subject: [PATCH 8/9] Remove build artifacts from elementwise_arith Co-Authored-By: Claude Opus 4.6 (1M context) --- .../air_project/aie.asm_air_output.mlir | 386 ----------- .../elementwise_arith/air_project/aie.elf | Bin 29488 -> 0 bytes .../aiecc_failure_1775797115_856352.mlir | 411 ------------ .../aiecc_failure_1775797139_858651.mlir | 601 
------------------ .../aiecc_failure_1775797174_862028.mlir | 431 ------------- .../aiecc_repeater_1775797115_856352.sh | 12 - .../aiecc_repeater_1775797139_858651.sh | 14 - .../aiecc_repeater_1775797174_862028.sh | 14 - .../air_project/airinput.mlir | 41 -- .../air_project/asm_air_output.mlir | 41 -- .../air_project/asm_src.mlir | 34 - .../air_project/div_kernel_0.pdi | Bin 15904 -> 0 bytes .../air_project/div_kernel_0_aie_cdo_elfs.bin | Bin 10704 -> 0 bytes .../div_kernel_0_aie_cdo_enable.bin | Bin 104 -> 0 bytes .../air_project/div_kernel_0_aie_cdo_init.bin | Bin 6032 -> 0 bytes .../air_project/div_kernel_0_core_0_2.elf | Bin 4132 -> 0 bytes .../div_kernel_0_core_0_2.ld.script | 72 --- .../air_project/div_kernel_0_core_0_2.ll | 158 ----- .../air_project/div_kernel_0_core_0_2.o | Bin 2048 -> 0 bytes .../air_project/div_kernel_0_core_0_2.opt.ll | 129 ---- .../div_kernel_0_core_0_2.peanohack.ll | 158 ----- .../air_project/div_kernel_0_core_0_3.elf | Bin 4192 -> 0 bytes .../div_kernel_0_core_0_3.ld.script | 78 --- .../air_project/div_kernel_0_core_0_3.ll | 158 ----- .../air_project/div_kernel_0_core_0_3.o | Bin 2048 -> 0 bytes .../air_project/div_kernel_0_core_0_3.opt.ll | 129 ---- .../div_kernel_0_core_0_3.peanohack.ll | 158 ----- .../air_project/div_kernel_0_core_0_4.elf | Bin 4196 -> 0 bytes .../div_kernel_0_core_0_4.ld.script | 78 --- .../air_project/div_kernel_0_core_0_4.ll | 158 ----- .../air_project/div_kernel_0_core_0_4.o | Bin 2048 -> 0 bytes .../air_project/div_kernel_0_core_0_4.opt.ll | 129 ---- .../div_kernel_0_core_0_4.peanohack.ll | 158 ----- .../air_project/div_kernel_0_core_0_5.elf | Bin 4132 -> 0 bytes .../div_kernel_0_core_0_5.ld.script | 72 --- .../air_project/div_kernel_0_core_0_5.ll | 158 ----- .../air_project/div_kernel_0_core_0_5.o | Bin 2052 -> 0 bytes .../air_project/div_kernel_0_core_0_5.opt.ll | 129 ---- .../div_kernel_0_core_0_5.peanohack.ll | 158 ----- .../air_project/div_kernel_0_design.bif | 10 - 
.../div_kernel_0_div_kernel_0_sequence.bin | Bin 3248 -> 0 bytes .../elementwise_arith/air_project/empty_0.pdi | Bin 368 -> 0 bytes .../air_project/empty_0_aie_cdo_elfs.bin | Bin 24 -> 0 bytes .../air_project/empty_0_aie_cdo_enable.bin | Bin 24 -> 0 bytes .../air_project/empty_0_aie_cdo_init.bin | Bin 24 -> 0 bytes .../air_project/empty_0_design.bif | 10 - .../air_project/full_elf_config.json | 134 ---- .../air_project/input_with_addresses.mlir | 328 ---------- .../elementwise_arith/air_project/main.pdi | Bin 368 -> 0 bytes .../air_project/main_aie_cdo_elfs.bin | Bin 24 -> 0 bytes .../air_project/main_aie_cdo_enable.bin | Bin 24 -> 0 bytes .../air_project/main_aie_cdo_init.bin | Bin 24 -> 0 bytes .../air_project/main_design.bif | 10 - .../air_project/main_div_kernel.bin | Bin 22460 -> 0 bytes .../air_project/main_mul_kernel.bin | Bin 14460 -> 0 bytes .../air_project/main_square_kernel.bin | Bin 11048 -> 0 bytes .../air_project/main_sub_kernel.bin | Bin 14396 -> 0 bytes .../air_project/mul_kernel_0.pdi | Bin 7856 -> 0 bytes .../air_project/mul_kernel_0_aie_cdo_elfs.bin | Bin 2656 -> 0 bytes .../mul_kernel_0_aie_cdo_enable.bin | Bin 104 -> 0 bytes .../air_project/mul_kernel_0_aie_cdo_init.bin | Bin 6032 -> 0 bytes .../air_project/mul_kernel_0_core_0_2.elf | Bin 1672 -> 0 bytes .../mul_kernel_0_core_0_2.ld.script | 72 --- .../air_project/mul_kernel_0_core_0_2.ll | 95 --- .../air_project/mul_kernel_0_core_0_2.o | Bin 1000 -> 0 bytes .../air_project/mul_kernel_0_core_0_2.opt.ll | 72 --- .../mul_kernel_0_core_0_2.peanohack.ll | 95 --- .../air_project/mul_kernel_0_core_0_3.elf | Bin 1736 -> 0 bytes .../mul_kernel_0_core_0_3.ld.script | 78 --- .../air_project/mul_kernel_0_core_0_3.ll | 95 --- .../air_project/mul_kernel_0_core_0_3.o | Bin 1000 -> 0 bytes .../air_project/mul_kernel_0_core_0_3.opt.ll | 72 --- .../mul_kernel_0_core_0_3.peanohack.ll | 95 --- .../air_project/mul_kernel_0_core_0_4.elf | Bin 1740 -> 0 bytes .../mul_kernel_0_core_0_4.ld.script | 78 --- 
.../air_project/mul_kernel_0_core_0_4.ll | 95 --- .../air_project/mul_kernel_0_core_0_4.o | Bin 1000 -> 0 bytes .../air_project/mul_kernel_0_core_0_4.opt.ll | 72 --- .../mul_kernel_0_core_0_4.peanohack.ll | 95 --- .../air_project/mul_kernel_0_core_0_5.elf | Bin 1676 -> 0 bytes .../mul_kernel_0_core_0_5.ld.script | 72 --- .../air_project/mul_kernel_0_core_0_5.ll | 95 --- .../air_project/mul_kernel_0_core_0_5.o | Bin 1000 -> 0 bytes .../air_project/mul_kernel_0_core_0_5.opt.ll | 72 --- .../mul_kernel_0_core_0_5.peanohack.ll | 95 --- .../air_project/mul_kernel_0_design.bif | 10 - .../mul_kernel_0_mul_kernel_0_sequence.bin | Bin 3248 -> 0 bytes .../air_project/npu.asm_air_output.mlir | 300 --------- .../air_project/placed.asm_air_output.mlir | 86 --- .../air_project/square_kernel_0.pdi | Bin 6272 -> 0 bytes .../square_kernel_0_aie_cdo_elfs.bin | Bin 2528 -> 0 bytes .../square_kernel_0_aie_cdo_enable.bin | Bin 104 -> 0 bytes .../square_kernel_0_aie_cdo_init.bin | Bin 4300 -> 0 bytes .../air_project/square_kernel_0_core_0_2.elf | Bin 1600 -> 0 bytes .../square_kernel_0_core_0_2.ld.script | 66 -- .../air_project/square_kernel_0_core_0_2.ll | 84 --- .../air_project/square_kernel_0_core_0_2.o | Bin 932 -> 0 bytes .../square_kernel_0_core_0_2.opt.ll | 65 -- .../square_kernel_0_core_0_2.peanohack.ll | 84 --- .../air_project/square_kernel_0_core_0_3.elf | Bin 1640 -> 0 bytes .../square_kernel_0_core_0_3.ld.script | 69 -- .../air_project/square_kernel_0_core_0_3.ll | 84 --- .../air_project/square_kernel_0_core_0_3.o | Bin 932 -> 0 bytes .../square_kernel_0_core_0_3.opt.ll | 65 -- .../square_kernel_0_core_0_3.peanohack.ll | 84 --- .../air_project/square_kernel_0_core_0_4.elf | Bin 1640 -> 0 bytes .../square_kernel_0_core_0_4.ld.script | 69 -- .../air_project/square_kernel_0_core_0_4.ll | 84 --- .../air_project/square_kernel_0_core_0_4.o | Bin 932 -> 0 bytes .../square_kernel_0_core_0_4.opt.ll | 65 -- .../square_kernel_0_core_0_4.peanohack.ll | 84 --- 
.../air_project/square_kernel_0_core_0_5.elf | Bin 1600 -> 0 bytes .../square_kernel_0_core_0_5.ld.script | 66 -- .../air_project/square_kernel_0_core_0_5.ll | 84 --- .../air_project/square_kernel_0_core_0_5.o | Bin 932 -> 0 bytes .../square_kernel_0_core_0_5.opt.ll | 65 -- .../square_kernel_0_core_0_5.peanohack.ll | 84 --- .../air_project/square_kernel_0_design.bif | 10 - ...uare_kernel_0_square_kernel_0_sequence.bin | Bin 2288 -> 0 bytes .../air_project/sub_kernel_0.pdi | Bin 7792 -> 0 bytes .../air_project/sub_kernel_0_aie_cdo_elfs.bin | Bin 2592 -> 0 bytes .../sub_kernel_0_aie_cdo_enable.bin | Bin 104 -> 0 bytes .../air_project/sub_kernel_0_aie_cdo_init.bin | Bin 6032 -> 0 bytes .../air_project/sub_kernel_0_core_0_2.elf | Bin 1656 -> 0 bytes .../sub_kernel_0_core_0_2.ld.script | 72 --- .../air_project/sub_kernel_0_core_0_2.ll | 95 --- .../air_project/sub_kernel_0_core_0_2.o | Bin 984 -> 0 bytes .../air_project/sub_kernel_0_core_0_2.opt.ll | 64 -- .../sub_kernel_0_core_0_2.peanohack.ll | 95 --- .../air_project/sub_kernel_0_core_0_3.elf | Bin 1720 -> 0 bytes .../sub_kernel_0_core_0_3.ld.script | 78 --- .../air_project/sub_kernel_0_core_0_3.ll | 95 --- .../air_project/sub_kernel_0_core_0_3.o | Bin 984 -> 0 bytes .../air_project/sub_kernel_0_core_0_3.opt.ll | 64 -- .../sub_kernel_0_core_0_3.peanohack.ll | 95 --- .../air_project/sub_kernel_0_core_0_4.elf | Bin 1724 -> 0 bytes .../sub_kernel_0_core_0_4.ld.script | 78 --- .../air_project/sub_kernel_0_core_0_4.ll | 95 --- .../air_project/sub_kernel_0_core_0_4.o | Bin 984 -> 0 bytes .../air_project/sub_kernel_0_core_0_4.opt.ll | 64 -- .../sub_kernel_0_core_0_4.peanohack.ll | 95 --- .../air_project/sub_kernel_0_core_0_5.elf | Bin 1660 -> 0 bytes .../sub_kernel_0_core_0_5.ld.script | 72 --- .../air_project/sub_kernel_0_core_0_5.ll | 95 --- .../air_project/sub_kernel_0_core_0_5.o | Bin 984 -> 0 bytes .../air_project/sub_kernel_0_core_0_5.opt.ll | 64 -- .../sub_kernel_0_core_0_5.peanohack.ll | 95 --- 
.../air_project/sub_kernel_0_design.bif | 10 - .../sub_kernel_0_sub_kernel_0_sequence.bin | Bin 3248 -> 0 bytes .../elementwise_arith/air_project/tt.mlir | 35 - examples/elementwise_arith/tt.shared.mlir | 1 - 151 files changed, 8875 deletions(-) delete mode 100644 examples/elementwise_arith/air_project/aie.asm_air_output.mlir delete mode 100644 examples/elementwise_arith/air_project/aie.elf delete mode 100644 examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir delete mode 100644 examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir delete mode 100644 examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir delete mode 100755 examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh delete mode 100755 examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh delete mode 100755 examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh delete mode 100644 examples/elementwise_arith/air_project/airinput.mlir delete mode 100644 examples/elementwise_arith/air_project/asm_air_output.mlir delete mode 100644 examples/elementwise_arith/air_project/asm_src.mlir delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0.pdi delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_elfs.bin delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_enable.bin delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_init.bin delete mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.elf delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.o delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.opt.ll delete mode 100644 
examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll delete mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.elf delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ld.script delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.o delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.opt.ll delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll delete mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.elf delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.o delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.opt.ll delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll delete mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.elf delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.o delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.opt.ll delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_design.bif delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_div_kernel_0_sequence.bin delete mode 100644 examples/elementwise_arith/air_project/empty_0.pdi delete mode 100644 examples/elementwise_arith/air_project/empty_0_aie_cdo_elfs.bin delete mode 100644 
examples/elementwise_arith/air_project/empty_0_aie_cdo_enable.bin delete mode 100644 examples/elementwise_arith/air_project/empty_0_aie_cdo_init.bin delete mode 100644 examples/elementwise_arith/air_project/empty_0_design.bif delete mode 100644 examples/elementwise_arith/air_project/full_elf_config.json delete mode 100644 examples/elementwise_arith/air_project/input_with_addresses.mlir delete mode 100644 examples/elementwise_arith/air_project/main.pdi delete mode 100644 examples/elementwise_arith/air_project/main_aie_cdo_elfs.bin delete mode 100644 examples/elementwise_arith/air_project/main_aie_cdo_enable.bin delete mode 100644 examples/elementwise_arith/air_project/main_aie_cdo_init.bin delete mode 100644 examples/elementwise_arith/air_project/main_design.bif delete mode 100644 examples/elementwise_arith/air_project/main_div_kernel.bin delete mode 100644 examples/elementwise_arith/air_project/main_mul_kernel.bin delete mode 100644 examples/elementwise_arith/air_project/main_square_kernel.bin delete mode 100644 examples/elementwise_arith/air_project/main_sub_kernel.bin delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0.pdi delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_elfs.bin delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_enable.bin delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_init.bin delete mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.elf delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.o delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll delete mode 100755 
examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.elf delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ld.script delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.o delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll delete mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.elf delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.o delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.opt.ll delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll delete mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.elf delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.o delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_design.bif delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_mul_kernel_0_sequence.bin delete mode 100644 examples/elementwise_arith/air_project/npu.asm_air_output.mlir delete mode 100644 examples/elementwise_arith/air_project/placed.asm_air_output.mlir delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0.pdi delete mode 100644 
examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_elfs.bin delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_enable.bin delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_init.bin delete mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.elf delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.o delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.opt.ll delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll delete mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.elf delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ld.script delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.o delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll delete mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.elf delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ld.script delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.o delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll delete mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.elf delete mode 100644 
examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.o delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_design.bif delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0.pdi delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_elfs.bin delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_enable.bin delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_init.bin delete mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.elf delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.o delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll delete mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.elf delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.o delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll 
delete mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.elf delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.o delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll delete mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.elf delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ld.script delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.o delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_design.bif delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_sub_kernel_0_sequence.bin delete mode 100644 examples/elementwise_arith/air_project/tt.mlir delete mode 100644 examples/elementwise_arith/tt.shared.mlir diff --git a/examples/elementwise_arith/air_project/aie.asm_air_output.mlir b/examples/elementwise_arith/air_project/aie.asm_air_output.mlir deleted file mode 100644 index e55b5a1..0000000 --- a/examples/elementwise_arith/air_project/aie.asm_air_output.mlir +++ /dev/null @@ -1,386 +0,0 @@ -#loop_annotation = #llvm.loop_annotation -module { - aie.device(npu2) @square_kernel_0 { - %shim_noc_tile_0_0 = aie.tile(0, 0) - %shim_noc_tile_1_0 = aie.tile(1, 0) - %mem_tile_0_1 = aie.tile(0, 1) - %mem_tile_1_1 = aie.tile(1, 1) - %tile_0_2 = aie.tile(0, 2) - %tile_0_3 = aie.tile(0, 3) - %tile_0_4 = aie.tile(0, 4) - %tile_0_5 = aie.tile(0, 5) - %lock_0_1 = 
aie.lock(%mem_tile_0_1, 1) {init = 4 : i32} - %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32} - %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32} - %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32} - %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32} - %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32} - %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32} - %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32} - %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32} - %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32} - %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32} - %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32} - %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32} - %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32} - %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32} - %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32} - %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32} - %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32} - %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32} - %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32} - %buf9 = aie.buffer(%mem_tile_0_1) {sym_name = "buf9"} : memref<1024xi16, 1 : i32> - %buf8 = aie.buffer(%mem_tile_1_1) {sym_name = "buf8"} : memref<1024xi16, 1> - %buf7 = aie.buffer(%tile_0_5) {sym_name = "buf7"} : memref<256xi16, 2> - %buf6 = aie.buffer(%tile_0_5) {sym_name = "buf6"} : memref<256xi16, 2> - %buf5 = aie.buffer(%tile_0_4) {sym_name = "buf5"} : memref<256xi16, 2> - %buf4 = aie.buffer(%tile_0_4) {sym_name = "buf4"} : memref<256xi16, 2> - %buf3 = aie.buffer(%tile_0_3) {sym_name = "buf3"} : memref<256xi16, 2> - %buf2 = aie.buffer(%tile_0_3) {sym_name = "buf2"} : memref<256xi16, 2> - %buf1 = aie.buffer(%tile_0_2) {sym_name = "buf1"} : memref<256xi16, 2> - %buf0 = aie.buffer(%tile_0_2) {sym_name = "buf0"} : memref<256xi16, 2> - %mem_0_5 = aie.mem(%tile_0_5) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_5_13, 
AcquireGreaterEqual, 1) - aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_5_12, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1) - aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_5_11, Release, 1) - aie.next_bd ^bb4 - } - %core_0_5 = aie.core(%tile_0_5) { - %0 = ub.poison : i16 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb2 - aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1) - cf.br ^bb2 - ^bb2: // pred: ^bb1 - aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c256 step %c32 { - %subview = memref.subview %buf7[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_14 = memref.subview %buf6[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %2 = arith.muli %1, %1 : vector<32xi16> - vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } {loop_annotation = #loop_annotation} - aie.use_lock(%lock_0_5, Release, 1) - aie.use_lock(%lock_0_5_13, Release, 1) - cf.br ^bb1 - } - %mem_0_4 = aie.mem(%tile_0_4) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1) - aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_4_9, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1) - 
aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_4_8, Release, 1) - aie.next_bd ^bb4 - } - %core_0_4 = aie.core(%tile_0_4) { - %0 = ub.poison : i16 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb2 - aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1) - cf.br ^bb2 - ^bb2: // pred: ^bb1 - aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c256 step %c32 { - %subview = memref.subview %buf5[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_14 = memref.subview %buf4[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %2 = arith.muli %1, %1 : vector<32xi16> - vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } {loop_annotation = #loop_annotation} - aie.use_lock(%lock_0_4, Release, 1) - aie.use_lock(%lock_0_4_10, Release, 1) - cf.br ^bb1 - } - %mem_0_3 = aie.mem(%tile_0_3) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1) - aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_3_6, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1) - aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_3_5, Release, 1) - aie.next_bd ^bb4 - } - %core_0_3 = aie.core(%tile_0_3) { - %0 = ub.poison : i16 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb2 - 
aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1) - cf.br ^bb2 - ^bb2: // pred: ^bb1 - aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c256 step %c32 { - %subview = memref.subview %buf3[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_14 = memref.subview %buf2[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %2 = arith.muli %1, %1 : vector<32xi16> - vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } {loop_annotation = #loop_annotation} - aie.use_lock(%lock_0_3, Release, 1) - aie.use_lock(%lock_0_3_7, Release, 1) - cf.br ^bb1 - } - %mem_0_2 = aie.mem(%tile_0_2) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) - aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_2_3, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) - aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_2_2, Release, 1) - aie.next_bd ^bb4 - } - %core_0_2 = aie.core(%tile_0_2) { - %0 = ub.poison : i16 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb2 - aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) - cf.br ^bb2 - ^bb2: // pred: ^bb1 - aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c256 step %c32 { - %subview = memref.subview %buf1[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_14 = memref.subview %buf0[%arg0] 
[32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %2 = arith.muli %1, %1 : vector<32xi16> - vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } {loop_annotation = #loop_annotation} - aie.use_lock(%lock_0_2, Release, 1) - aie.use_lock(%lock_0_2_4, Release, 1) - cf.br ^bb1 - } - air.channel @channel_0 [] - air.channel @channel_2 [1, 1] - air.channel @channel_8 [1, 1] - air.channel @channel_9 [1, 1] - air.channel @channel_10 [1, 1] - air.channel @channel_4 [1, 1] - air.channel @channel_5 [1, 1] - air.channel @channel_6 [1, 1] - air.channel @channel_7 [1, 1] - air.channel @channel_3 [] - aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0) - aie.flow(%mem_tile_1_1, DMA : 0, %shim_noc_tile_1_0, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0) - aie.flow(%tile_0_2, DMA : 0, %mem_tile_1_1, DMA : 0) - aie.flow(%tile_0_3, DMA : 0, %mem_tile_1_1, DMA : 1) - aie.flow(%tile_0_4, DMA : 0, %mem_tile_1_1, DMA : 2) - aie.flow(%tile_0_5, DMA : 0, %mem_tile_1_1, DMA : 3) - %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {task_id = 0 : i32} - aie.use_lock(%lock_1_1, Release, 4) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd 
^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb10 - } - %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_1, 
Release, 1) - aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32} - aie.use_lock(%lock_0_1_0, Release, 4) - aie.next_bd ^bb10 - } - aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0) - aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0) - } {dlti.dl_spec = #dlti.dl_spec} - airrt.module_metadata{ - airrt.segment_metadata attributes {dma_allocations = [{channel = 2 : i64, col = 0 : i64, id = 3 : i64, location = 0 : i64, row = -1 : i64}], sym_name = "square_kernel_0"}{ - airrt.herd_metadata {dma_allocations = [], loc_x = 0 : i64, loc_y = 2 : i64, size_x = 1 : i64, size_y = 4 : i64, sym_name = "herd_0"} - } - } - air.channel @channel_0 [] - air.channel @channel_1 [4, 1] - air.channel @channel_2 [4, 1] - air.channel @channel_3 [] - func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { - %c1 = arith.constant 1 : index - %0 = air.launch async (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> attributes {id = 1 : i32} { - %c1024 = arith.constant 1024 : index - %c1_0 = arith.constant 1 : index - %1 = arith.muli %arg8, %c1024 : index - %2 = air.channel.put async @channel_0[] (%arg14[%1] [%c1024] [%c1_0]) {id = 1 : i32, metadataArray = [{base = "air_channel_0", index = 0 : i32}]} : (memref<*xi16>) - %3 = air.channel.get async @channel_3[] (%arg15[%1] [%c1024] [%c1_0]) {id = 2 : i32, metadataArray = [{base = "air_channel_3", index = 0 : i32}]} : (memref<*xi16>) - %4 = air.segment @square_kernel_0 async attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 8 : i64, y_loc = 2 : i64, y_size = 6 : i64} { - %c4 = arith.constant 4 : index - %c768 
= arith.constant 768 : index - %c3 = arith.constant 3 : index - %c512 = arith.constant 512 : index - %c2 = arith.constant 2 : index - %c256 = arith.constant 256 : index - %c0 = arith.constant 0 : index - %c1_1 = arith.constant 1 : index - %async_token, %results = air.execute -> (memref<1024xi16, 1 : i32>) { - %alloc = memref.alloc() : memref<1024xi16, 1 : i32> - air.execute_terminator %alloc : memref<1024xi16, 1 : i32> - } - %5 = air.channel.get async [%async_token] @channel_0[] (%results[] [] []) {id = 3 : i32} : (memref<1024xi16, 1 : i32>) - %async_token_2, %results_3 = air.execute -> (memref<1024xi16, 1>) { - %alloc = memref.alloc() : memref<1024xi16, 1> - air.execute_terminator %alloc : memref<1024xi16, 1> - } - %6 = air.channel.put async [%5] @channel_1[%c0, %c0] (%results[%c0] [%c256] [%c1_1]) {id = 4 : i32} : (memref<1024xi16, 1 : i32>) - %7 = air.channel.put async [%5] @channel_1[%c1_1, %c0] (%results[%c256] [%c256] [%c1_1]) {id = 5 : i32} : (memref<1024xi16, 1 : i32>) - %8 = air.channel.put async [%5] @channel_1[%c2, %c0] (%results[%c512] [%c256] [%c1_1]) {id = 6 : i32} : (memref<1024xi16, 1 : i32>) - %9 = air.channel.put async [%5] @channel_1[%c3, %c0] (%results[%c768] [%c256] [%c1_1]) {id = 7 : i32} : (memref<1024xi16, 1 : i32>) - %10 = air.channel.get async [%async_token_2] @channel_2[%c0, %c0] (%results_3[%c0] [%c256] [%c1_1]) {id = 8 : i32} : (memref<1024xi16, 1>) - %11 = air.channel.get async [%async_token_2] @channel_2[%c1_1, %c0] (%results_3[%c256] [%c256] [%c1_1]) {id = 9 : i32} : (memref<1024xi16, 1>) - %12 = air.channel.get async [%async_token_2] @channel_2[%c2, %c0] (%results_3[%c512] [%c256] [%c1_1]) {id = 10 : i32} : (memref<1024xi16, 1>) - %13 = air.channel.get async [%async_token_2] @channel_2[%c3, %c0] (%results_3[%c768] [%c256] [%c1_1]) {id = 11 : i32} : (memref<1024xi16, 1>) - %14 = air.herd @herd_0 async [%5, %async_token_2] tile (%arg16, %arg17) in (%arg18=%c1_1, %arg19=%c4) attributes {id = 3 : i32, x_loc = 0 : i64, y_loc = 2 : i64} { 
- %c32 = arith.constant 32 : index - %c256_5 = arith.constant 256 : index - %c0_6 = arith.constant 0 : index - %16 = ub.poison : i16 - %async_token_7, %results_8 = air.execute -> (memref<256xi16, 2>) { - %alloc = memref.alloc() : memref<256xi16, 2> - air.execute_terminator %alloc : memref<256xi16, 2> - } - %17 = air.channel.get async [%async_token_7] @channel_1[%arg17, %c0_6] (%results_8[] [] []) {id = 12 : i32} : (memref<256xi16, 2>) - %async_token_9, %results_10 = air.execute -> (memref<256xi16, 2>) { - %alloc = memref.alloc() : memref<256xi16, 2> - air.execute_terminator %alloc : memref<256xi16, 2> - } - %18 = air.wait_all async [%17, %async_token_9] - %19 = scf.for %arg20 = %c0_6 to %c256_5 step %c32 iter_args(%arg21 = %18) -> (!air.async.token) { - %subview = memref.subview %results_8[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_13 = memref.subview %results_10[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %async_token_14, %results_15 = air.execute [%arg21] -> (vector<32xi16>) { - %23 = vector.transfer_read %subview[%c0_6], %16 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - air.execute_terminator %23 : vector<32xi16> - } - %21 = arith.muli %results_15, %results_15 : vector<32xi16> - %async_token_16 = air.execute [%arg21] { - vector.transfer_write %21, %subview_13[%c0_6] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } - %22 = air.wait_all async [%async_token_14, %async_token_16] - scf.yield %22 : !air.async.token - } - %20 = air.channel.put async [%async_token_9] @channel_2[%arg17, %c0_6] (%results_10[] [] []) {id = 13 : i32} : (memref<256xi16, 2>) - %async_token_11 = air.execute [%17] { - memref.dealloc %results_8 : memref<256xi16, 2> - } - %async_token_12 = air.execute [%20] { - memref.dealloc %results_10 : memref<256xi16, 2> - } - } - %15 = air.channel.put async [%14] @channel_3[] (%results_3[] [] 
[]) {id = 14 : i32} : (memref<1024xi16, 1>) - %async_token_4 = air.execute [%15] { - memref.dealloc %results_3 : memref<1024xi16, 1> - } - air.wait_all [%6, %7, %8, %9, %10, %11, %12, %13, %async_token_4] {air.segment_end} - } - } - return - } -} diff --git a/examples/elementwise_arith/air_project/aie.elf b/examples/elementwise_arith/air_project/aie.elf deleted file mode 100644 index d54eb1201d9709da9aaf31632fd7188b680af270..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 29488 zcmeHQU2GiH6~43HBuksZx^>Z#1Tpym0*$P798^G28DpmkZ7P-oP*jkzCYTU4aTdpE zL#2wPjVNtVWwm`URF(YF7sSJ=KZvF;TfqaZP_^PIFO3|^BSIU*1C?mq^WA&T+%rEn z4JC4HG{>6VbH8)u-ZST(o!y=J_WGGU`@S9o!JeuzQc?l^8QDco)fVfax<+kRLu6e` z5Ns?V?SQRp0qo!8PDpQ&j%^L;7zd_a$zY_acFknm@cHjO@z7g;e)&aY z3CEAn@XUGqW&3YK)Qsi6iAE!=hN>PTPQiZvkDR{!FE3#{x&P$RsY6GPPac_a9a3iV z<=5Nqx{*??c#~dBRvsjf>Z2sbkba*=|M}9=zfoi36Zg{yN)3?y$3JzJ25B(fK5O}Y zi;C*6wBcFGa^!o@`^kL2K{0&zdz0@{K-mws#VgT6WaWtY7<|`EUJvaK7>P!FR{jO+G&| z|7Bz2$J+G~75&~|@rLMW< zHz*??E#Zal>Hs^T&2_+St^;m!9dKKPqp7+YN!3-dtT@meMJ>6T>)2gf|GrwFPH9NoD=K1F~&p)?S zcpz0z52WfT*;eP1?TTn8>uG;cPph(?%=~hF)a?9no9Ca~JpX88{`s?%{Op9!R%k;8 z0@C`MW4$W$Q0R4`M?!B1y(#pT(Az?v6Z(SCJ3{XYy(jcVq4%wBj&C&a8o_&f^7ux? 
z@i{&0JH0OSNazirH-+93dRypoLSGPiN9bLl_k_ME^uE>2@r|dB&$QkAjfvxPdf0b* zUFea}8$xdiy(RRv(C37{AoPyVyF%{?eNpIrtDECHkUBoocE|UCI6kL`eW%xj9tpi6 z^rp~TLT?LwPUs6l?+Cps^q$Zch29r>Rc+GOFZ64TbUrS^OZd)dab2>|>r~r;#Y0i^ zRe>vk*Qr*VzpLr0z?G=2I_2K^yBetqT#31=)3G>zm-DK?l|9};oKLK*3Gi*6q(}P% zIt8X}9~bW_ug&uhwSScGMdP1`%GV-4Lv#*=pmwTO=lIIhhOX;M*aiu7@r-@Kc;Fvl z$b`6f$WCJ7F?v94#9F5Jf9NJ2XG_LIbM}ey1OJG6O^S<$>?9^0qq}%at(bV6Eg8?| zPZ$sUBR!^kaPg3x#KdFtfZ8TMrq)e7&Sv8IoZe5V4U|Zz+_tcNias7YBmG18V*s@w z>1bPHOVP8^UkzCQ;HP@8^iFC2i2E3kZ0 zPd?NBG+)NQls{h?{{ido$!FT1=F46{_RQDE(HWb6eN;C8TIc-lUb_L;D|D{zBpt%G z32O7)L@C=oYIBbFrfg+u^W3PEZFheF*L>TEYv>|f53_oO@I6X(fh&U>{euB@M{v`1 z8dn?fO40jn4>rd8)(Pi+l)xjxDSqu=7WfXrX%vlD1b!#s6p_Z)3%uGNDA6SwjYk&; zaA;;l0`e7C^M z{jIpBtmCX*+#2U6B0SEa!1ogF=EeC>5KixQx}Q+^?;||Ut-ucw9>*i_a(`6xGaARU zi*Of@!*>%-m*@1j+b%kMAK`9XhYu0%#&vkPKQ4}IT#P%e=)#=tf73;W?<0TLzr%+J zcl|rO+&>`tKM?mnM7Zn6;kEwO(&z2CJ1%aG>mejOo_B!{5&lK%zgOU!2#@PW;I;m! z=zmo7KPvinc&$G!{KvJwJ{R{X`t@`;^!|i)xgV73)T9^_-@nKU$Ekhz8l-ERYe=qb zt^uGe<+FDG0iOLz8ssc1x%*+#E1 z#MGt@HLjzh`zvvuS2{P59wdHPw%pipS9EULaay#pY`L-HuISu!uTpO8xGOq0?$xT3 zv}|=_$6e96iQ+Ta^5({lyP|W`zDl{VBNI}8nSY7wt)Ahphn^!M+ zHm;XE8`n#o4d1<3P(yOOD&6~sg=r6+BI{_^DAqK z>m|>|^^#}fddahaYr>MP-7#&-uU)?ld<00|e&8mgZGlGcaBoq&Iv1<8cFo-45HR=2 z+V$JGUh?+iddaha*TRAtlH>6`yu2C&-vN?m1NXsa0}mo?bG5s_s9l|%)mpn|Zh3xX zEpff%*|=WvY+NsSHt=N_Prp{!#&*OKOMi2$;+h!??&GsaobUC+CUNiqSR@{K@P-F( zdhnJ9Z+q}L55C~RJ0862!FwKj(S!F>IDfX}&w9D%4)HmV=ah5ITnV&gzQLzpk$C9A z!OLKg{*ec7c<`nNZ+Y;x2cPrc3m&}V!Mh&3=fM{}ct3@^c{87L^5@Oikk{gEu^Q(}TA>c-w=|dGG}f-tpjF58m_Oiypk6!ri=?_xSvI zbMODzd2{D;nQ!omSR@{LaPX8^q<`eW8y>vr!CM}@?ZM|f_<{%Tc<`S04YmJ&$Dj&p-cGJ^m_n zJ~GkJJv>|Q+5Yp-6IGADO1*Ae|E6xU{pZ(n)y`wuU!~?3b}hXJ%nx@w95eO?e*L=&e!x9$Fq?Qnp2@qP?z8UvOn?3jK2o*Ic4DJu|N2K`aV4X4`)sK zgZ~%%UxEY3{e%A(`+GQR=%0_4d4OokdX=0&i4%`ZpLa2y#EDaui-Rx7eM^7I9h5jY zgz(3M#H*hE!7GG6cyH2QatwiUeiZTZ;{5pLmpF#8Kk@%!f6sh@1IYeV56=G*9K+aO z_V1Z5*+12T>0fhH{TxH&1VZy`6nd2Tq50J~;*dDFhY(;Ampnv?gO3PF_anK9(jOc} zNP4~`Us3vlw+KnsgXAzue{dNg;LS1YE*|zu`M- 
z-kiiOClb3P#EH4=vp8`v0fLt&`?vhgj6b-Z z05ivBc$^u3a6kcq2P*rwe9(+PxS;^STa^BmCz|mGXA}b7sPwn|(TqR1q!92orT=!r z`^@-*V+zT>UJOpAL_gr3LWrW4z%36IrUVX7DkM|4o_N4jg=F&M@dt+$l8wjUWJ<&X zZYw04pL~CCULoMU%6wY>D@+L-Tv$lXCc}7GB}wM@$9nx;ML0b zEx#6BC2(+UA=&uz`v(UXl8ry#AKY9>HhzPXDG?7iyO3-=`TpSY!XJEIdAyd_3sV9I z#}{Dc^=|mP8Gmqp0fPT4`?oycj6XQR05j(wPygTw!#^9p!O4^uH?;oQ`1Ae2Eza-X z;N1TJoFL#FL-2WnxX1GPH~7ae(b)PkiL~*xYV3_IX&WWP;8~lgoBaLaT8R6EpO!1l z1?7J2^YZc5%z$zJnF~uVu!1MM3UbpnG4kWG)r}o@Mb}N2vtpi?EjM=D6`dP>MsxXD zw%pipS9ETo_$;pzv~0Pt~LKTKmPPI?PH#wKCXsmrjAVl)X3!VqleY- zk(m>xTJXJeXEcoQBxU-$H}Ko zP0mam`p(qM^whCK^~T}kpGw&eot%2=)YSChDcX70@yVmpN%zL#=evqCu-ZpDbYMk_03Pj^wrQU-Mf70jZbGzw3 zP!$T!^dG2Kpkv%9P}}I@g6Tg{Z$by&4pc;68cqL!dKWrys_jSV%eLu1Q1}j1u}L_% zg>Zdcl465ylQFy1;t|CL9XRQ8+brIuK5n)+>90UXKV4d1@)nDe{w8$YKgD*d#c5vO zg$|sK5#HWA0oZm-&*79Afhp85d^)Ss^oOlIY;Fr-OfnI;JSPzrlROnta-p2Gszjx>L z5YZl_6S7-hwO9|6?$*U?yP}jpx$~^`DFNFmbNM`bo@(ht3>$y(W!a|B~OMi}f%~zCz2tM825Q2|W^cL+DMRw}jpn z`kc@ggx(Q)SLi*VFABYHbu)j(dYE)~66LGw9QfA!W!fA;+~KfLYkd#nF?pUjsn`m>qvKG#s{HPMD0LVQej zub1>W`kC3|Fr?&5)Ye}nEdC++QI{G+UmQEm=xF1(AWdqtcra??_#j(oV?WsO{{|%i zl>O-TX6%RmH;Df`nAcxj_%+F=OTA) r0rEI{6bK&&;^*V|1?lqs^EvVx;?C$9#|U|uHtF{s`Gqi8VZZ+Zn&r4p diff --git a/examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir b/examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir deleted file mode 100644 index 15c21c6..0000000 --- a/examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir +++ /dev/null @@ -1,411 +0,0 @@ -#loop_annotation = #llvm.loop_annotation -module { - aie.device(npu2) @sub_kernel_0 { - %shim_noc_tile_0_0 = aie.tile(0, 0) - %shim_noc_tile_1_0 = aie.tile(1, 0) - %shim_noc_tile_2_0 = aie.tile(2, 0) - %mem_tile_0_1 = aie.tile(0, 1) - %mem_tile_1_1 = aie.tile(1, 1) - %mem_tile_2_1 = aie.tile(2, 1) - %tile_0_2 = aie.tile(0, 2) - %tile_0_3 = aie.tile(0, 3) - %tile_0_4 = aie.tile(0, 4) - %tile_0_5 = aie.tile(0, 5) - %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32} - 
%lock_1_1_0 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32} - %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32} - %lock_0_1_1 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32} - %lock_2_1 = aie.lock(%mem_tile_2_1, 1) {init = 4 : i32} - %lock_2_1_2 = aie.lock(%mem_tile_2_1, 0) {init = 0 : i32} - %lock_0_2 = aie.lock(%tile_0_2, 5) {init = 1 : i32} - %lock_0_2_3 = aie.lock(%tile_0_2, 4) {init = 0 : i32} - %lock_0_2_4 = aie.lock(%tile_0_2, 3) {init = 1 : i32} - %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i32} - %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i32} - %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i32} - %lock_0_3 = aie.lock(%tile_0_3, 5) {init = 1 : i32} - %lock_0_3_8 = aie.lock(%tile_0_3, 4) {init = 0 : i32} - %lock_0_3_9 = aie.lock(%tile_0_3, 3) {init = 1 : i32} - %lock_0_3_10 = aie.lock(%tile_0_3, 2) {init = 0 : i32} - %lock_0_3_11 = aie.lock(%tile_0_3, 1) {init = 1 : i32} - %lock_0_3_12 = aie.lock(%tile_0_3, 0) {init = 0 : i32} - %lock_0_4 = aie.lock(%tile_0_4, 5) {init = 1 : i32} - %lock_0_4_13 = aie.lock(%tile_0_4, 4) {init = 0 : i32} - %lock_0_4_14 = aie.lock(%tile_0_4, 3) {init = 1 : i32} - %lock_0_4_15 = aie.lock(%tile_0_4, 2) {init = 0 : i32} - %lock_0_4_16 = aie.lock(%tile_0_4, 1) {init = 1 : i32} - %lock_0_4_17 = aie.lock(%tile_0_4, 0) {init = 0 : i32} - %lock_0_5 = aie.lock(%tile_0_5, 5) {init = 1 : i32} - %lock_0_5_18 = aie.lock(%tile_0_5, 4) {init = 0 : i32} - %lock_0_5_19 = aie.lock(%tile_0_5, 3) {init = 1 : i32} - %lock_0_5_20 = aie.lock(%tile_0_5, 2) {init = 0 : i32} - %lock_0_5_21 = aie.lock(%tile_0_5, 1) {init = 1 : i32} - %lock_0_5_22 = aie.lock(%tile_0_5, 0) {init = 0 : i32} - %buf14 = aie.buffer(%mem_tile_0_1) {sym_name = "buf14"} : memref<1024xi16, 1 : i32> - %buf13 = aie.buffer(%mem_tile_1_1) {sym_name = "buf13"} : memref<1024xi16, 1 : i32> - %buf12 = aie.buffer(%mem_tile_2_1) {sym_name = "buf12"} : memref<1024xi16, 1> - %buf11 = aie.buffer(%tile_0_5) {sym_name = "buf11"} : memref<256xi16, 2> - %buf10 = 
aie.buffer(%tile_0_5) {sym_name = "buf10"} : memref<256xi16, 2> - %buf9 = aie.buffer(%tile_0_5) {sym_name = "buf9"} : memref<256xi16, 2> - %buf8 = aie.buffer(%tile_0_4) {sym_name = "buf8"} : memref<256xi16, 2> - %buf7 = aie.buffer(%tile_0_4) {sym_name = "buf7"} : memref<256xi16, 2> - %buf6 = aie.buffer(%tile_0_4) {sym_name = "buf6"} : memref<256xi16, 2> - %buf5 = aie.buffer(%tile_0_3) {sym_name = "buf5"} : memref<256xi16, 2> - %buf4 = aie.buffer(%tile_0_3) {sym_name = "buf4"} : memref<256xi16, 2> - %buf3 = aie.buffer(%tile_0_3) {sym_name = "buf3"} : memref<256xi16, 2> - %buf2 = aie.buffer(%tile_0_2) {sym_name = "buf2"} : memref<256xi16, 2> - %buf1 = aie.buffer(%tile_0_2) {sym_name = "buf1"} : memref<256xi16, 2> - %buf0 = aie.buffer(%tile_0_2) {sym_name = "buf0"} : memref<256xi16, 2> - %mem_0_5 = aie.mem(%tile_0_5) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_5_22, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_5_21, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb5 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_5_19, AcquireGreaterEqual, 1) - aie.dma_bd(%buf11 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_5_20, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1) - aie.dma_bd(%buf10 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_5_18, Release, 1) - aie.next_bd ^bb6 - } - %core_0_5 = aie.core(%tile_0_5) { - %0 = ub.poison : i16 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_5_21, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_5_20, AcquireGreaterEqual, 1) - 
aie.use_lock(%lock_0_5_18, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c256 step %c32 { - %subview = memref.subview %buf11[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_23 = memref.subview %buf10[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_24 = memref.subview %buf9[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %3 = arith.subi %1, %2 : vector<32xi16> - vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } {loop_annotation = #loop_annotation} - aie.use_lock(%lock_0_5_19, Release, 1) - aie.use_lock(%lock_0_5, Release, 1) - aie.use_lock(%lock_0_5_22, Release, 1) - cf.br ^bb1 - } - %mem_0_4 = aie.mem(%tile_0_4) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_4_17, AcquireGreaterEqual, 1) - aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_4_16, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb5 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_4_14, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_4_15, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1) - aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_4_13, Release, 1) - aie.next_bd ^bb6 - } - %core_0_4 = aie.core(%tile_0_4) { - %0 = ub.poison : i16 - 
%c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_4_16, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_4_15, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_4_13, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c256 step %c32 { - %subview = memref.subview %buf8[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_23 = memref.subview %buf7[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_24 = memref.subview %buf6[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %3 = arith.subi %1, %2 : vector<32xi16> - vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } {loop_annotation = #loop_annotation} - aie.use_lock(%lock_0_4_14, Release, 1) - aie.use_lock(%lock_0_4, Release, 1) - aie.use_lock(%lock_0_4_17, Release, 1) - cf.br ^bb1 - } - %mem_0_3 = aie.mem(%tile_0_3) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_3_12, AcquireGreaterEqual, 1) - aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_3_11, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb5 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_3_9, AcquireGreaterEqual, 1) - aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_3_10, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) - ^bb6: // 2 preds: ^bb5, 
^bb6 - aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1) - aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_3_8, Release, 1) - aie.next_bd ^bb6 - } - %core_0_3 = aie.core(%tile_0_3) { - %0 = ub.poison : i16 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_3_11, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_3_10, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_3_8, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c256 step %c32 { - %subview = memref.subview %buf5[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_23 = memref.subview %buf4[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_24 = memref.subview %buf3[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %3 = arith.subi %1, %2 : vector<32xi16> - vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } {loop_annotation = #loop_annotation} - aie.use_lock(%lock_0_3_9, Release, 1) - aie.use_lock(%lock_0_3, Release, 1) - aie.use_lock(%lock_0_3_12, Release, 1) - cf.br ^bb1 - } - %mem_0_2 = aie.mem(%tile_0_2) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_2_7, AcquireGreaterEqual, 1) - aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_2_6, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb5 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_2_4, 
AcquireGreaterEqual, 1) - aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_2_5, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) - aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_2_3, Release, 1) - aie.next_bd ^bb6 - } - %core_0_2 = aie.core(%tile_0_2) { - %0 = ub.poison : i16 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c256 step %c32 { - %subview = memref.subview %buf2[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_23 = memref.subview %buf1[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_24 = memref.subview %buf0[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %3 = arith.subi %1, %2 : vector<32xi16> - vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } {loop_annotation = #loop_annotation} - aie.use_lock(%lock_0_2_4, Release, 1) - aie.use_lock(%lock_0_2, Release, 1) - aie.use_lock(%lock_0_2_7, Release, 1) - cf.br ^bb1 - } - aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0) - aie.flow(%shim_noc_tile_1_0, DMA : 0, %mem_tile_1_1, DMA : 0) - aie.flow(%mem_tile_2_1, DMA : 0, %shim_noc_tile_2_0, DMA : 0) - 
aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0) - aie.flow(%mem_tile_1_1, DMA : 0, %tile_0_2, DMA : 1) - aie.flow(%mem_tile_1_1, DMA : 1, %tile_0_3, DMA : 1) - aie.flow(%mem_tile_1_1, DMA : 2, %tile_0_4, DMA : 1) - aie.flow(%mem_tile_1_1, DMA : 3, %tile_0_5, DMA : 1) - aie.flow(%tile_0_2, DMA : 0, %mem_tile_2_1, DMA : 0) - aie.flow(%tile_0_3, DMA : 0, %mem_tile_2_1, DMA : 1) - aie.flow(%tile_0_4, DMA : 0, %mem_tile_2_1, DMA : 2) - aie.flow(%tile_0_5, DMA : 0, %mem_tile_2_1, DMA : 3) - %memtile_dma_2_1 = aie.memtile_dma(%mem_tile_2_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_2_1_2, AcquireGreaterEqual, 4) - aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 1024) {task_id = 0 : i32} - aie.use_lock(%lock_2_1, Release, 4) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_2_1_2, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf12 : memref<1024xi16, 1>, 256, 256) {task_id = 0 : i32} - aie.use_lock(%lock_2_1_2, Release, 1) - aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf12 : memref<1024xi16, 1>, 512, 256) {task_id = 0 : i32} - aie.use_lock(%lock_2_1_2, Release, 1) - aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf12 : 
memref<1024xi16, 1>, 768, 256) {task_id = 0 : i32} - aie.use_lock(%lock_2_1_2, Release, 1) - aie.next_bd ^bb10 - } - %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) - aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32} - aie.use_lock(%lock_0_1_1, Release, 4) - aie.next_bd ^bb10 - } - %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_1_1, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(MM2S, 1, 
^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32} - aie.use_lock(%lock_1_1, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32} - aie.use_lock(%lock_1_1, Release, 1) - aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32} - aie.use_lock(%lock_1_1, Release, 1) - aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 4) - aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32} - aie.use_lock(%lock_1_1_0, Release, 4) - aie.next_bd ^bb10 - } - aie.shim_dma_allocation @air_channel_5(%shim_noc_tile_2_0, S2MM, 0) - aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0) - aie.shim_dma_allocation @air_channel_1(%shim_noc_tile_1_0, MM2S, 0) - aie.runtime_sequence @sub_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { - %0 = aiex.dma_configure_task_for @air_channel_0 { - aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [, ]) - aie.end - } - aiex.dma_start_task(%0) - %1 = aiex.dma_configure_task_for @air_channel_1 { - aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [, ]) - aie.end - } - aiex.dma_start_task(%1) - %2 = aiex.dma_configure_task_for @air_channel_5 { - aie.dma_bd(%arg2 : memref<*xi16>, 0, 1024, [, ]) - aie.end - } {issue_token = true} - aiex.dma_start_task(%2) - aiex.dma_free_task(%0) - aiex.dma_await_task(%2) - 
aiex.dma_free_task(%1) - } - } {dlti.dl_spec = #dlti.dl_spec} - aie.device(npu2) { - aie.runtime_sequence @sub_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { - aiex.configure @sub_kernel_0 { - aiex.run @sub_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (memref<*xi16>, memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32) - } - } - } -} diff --git a/examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir b/examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir deleted file mode 100644 index fc9d492..0000000 --- a/examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir +++ /dev/null @@ -1,601 +0,0 @@ -#loop_annotation = #llvm.loop_annotation -module { - aie.device(npu2) @mul_kernel_0 { - %shim_noc_tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info} - %shim_noc_tile_1_0 = aie.tile(1, 0) {controller_id = #aie.packet_info} - %shim_noc_tile_2_0 = aie.tile(2, 0) {controller_id = #aie.packet_info} - %mem_tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info} - %mem_tile_1_1 = aie.tile(1, 1) {controller_id = #aie.packet_info} - %mem_tile_2_1 = aie.tile(2, 1) {controller_id = #aie.packet_info} - %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info} - %tile_0_3 = aie.tile(0, 3) {controller_id = #aie.packet_info} - %tile_0_4 = aie.tile(0, 4) {controller_id = #aie.packet_info} - %tile_0_5 = aie.tile(0, 5) {controller_id = #aie.packet_info} - %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32} - %lock_1_1_0 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32} - %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32} - %lock_0_1_1 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32} - %lock_2_1 = aie.lock(%mem_tile_2_1, 1) {init = 4 : i32} - %lock_2_1_2 = aie.lock(%mem_tile_2_1, 0) {init = 0 : i32} - %lock_0_2 = aie.lock(%tile_0_2, 5) {init = 1 : i32} - %lock_0_2_3 = 
aie.lock(%tile_0_2, 4) {init = 0 : i32} - %lock_0_2_4 = aie.lock(%tile_0_2, 3) {init = 1 : i32} - %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i32} - %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i32} - %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i32} - %lock_0_3 = aie.lock(%tile_0_3, 5) {init = 1 : i32} - %lock_0_3_8 = aie.lock(%tile_0_3, 4) {init = 0 : i32} - %lock_0_3_9 = aie.lock(%tile_0_3, 3) {init = 1 : i32} - %lock_0_3_10 = aie.lock(%tile_0_3, 2) {init = 0 : i32} - %lock_0_3_11 = aie.lock(%tile_0_3, 1) {init = 1 : i32} - %lock_0_3_12 = aie.lock(%tile_0_3, 0) {init = 0 : i32} - %lock_0_4 = aie.lock(%tile_0_4, 5) {init = 1 : i32} - %lock_0_4_13 = aie.lock(%tile_0_4, 4) {init = 0 : i32} - %lock_0_4_14 = aie.lock(%tile_0_4, 3) {init = 1 : i32} - %lock_0_4_15 = aie.lock(%tile_0_4, 2) {init = 0 : i32} - %lock_0_4_16 = aie.lock(%tile_0_4, 1) {init = 1 : i32} - %lock_0_4_17 = aie.lock(%tile_0_4, 0) {init = 0 : i32} - %lock_0_5 = aie.lock(%tile_0_5, 5) {init = 1 : i32} - %lock_0_5_18 = aie.lock(%tile_0_5, 4) {init = 0 : i32} - %lock_0_5_19 = aie.lock(%tile_0_5, 3) {init = 1 : i32} - %lock_0_5_20 = aie.lock(%tile_0_5, 2) {init = 0 : i32} - %lock_0_5_21 = aie.lock(%tile_0_5, 1) {init = 1 : i32} - %lock_0_5_22 = aie.lock(%tile_0_5, 0) {init = 0 : i32} - %buf14 = aie.buffer(%mem_tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf14"} : memref<1024xi16, 1 : i32> - %buf13 = aie.buffer(%mem_tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf13"} : memref<1024xi16, 1 : i32> - %buf12 = aie.buffer(%mem_tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf12"} : memref<1024xi16, 1> - %buf11 = aie.buffer(%tile_0_5) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf11"} : memref<256xi16, 2> - %buf10 = aie.buffer(%tile_0_5) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf10"} : memref<256xi16, 2> - %buf9 = aie.buffer(%tile_0_5) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf9"} : memref<256xi16, 2> 
- %buf8 = aie.buffer(%tile_0_4) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf8"} : memref<256xi16, 2> - %buf7 = aie.buffer(%tile_0_4) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf7"} : memref<256xi16, 2> - %buf6 = aie.buffer(%tile_0_4) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf6"} : memref<256xi16, 2> - %buf5 = aie.buffer(%tile_0_3) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<256xi16, 2> - %buf4 = aie.buffer(%tile_0_3) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf4"} : memref<256xi16, 2> - %buf3 = aie.buffer(%tile_0_3) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf3"} : memref<256xi16, 2> - %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<256xi16, 2> - %buf1 = aie.buffer(%tile_0_2) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf1"} : memref<256xi16, 2> - %buf0 = aie.buffer(%tile_0_2) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf0"} : memref<256xi16, 2> - %mem_0_5 = aie.mem(%tile_0_5) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_5_22, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_5_21, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb5 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_5_19, AcquireGreaterEqual, 1) - aie.dma_bd(%buf11 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_5_20, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1) - aie.dma_bd(%buf10 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} - 
aie.use_lock(%lock_0_5_18, Release, 1) - aie.next_bd ^bb6 - } - %core_0_5 = aie.core(%tile_0_5) { - %c0_i32 = arith.constant 0 : i32 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb4 - aie.use_lock(%lock_0_5_21, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_5_20, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_5_18, AcquireGreaterEqual, 1) - cf.br ^bb2(%c0 : index) - ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 - %1 = arith.cmpi slt, %0, %c256 : index - cf.cond_br %1, ^bb3, ^bb4 - ^bb3: // pred: ^bb2 - %2 = vector.load %buf11[%0] : memref<256xi16, 2>, vector<32xi16> - %3 = vector.load %buf10[%0] : memref<256xi16, 2>, vector<32xi16> - %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32> - %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16> - vector.store %5, %buf9[%0] : memref<256xi16, 2>, vector<32xi16> - %6 = arith.addi %0, %c32 : index - cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation} - ^bb4: // pred: ^bb2 - aie.use_lock(%lock_0_5_19, Release, 1) - aie.use_lock(%lock_0_5, Release, 1) - aie.use_lock(%lock_0_5_22, Release, 1) - cf.br ^bb1 - } - %mem_0_4 = aie.mem(%tile_0_4) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_4_17, AcquireGreaterEqual, 1) - aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_4_16, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb5 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_4_14, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_4_15, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_0_4, 
AcquireGreaterEqual, 1) - aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_4_13, Release, 1) - aie.next_bd ^bb6 - } - %core_0_4 = aie.core(%tile_0_4) { - %c0_i32 = arith.constant 0 : i32 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb4 - aie.use_lock(%lock_0_4_16, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_4_15, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_4_13, AcquireGreaterEqual, 1) - cf.br ^bb2(%c0 : index) - ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 - %1 = arith.cmpi slt, %0, %c256 : index - cf.cond_br %1, ^bb3, ^bb4 - ^bb3: // pred: ^bb2 - %2 = vector.load %buf8[%0] : memref<256xi16, 2>, vector<32xi16> - %3 = vector.load %buf7[%0] : memref<256xi16, 2>, vector<32xi16> - %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32> - %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16> - vector.store %5, %buf6[%0] : memref<256xi16, 2>, vector<32xi16> - %6 = arith.addi %0, %c32 : index - cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation} - ^bb4: // pred: ^bb2 - aie.use_lock(%lock_0_4_14, Release, 1) - aie.use_lock(%lock_0_4, Release, 1) - aie.use_lock(%lock_0_4_17, Release, 1) - cf.br ^bb1 - } - %mem_0_3 = aie.mem(%tile_0_3) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_3_12, AcquireGreaterEqual, 1) - aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_3_11, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb5 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_3_9, AcquireGreaterEqual, 1) - aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_3_10, Release, 1) - 
aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1) - aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_3_8, Release, 1) - aie.next_bd ^bb6 - } - %core_0_3 = aie.core(%tile_0_3) { - %c0_i32 = arith.constant 0 : i32 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb4 - aie.use_lock(%lock_0_3_11, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_3_10, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_3_8, AcquireGreaterEqual, 1) - cf.br ^bb2(%c0 : index) - ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 - %1 = arith.cmpi slt, %0, %c256 : index - cf.cond_br %1, ^bb3, ^bb4 - ^bb3: // pred: ^bb2 - %2 = vector.load %buf5[%0] : memref<256xi16, 2>, vector<32xi16> - %3 = vector.load %buf4[%0] : memref<256xi16, 2>, vector<32xi16> - %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32> - %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16> - vector.store %5, %buf3[%0] : memref<256xi16, 2>, vector<32xi16> - %6 = arith.addi %0, %c32 : index - cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation} - ^bb4: // pred: ^bb2 - aie.use_lock(%lock_0_3_9, Release, 1) - aie.use_lock(%lock_0_3, Release, 1) - aie.use_lock(%lock_0_3_12, Release, 1) - cf.br ^bb1 - } - %mem_0_2 = aie.mem(%tile_0_2) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_2_7, AcquireGreaterEqual, 1) - aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_2_6, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb5 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) - aie.dma_bd(%buf2 : 
memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_2_5, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) - aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_2_3, Release, 1) - aie.next_bd ^bb6 - } - %core_0_2 = aie.core(%tile_0_2) { - %c0_i32 = arith.constant 0 : i32 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb4 - aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) - cf.br ^bb2(%c0 : index) - ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 - %1 = arith.cmpi slt, %0, %c256 : index - cf.cond_br %1, ^bb3, ^bb4 - ^bb3: // pred: ^bb2 - %2 = vector.load %buf2[%0] : memref<256xi16, 2>, vector<32xi16> - %3 = vector.load %buf1[%0] : memref<256xi16, 2>, vector<32xi16> - %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32> - %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16> - vector.store %5, %buf0[%0] : memref<256xi16, 2>, vector<32xi16> - %6 = arith.addi %0, %c32 : index - cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation} - ^bb4: // pred: ^bb2 - aie.use_lock(%lock_0_2_4, Release, 1) - aie.use_lock(%lock_0_2, Release, 1) - aie.use_lock(%lock_0_2_7, Release, 1) - cf.br ^bb1 - } - %memtile_dma_2_1 = aie.memtile_dma(%mem_tile_2_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_2_1_2, AcquireGreaterEqual, 4) - aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 1024) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_2_1, Release, 4) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - 
%1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_2_1_2, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf12 : memref<1024xi16, 1>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} - aie.use_lock(%lock_2_1_2, Release, 1) - aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf12 : memref<1024xi16, 1>, 512, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} - aie.use_lock(%lock_2_1_2, Release, 1) - aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf12 : memref<1024xi16, 1>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} - aie.use_lock(%lock_2_1_2, Release, 1) - aie.next_bd ^bb10 - } - %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = 
aie.dma_start(MM2S, 2, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) - aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1_1, Release, 4) - aie.next_bd ^bb10 - } - %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1, Release, 1) - aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - 
%3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1, Release, 1) - aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 4) - aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1_0, Release, 4) - aie.next_bd ^bb10 - } - aie.shim_dma_allocation @air_channel_5(%shim_noc_tile_2_0, S2MM, 0) - aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0) - aie.shim_dma_allocation @air_channel_1(%shim_noc_tile_1_0, MM2S, 0) - aie.runtime_sequence @mul_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { - %0 = aiex.dma_configure_task_for @air_channel_0 { - aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [, ]) - aie.end - } - aiex.dma_start_task(%0) - %1 = aiex.dma_configure_task_for @air_channel_1 { - aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [, ]) - aie.end - } - aiex.dma_start_task(%1) - %2 = aiex.dma_configure_task_for @air_channel_5 { - aie.dma_bd(%arg2 : memref<*xi16>, 0, 1024, [, ]) - aie.end - } {issue_token = true} - aiex.dma_start_task(%2) - aiex.dma_free_task(%0) - aiex.dma_await_task(%2) - aiex.dma_free_task(%1) - } - aie.packet_flow(15) { - aie.packet_source<%shim_noc_tile_0_0, TileControl : 0> - aie.packet_dest<%shim_noc_tile_0_0, South : 0> - } {keep_pkt_header = true, priority_route = true} - aie.packet_flow(15) { - aie.packet_source<%shim_noc_tile_1_0, TileControl : 0> - aie.packet_dest<%shim_noc_tile_1_0, South : 0> - } {keep_pkt_header = true, priority_route = true} - aie.packet_flow(15) { - aie.packet_source<%shim_noc_tile_2_0, TileControl : 0> 
- aie.packet_dest<%shim_noc_tile_2_0, South : 0> - } {keep_pkt_header = true, priority_route = true} - %switchbox_0_0 = aie.switchbox(%shim_noc_tile_0_0) { - aie.connect - %0 = aie.amsel<5> (3) - %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true} - aie.packet_rules(TileControl : 0) { - aie.rule(31, 15, %0) - } - } - %shim_mux_0_0 = aie.shim_mux(%shim_noc_tile_0_0) { - aie.connect - } - %switchbox_0_1 = aie.switchbox(%mem_tile_0_1) { - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - } - %switchbox_1_0 = aie.switchbox(%shim_noc_tile_1_0) { - aie.connect - %0 = aie.amsel<5> (3) - %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true} - aie.packet_rules(TileControl : 0) { - aie.rule(31, 15, %0) - } - } - %shim_mux_1_0 = aie.shim_mux(%shim_noc_tile_1_0) { - aie.connect - } - %switchbox_1_1 = aie.switchbox(%mem_tile_1_1) { - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - } - %switchbox_2_0 = aie.switchbox(%shim_noc_tile_2_0) { - aie.connect - %0 = aie.amsel<5> (3) - %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true} - aie.packet_rules(TileControl : 0) { - aie.rule(31, 15, %0) - } - } - %shim_mux_2_0 = aie.shim_mux(%shim_noc_tile_2_0) { - aie.connect - } - %switchbox_2_1 = aie.switchbox(%mem_tile_2_1) { - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - } - %switchbox_0_2 = aie.switchbox(%tile_0_2) { - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - } - %switchbox_0_3 = aie.switchbox(%tile_0_3) { - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - } - %switchbox_0_4 = aie.switchbox(%tile_0_4) { - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - } - %switchbox_0_5 = aie.switchbox(%tile_0_5) { - aie.connect - aie.connect - aie.connect - } - %tile_1_2 = aie.tile(1, 2) - %switchbox_1_2 = aie.switchbox(%tile_1_2) { - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - 
aie.connect - } - %tile_1_3 = aie.tile(1, 3) - %switchbox_1_3 = aie.switchbox(%tile_1_3) { - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - } - %tile_1_4 = aie.tile(1, 4) - %switchbox_1_4 = aie.switchbox(%tile_1_4) { - aie.connect - aie.connect - aie.connect - } - %tile_2_2 = aie.tile(2, 2) - %switchbox_2_2 = aie.switchbox(%tile_2_2) { - aie.connect - aie.connect - aie.connect - aie.connect - } - %tile_2_3 = aie.tile(2, 3) - %switchbox_2_3 = aie.switchbox(%tile_2_3) { - aie.connect - aie.connect - } - %tile_2_4 = aie.tile(2, 4) - %switchbox_2_4 = aie.switchbox(%tile_2_4) { - aie.connect - } - aie.wire(%shim_mux_0_0 : North, %switchbox_0_0 : South) - aie.wire(%shim_noc_tile_0_0 : DMA, %shim_mux_0_0 : DMA) - aie.wire(%mem_tile_0_1 : Core, %switchbox_0_1 : Core) - aie.wire(%mem_tile_0_1 : DMA, %switchbox_0_1 : DMA) - aie.wire(%switchbox_0_0 : North, %switchbox_0_1 : South) - aie.wire(%tile_0_2 : Core, %switchbox_0_2 : Core) - aie.wire(%tile_0_2 : DMA, %switchbox_0_2 : DMA) - aie.wire(%switchbox_0_1 : North, %switchbox_0_2 : South) - aie.wire(%tile_0_3 : Core, %switchbox_0_3 : Core) - aie.wire(%tile_0_3 : DMA, %switchbox_0_3 : DMA) - aie.wire(%switchbox_0_2 : North, %switchbox_0_3 : South) - aie.wire(%tile_0_4 : Core, %switchbox_0_4 : Core) - aie.wire(%tile_0_4 : DMA, %switchbox_0_4 : DMA) - aie.wire(%switchbox_0_3 : North, %switchbox_0_4 : South) - aie.wire(%tile_0_5 : Core, %switchbox_0_5 : Core) - aie.wire(%tile_0_5 : DMA, %switchbox_0_5 : DMA) - aie.wire(%switchbox_0_4 : North, %switchbox_0_5 : South) - aie.wire(%switchbox_0_0 : East, %switchbox_1_0 : West) - aie.wire(%shim_mux_1_0 : North, %switchbox_1_0 : South) - aie.wire(%shim_noc_tile_1_0 : DMA, %shim_mux_1_0 : DMA) - aie.wire(%switchbox_0_1 : East, %switchbox_1_1 : West) - aie.wire(%mem_tile_1_1 : Core, %switchbox_1_1 : Core) - aie.wire(%mem_tile_1_1 : DMA, %switchbox_1_1 : DMA) - aie.wire(%switchbox_1_0 : North, %switchbox_1_1 : South) - aie.wire(%switchbox_0_2 : East, %switchbox_1_2 : 
West) - aie.wire(%tile_1_2 : Core, %switchbox_1_2 : Core) - aie.wire(%tile_1_2 : DMA, %switchbox_1_2 : DMA) - aie.wire(%switchbox_1_1 : North, %switchbox_1_2 : South) - aie.wire(%switchbox_0_3 : East, %switchbox_1_3 : West) - aie.wire(%tile_1_3 : Core, %switchbox_1_3 : Core) - aie.wire(%tile_1_3 : DMA, %switchbox_1_3 : DMA) - aie.wire(%switchbox_1_2 : North, %switchbox_1_3 : South) - aie.wire(%switchbox_0_4 : East, %switchbox_1_4 : West) - aie.wire(%tile_1_4 : Core, %switchbox_1_4 : Core) - aie.wire(%tile_1_4 : DMA, %switchbox_1_4 : DMA) - aie.wire(%switchbox_1_3 : North, %switchbox_1_4 : South) - aie.wire(%switchbox_1_0 : East, %switchbox_2_0 : West) - aie.wire(%shim_mux_2_0 : North, %switchbox_2_0 : South) - aie.wire(%shim_noc_tile_2_0 : DMA, %shim_mux_2_0 : DMA) - aie.wire(%switchbox_1_1 : East, %switchbox_2_1 : West) - aie.wire(%mem_tile_2_1 : Core, %switchbox_2_1 : Core) - aie.wire(%mem_tile_2_1 : DMA, %switchbox_2_1 : DMA) - aie.wire(%switchbox_2_0 : North, %switchbox_2_1 : South) - aie.wire(%switchbox_1_2 : East, %switchbox_2_2 : West) - aie.wire(%tile_2_2 : Core, %switchbox_2_2 : Core) - aie.wire(%tile_2_2 : DMA, %switchbox_2_2 : DMA) - aie.wire(%switchbox_2_1 : North, %switchbox_2_2 : South) - aie.wire(%switchbox_1_3 : East, %switchbox_2_3 : West) - aie.wire(%tile_2_3 : Core, %switchbox_2_3 : Core) - aie.wire(%tile_2_3 : DMA, %switchbox_2_3 : DMA) - aie.wire(%switchbox_2_2 : North, %switchbox_2_3 : South) - aie.wire(%switchbox_1_4 : East, %switchbox_2_4 : West) - aie.wire(%tile_2_4 : Core, %switchbox_2_4 : Core) - aie.wire(%tile_2_4 : DMA, %switchbox_2_4 : DMA) - aie.wire(%switchbox_2_3 : North, %switchbox_2_4 : South) - } {dlti.dl_spec = #dlti.dl_spec} - aie.device(npu2) { - aie.runtime_sequence @mul_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) { - aiex.configure @mul_kernel_0 { - aiex.run @mul_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, 
%arg6, %arg7, %arg8) : (memref<*xi16>, memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32) - } - } - } -} diff --git a/examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir b/examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir deleted file mode 100644 index 918aa51..0000000 --- a/examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir +++ /dev/null @@ -1,431 +0,0 @@ -#loop_annotation = #llvm.loop_annotation -module { - aie.device(npu2) @square_kernel_0 { - %shim_noc_tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info} - %shim_noc_tile_1_0 = aie.tile(1, 0) {controller_id = #aie.packet_info} - %mem_tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info} - %mem_tile_1_1 = aie.tile(1, 1) {controller_id = #aie.packet_info} - %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info} - %tile_0_3 = aie.tile(0, 3) {controller_id = #aie.packet_info} - %tile_0_4 = aie.tile(0, 4) {controller_id = #aie.packet_info} - %tile_0_5 = aie.tile(0, 5) {controller_id = #aie.packet_info} - %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32} - %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32} - %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32} - %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32} - %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32} - %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32} - %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32} - %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32} - %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32} - %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32} - %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32} - %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32} - %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32} - %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32} - %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32} - %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32} - %lock_0_5 = 
aie.lock(%tile_0_5, 3) {init = 1 : i32} - %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32} - %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32} - %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32} - %buf9 = aie.buffer(%mem_tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf9"} : memref<1024xi16, 1 : i32> - %buf8 = aie.buffer(%mem_tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf8"} : memref<1024xi16, 1> - %buf7 = aie.buffer(%tile_0_5) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf7"} : memref<256xi16, 2> - %buf6 = aie.buffer(%tile_0_5) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf6"} : memref<256xi16, 2> - %buf5 = aie.buffer(%tile_0_4) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<256xi16, 2> - %buf4 = aie.buffer(%tile_0_4) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf4"} : memref<256xi16, 2> - %buf3 = aie.buffer(%tile_0_3) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<256xi16, 2> - %buf2 = aie.buffer(%tile_0_3) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf2"} : memref<256xi16, 2> - %buf1 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<256xi16, 2> - %buf0 = aie.buffer(%tile_0_2) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf0"} : memref<256xi16, 2> - %mem_0_5 = aie.mem(%tile_0_5) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1) - aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_5_12, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1) - aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - 
aie.use_lock(%lock_0_5_11, Release, 1) - aie.next_bd ^bb4 - } - %core_0_5 = aie.core(%tile_0_5) { - %c0_i32 = arith.constant 0 : i32 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb4 - aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1) - cf.br ^bb2(%c0 : index) - ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 - %1 = arith.cmpi slt, %0, %c256 : index - cf.cond_br %1, ^bb3, ^bb4 - ^bb3: // pred: ^bb2 - %2 = vector.load %buf7[%0] : memref<256xi16, 2>, vector<32xi16> - %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> - %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> - vector.store %4, %buf6[%0] : memref<256xi16, 2>, vector<32xi16> - %5 = arith.addi %0, %c32 : index - cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} - ^bb4: // pred: ^bb2 - aie.use_lock(%lock_0_5, Release, 1) - aie.use_lock(%lock_0_5_13, Release, 1) - cf.br ^bb1 - } - %mem_0_4 = aie.mem(%tile_0_4) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1) - aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_4_9, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1) - aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_4_8, Release, 1) - aie.next_bd ^bb4 - } - %core_0_4 = aie.core(%tile_0_4) { - %c0_i32 = arith.constant 0 : i32 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb4 - aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1) - 
aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1) - cf.br ^bb2(%c0 : index) - ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 - %1 = arith.cmpi slt, %0, %c256 : index - cf.cond_br %1, ^bb3, ^bb4 - ^bb3: // pred: ^bb2 - %2 = vector.load %buf5[%0] : memref<256xi16, 2>, vector<32xi16> - %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> - %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> - vector.store %4, %buf4[%0] : memref<256xi16, 2>, vector<32xi16> - %5 = arith.addi %0, %c32 : index - cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} - ^bb4: // pred: ^bb2 - aie.use_lock(%lock_0_4, Release, 1) - aie.use_lock(%lock_0_4_10, Release, 1) - cf.br ^bb1 - } - %mem_0_3 = aie.mem(%tile_0_3) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1) - aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_3_6, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1) - aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_3_5, Release, 1) - aie.next_bd ^bb4 - } - %core_0_3 = aie.core(%tile_0_3) { - %c0_i32 = arith.constant 0 : i32 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb4 - aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1) - cf.br ^bb2(%c0 : index) - ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 - %1 = arith.cmpi slt, %0, %c256 : index - cf.cond_br %1, ^bb3, ^bb4 - ^bb3: // pred: ^bb2 - %2 = vector.load %buf3[%0] : memref<256xi16, 2>, vector<32xi16> - %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, 
vector<32xi32> - %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> - vector.store %4, %buf2[%0] : memref<256xi16, 2>, vector<32xi16> - %5 = arith.addi %0, %c32 : index - cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} - ^bb4: // pred: ^bb2 - aie.use_lock(%lock_0_3, Release, 1) - aie.use_lock(%lock_0_3_7, Release, 1) - cf.br ^bb1 - } - %mem_0_2 = aie.mem(%tile_0_2) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) - aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_2_3, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) - aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_2_2, Release, 1) - aie.next_bd ^bb4 - } - %core_0_2 = aie.core(%tile_0_2) { - %c0_i32 = arith.constant 0 : i32 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb4 - aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1) - cf.br ^bb2(%c0 : index) - ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 - %1 = arith.cmpi slt, %0, %c256 : index - cf.cond_br %1, ^bb3, ^bb4 - ^bb3: // pred: ^bb2 - %2 = vector.load %buf1[%0] : memref<256xi16, 2>, vector<32xi16> - %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> - %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> - vector.store %4, %buf0[%0] : memref<256xi16, 2>, vector<32xi16> - %5 = arith.addi %0, %c32 : index - cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} - ^bb4: // pred: ^bb2 - aie.use_lock(%lock_0_2, Release, 1) - aie.use_lock(%lock_0_2_4, Release, 1) - 
cf.br ^bb1 - } - %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1, Release, 4) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb10 - } - %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - 
aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1_0, Release, 4) - aie.next_bd ^bb10 - } - aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0) - aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0) - aie.runtime_sequence @square_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { - %0 = aiex.dma_configure_task_for @air_channel_0 { - aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [, ]) - aie.end - } - aiex.dma_start_task(%0) - %1 = aiex.dma_configure_task_for @air_channel_3 { - aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [, ]) - aie.end - } {issue_token = true} - aiex.dma_start_task(%1) - aiex.dma_free_task(%0) - 
aiex.dma_await_task(%1) - } - aie.packet_flow(15) { - aie.packet_source<%shim_noc_tile_0_0, TileControl : 0> - aie.packet_dest<%shim_noc_tile_0_0, South : 0> - } {keep_pkt_header = true, priority_route = true} - aie.packet_flow(15) { - aie.packet_source<%shim_noc_tile_1_0, TileControl : 0> - aie.packet_dest<%shim_noc_tile_1_0, South : 0> - } {keep_pkt_header = true, priority_route = true} - %switchbox_0_0 = aie.switchbox(%shim_noc_tile_0_0) { - aie.connect - %0 = aie.amsel<5> (3) - %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true} - aie.packet_rules(TileControl : 0) { - aie.rule(31, 15, %0) - } - } - %shim_mux_0_0 = aie.shim_mux(%shim_noc_tile_0_0) { - aie.connect - } - %switchbox_0_1 = aie.switchbox(%mem_tile_0_1) { - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - } - %switchbox_1_0 = aie.switchbox(%shim_noc_tile_1_0) { - aie.connect - %0 = aie.amsel<5> (3) - %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true} - aie.packet_rules(TileControl : 0) { - aie.rule(31, 15, %0) - } - } - %shim_mux_1_0 = aie.shim_mux(%shim_noc_tile_1_0) { - aie.connect - } - %switchbox_1_1 = aie.switchbox(%mem_tile_1_1) { - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - } - %switchbox_0_2 = aie.switchbox(%tile_0_2) { - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - } - %switchbox_0_3 = aie.switchbox(%tile_0_3) { - aie.connect - aie.connect - aie.connect - aie.connect - aie.connect - } - %switchbox_0_4 = aie.switchbox(%tile_0_4) { - aie.connect - aie.connect - aie.connect - aie.connect - } - %switchbox_0_5 = aie.switchbox(%tile_0_5) { - aie.connect - aie.connect - } - %tile_1_2 = aie.tile(1, 2) - %switchbox_1_2 = aie.switchbox(%tile_1_2) { - aie.connect - aie.connect - aie.connect - aie.connect - } - %tile_1_3 = aie.tile(1, 3) - %switchbox_1_3 = aie.switchbox(%tile_1_3) { - aie.connect - aie.connect - } - %tile_1_4 = aie.tile(1, 4) - %switchbox_1_4 = aie.switchbox(%tile_1_4) { - aie.connect - 
} - aie.wire(%shim_mux_0_0 : North, %switchbox_0_0 : South) - aie.wire(%shim_noc_tile_0_0 : DMA, %shim_mux_0_0 : DMA) - aie.wire(%mem_tile_0_1 : Core, %switchbox_0_1 : Core) - aie.wire(%mem_tile_0_1 : DMA, %switchbox_0_1 : DMA) - aie.wire(%switchbox_0_0 : North, %switchbox_0_1 : South) - aie.wire(%tile_0_2 : Core, %switchbox_0_2 : Core) - aie.wire(%tile_0_2 : DMA, %switchbox_0_2 : DMA) - aie.wire(%switchbox_0_1 : North, %switchbox_0_2 : South) - aie.wire(%tile_0_3 : Core, %switchbox_0_3 : Core) - aie.wire(%tile_0_3 : DMA, %switchbox_0_3 : DMA) - aie.wire(%switchbox_0_2 : North, %switchbox_0_3 : South) - aie.wire(%tile_0_4 : Core, %switchbox_0_4 : Core) - aie.wire(%tile_0_4 : DMA, %switchbox_0_4 : DMA) - aie.wire(%switchbox_0_3 : North, %switchbox_0_4 : South) - aie.wire(%tile_0_5 : Core, %switchbox_0_5 : Core) - aie.wire(%tile_0_5 : DMA, %switchbox_0_5 : DMA) - aie.wire(%switchbox_0_4 : North, %switchbox_0_5 : South) - aie.wire(%switchbox_0_0 : East, %switchbox_1_0 : West) - aie.wire(%shim_mux_1_0 : North, %switchbox_1_0 : South) - aie.wire(%shim_noc_tile_1_0 : DMA, %shim_mux_1_0 : DMA) - aie.wire(%switchbox_0_1 : East, %switchbox_1_1 : West) - aie.wire(%mem_tile_1_1 : Core, %switchbox_1_1 : Core) - aie.wire(%mem_tile_1_1 : DMA, %switchbox_1_1 : DMA) - aie.wire(%switchbox_1_0 : North, %switchbox_1_1 : South) - aie.wire(%switchbox_0_2 : East, %switchbox_1_2 : West) - aie.wire(%tile_1_2 : Core, %switchbox_1_2 : Core) - aie.wire(%tile_1_2 : DMA, %switchbox_1_2 : DMA) - aie.wire(%switchbox_1_1 : North, %switchbox_1_2 : South) - aie.wire(%switchbox_0_3 : East, %switchbox_1_3 : West) - aie.wire(%tile_1_3 : Core, %switchbox_1_3 : Core) - aie.wire(%tile_1_3 : DMA, %switchbox_1_3 : DMA) - aie.wire(%switchbox_1_2 : North, %switchbox_1_3 : South) - aie.wire(%switchbox_0_4 : East, %switchbox_1_4 : West) - aie.wire(%tile_1_4 : Core, %switchbox_1_4 : Core) - aie.wire(%tile_1_4 : DMA, %switchbox_1_4 : DMA) - aie.wire(%switchbox_1_3 : North, %switchbox_1_4 : South) - } 
{dlti.dl_spec = #dlti.dl_spec} - aie.device(npu2) { - aie.runtime_sequence @square_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { - aiex.configure @square_kernel_0 { - aiex.run @square_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32) - } - } - } -} diff --git a/examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh b/examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh deleted file mode 100755 index b7aa36c..0000000 --- a/examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -set -e -# Repeater script for: resource allocation -echo "Original MLIR Diagnostics:" -cat << 'DIAGNOSTICS_EOF' -failed to legalize operation 'arith.subi' that was explicitly marked illegal: %120 = "arith.subi"(%118, %119) <{overflowFlags = #arith.overflow}> : (vector<32xi16>, vector<32xi16>) -> vector<32xi16> -DIAGNOSTICS_EOF -echo "" - -MLIR_FILE='air_project/aiecc_failure_1775797115_856352.mlir' -PASS_PIPELINE='any(canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},unknown,unknown,unknown<{anonymous}::CopyRemovalPass>,unknown,test-canonicalize-vector-for-aievec{aie-target=aie2p target-backend=llvmir},test-lower-vector-to-aievec{aie-target=aie2p target-backend=llvmir},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},unknown,cse,unknown,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},test-aievec-optimize{aie-target=aie2p target-backend=llvmir},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false 
top-down=true},aievec-convolution-analysis{print=false},test-aievec-convolution-optimize{aie-target=aie2p shift=0 target-backend=llvmir},cse,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},loop-invariant-code-motion,canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},lower-affine,aie-canonicalize-device,aie.device(aie-assign-lock-ids,aie-register-objectFifos,aie-objectFifo-stateful-transform{dynamic-objFifos=false packet-sw-objFifos=false},aie-assign-bd-ids,aie-lower-cascade-flows,aie-lower-broadcast-packet,aie-lower-multicast,aie-assign-tile-controller-ids{column-wise-unique-ids=true},aie-generate-column-control-overlay{route-shim-to-tct=shim-only route-shim-to-tile-ctrl=false},aie-assign-buffer-addresses{alloc-scheme=},aie-assign-core-link-files,aie-vector-transfer-lowering{max-transfer-rank=4294967295}),convert-scf-to-cf{allow-pattern-rollback=true})' -aie-opt --mlir-print-ir-after-all --mlir-disable-threading --pass-pipeline="$PASS_PIPELINE" "$MLIR_FILE" diff --git a/examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh b/examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh deleted file mode 100755 index 2f765b5..0000000 --- a/examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -set -e -# Repeater script for: LLVM lowering -echo "Original MLIR Diagnostics:" -cat << 'DIAGNOSTICS_EOF' -aievec.mul_elem conversion is not supported for AIE2p. 
- -failed to legalize operation 'aievec.mul_elem' that was explicitly marked illegal: %28 = "aievec.mul_elem"(%24, %27) : (vector<32xi16>, vector<32xi16>) -> vector<32xi32> -DIAGNOSTICS_EOF -echo "" - -MLIR_FILE='air_project/aiecc_failure_1775797139_858651.mlir' -PASS_PIPELINE='any(aie.device(aie-localize-locks,aie-normalize-address-spaces,aie-transform-bfp-types),aie-standard-lowering{device=mul_kernel_0 tilecol=0 tilerow=5},aiex-standard-lowering,convert-aievec-to-llvm{aie-target=aie2p aie2-fp32-emulation-strategy=accuracy-safe},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse,expand-strided-metadata,lower-affine,arith-expand{include-bf16=false include-f4e2m1=false include-f8e8m0=false},finalize-memref-to-llvm{index-bitwidth=0 use-aligned-alloc=false use-generic-functions=false},convert-func-to-llvm{index-bitwidth=0 use-bare-ptr-memref-call-conv=true},convert-to-llvm{allow-pattern-rollback=true dynamic=true },convert-vector-to-llvm{enable-arm-bf16=false enable-arm-i8mm=false enable-arm-neon=false enable-arm-sve=false enable-x86=false force-32bit-vector-indices=true reassociate-fp-reductions=false use-vector-alignment=false vector-contract-lowering=dot vector-transpose-lowering=eltwise},convert-ub-to-llvm{index-bitwidth=0},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse)' -aie-opt --mlir-print-ir-after-all --mlir-disable-threading --pass-pipeline="$PASS_PIPELINE" "$MLIR_FILE" diff --git a/examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh b/examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh deleted file mode 100755 index e9fc1e4..0000000 --- a/examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -set -e -# Repeater script for: LLVM lowering -echo "Original MLIR Diagnostics:" -cat << 'DIAGNOSTICS_EOF' -aievec.mul_elem 
conversion is not supported for AIE2p. - -failed to legalize operation 'aievec.mul_elem' that was explicitly marked illegal: %21 = "aievec.mul_elem"(%20, %20) : (vector<32xi16>, vector<32xi16>) -> vector<32xi32> -DIAGNOSTICS_EOF -echo "" - -MLIR_FILE='air_project/aiecc_failure_1775797174_862028.mlir' -PASS_PIPELINE='any(aie.device(aie-localize-locks,aie-normalize-address-spaces,aie-transform-bfp-types),aie-standard-lowering{device=square_kernel_0 tilecol=0 tilerow=5},aiex-standard-lowering,convert-aievec-to-llvm{aie-target=aie2p aie2-fp32-emulation-strategy=accuracy-safe},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse,expand-strided-metadata,lower-affine,arith-expand{include-bf16=false include-f4e2m1=false include-f8e8m0=false},finalize-memref-to-llvm{index-bitwidth=0 use-aligned-alloc=false use-generic-functions=false},convert-func-to-llvm{index-bitwidth=0 use-bare-ptr-memref-call-conv=true},convert-to-llvm{allow-pattern-rollback=true dynamic=true },convert-vector-to-llvm{enable-arm-bf16=false enable-arm-i8mm=false enable-arm-neon=false enable-arm-sve=false enable-x86=false force-32bit-vector-indices=true reassociate-fp-reductions=false use-vector-alignment=false vector-contract-lowering=dot vector-transpose-lowering=eltwise},convert-ub-to-llvm{index-bitwidth=0},canonicalize{ max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse)' -aie-opt --mlir-print-ir-after-all --mlir-disable-threading --pass-pipeline="$PASS_PIPELINE" "$MLIR_FILE" diff --git a/examples/elementwise_arith/air_project/airinput.mlir b/examples/elementwise_arith/air_project/airinput.mlir deleted file mode 100644 index d0b7377..0000000 --- a/examples/elementwise_arith/air_project/airinput.mlir +++ /dev/null @@ -1,41 +0,0 @@ -#map = affine_map<()[s0] -> (s0 * 256)> -module { - func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> 
{tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { - %c1 = arith.constant 1 : index - air.launch (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> { - air.segment @square_kernel_0 args(%arg16=%arg8, %arg17=%arg14, %arg18=%arg15) : index, memref<*xi16>, memref<*xi16> { - %c1024 = arith.constant 1024 : index - %c4 = arith.constant 4 : index - %c1_0 = arith.constant 1 : index - %0 = arith.muli %arg16, %c1024 : index - %alloc = memref.alloc() : memref<1024xi16, 1 : i32> - air.dma_memcpy_nd (%alloc[] [] [], %arg17[%0] [%c1024] [%c1_0]) {id = 1 : i32} : (memref<1024xi16, 1 : i32>, memref<*xi16>) - %alloc_1 = memref.alloc() : memref<1024xi16, 1> - air.herd @herd_0 tile (%arg19, %arg20) in (%arg21=%c4, %arg22=%c1_0) args(%arg23=%alloc, %arg24=%alloc_1) : memref<1024xi16, 1 : i32>, memref<1024xi16, 1> { - %1 = ub.poison : i16 - %c1_2 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %2 = affine.apply #map()[%arg19] - %alloc_3 = memref.alloc() : memref<256xi16, 2> - air.dma_memcpy_nd (%alloc_3[] [] [], %arg23[%2] [%c256] [%c1_2]) {id = 1 : i32} : (memref<256xi16, 2>, memref<1024xi16, 1 : i32>) - %alloc_4 = memref.alloc() : memref<256xi16, 2> - scf.for %arg25 = %c0 to %c256 step %c32 { - %subview = memref.subview %alloc_3[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_5 = memref.subview %alloc_4[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %3 = vector.transfer_read %subview[%c0], %1 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %4 = arith.muli %3, %3 : vector<32xi16> - vector.transfer_write %4, %subview_5[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } - air.dma_memcpy_nd (%arg24[%2] [%c256] 
[%c1_2], %alloc_4[] [] []) {id = 2 : i32} : (memref<1024xi16, 1>, memref<256xi16, 2>) - memref.dealloc %alloc_3 : memref<256xi16, 2> - memref.dealloc %alloc_4 : memref<256xi16, 2> - } - air.dma_memcpy_nd (%arg18[%0] [%c1024] [%c1_0], %alloc_1[] [] []) {id = 2 : i32} : (memref<*xi16>, memref<1024xi16, 1>) - memref.dealloc %alloc_1 : memref<1024xi16, 1> - } - } - return - } -} diff --git a/examples/elementwise_arith/air_project/asm_air_output.mlir b/examples/elementwise_arith/air_project/asm_air_output.mlir deleted file mode 100644 index d0b7377..0000000 --- a/examples/elementwise_arith/air_project/asm_air_output.mlir +++ /dev/null @@ -1,41 +0,0 @@ -#map = affine_map<()[s0] -> (s0 * 256)> -module { - func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { - %c1 = arith.constant 1 : index - air.launch (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> { - air.segment @square_kernel_0 args(%arg16=%arg8, %arg17=%arg14, %arg18=%arg15) : index, memref<*xi16>, memref<*xi16> { - %c1024 = arith.constant 1024 : index - %c4 = arith.constant 4 : index - %c1_0 = arith.constant 1 : index - %0 = arith.muli %arg16, %c1024 : index - %alloc = memref.alloc() : memref<1024xi16, 1 : i32> - air.dma_memcpy_nd (%alloc[] [] [], %arg17[%0] [%c1024] [%c1_0]) {id = 1 : i32} : (memref<1024xi16, 1 : i32>, memref<*xi16>) - %alloc_1 = memref.alloc() : memref<1024xi16, 1> - air.herd @herd_0 tile (%arg19, %arg20) in (%arg21=%c4, %arg22=%c1_0) args(%arg23=%alloc, %arg24=%alloc_1) : memref<1024xi16, 1 : i32>, memref<1024xi16, 1> { - %1 = ub.poison : i16 - %c1_2 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %2 = affine.apply #map()[%arg19] - %alloc_3 = memref.alloc() : memref<256xi16, 2> - 
air.dma_memcpy_nd (%alloc_3[] [] [], %arg23[%2] [%c256] [%c1_2]) {id = 1 : i32} : (memref<256xi16, 2>, memref<1024xi16, 1 : i32>) - %alloc_4 = memref.alloc() : memref<256xi16, 2> - scf.for %arg25 = %c0 to %c256 step %c32 { - %subview = memref.subview %alloc_3[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_5 = memref.subview %alloc_4[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %3 = vector.transfer_read %subview[%c0], %1 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %4 = arith.muli %3, %3 : vector<32xi16> - vector.transfer_write %4, %subview_5[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } - air.dma_memcpy_nd (%arg24[%2] [%c256] [%c1_2], %alloc_4[] [] []) {id = 2 : i32} : (memref<1024xi16, 1>, memref<256xi16, 2>) - memref.dealloc %alloc_3 : memref<256xi16, 2> - memref.dealloc %alloc_4 : memref<256xi16, 2> - } - air.dma_memcpy_nd (%arg18[%0] [%c1024] [%c1_0], %alloc_1[] [] []) {id = 2 : i32} : (memref<*xi16>, memref<1024xi16, 1>) - memref.dealloc %alloc_1 : memref<1024xi16, 1> - } - } - return - } -} diff --git a/examples/elementwise_arith/air_project/asm_src.mlir b/examples/elementwise_arith/air_project/asm_src.mlir deleted file mode 100644 index aa0162c..0000000 --- a/examples/elementwise_arith/air_project/asm_src.mlir +++ /dev/null @@ -1,34 +0,0 @@ -#loc = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1) -#loc5 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:9) -#map = affine_map<(d0) -> (d0)> -#loc8 = loc("X"(#loc)) -#loc9 = loc("OUT"(#loc)) -#loc12 = loc("x"(#loc5)) -module { - func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32} loc("X"(#loc)), %arg1: memref<*xi16> {tt.divisibility = 16 : i32} loc("OUT"(#loc)), %arg2: i32 
loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg3: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg4: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg5: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg6: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg7: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)) { - %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) - %0 = arith.muli %arg5, %c1024_i32 : i32 loc(#loc10) - %1 = arith.index_cast %0 : i32 to index loc(#loc3) - %reinterpret_cast = memref.reinterpret_cast %arg0 to offset: [%1], sizes: [1024], strides: [1] : memref<*xi16> to memref<1024xi16, strided<[1], offset: ?>> loc(#loc11) - %alloc = memref.alloc() : memref<1024xi16> loc(#loc12) - memref.copy %reinterpret_cast, %alloc : memref<1024xi16, strided<[1], offset: ?>> to memref<1024xi16> loc(#loc12) - %2 = bufferization.to_tensor %alloc restrict writable : memref<1024xi16> to tensor<1024xi16> loc(#loc12) - %reinterpret_cast_0 = memref.reinterpret_cast %arg1 to offset: [%1], sizes: [1024], strides: [1] : memref<*xi16> to memref<1024xi16, strided<[1], offset: ?>> loc(#loc3) - %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%2, %2 : tensor<1024xi16>, tensor<1024xi16>) outs(%2 : tensor<1024xi16>) { - ^bb0(%in: i16 loc("x"(#loc5)), %in_1: i16 loc("x"(#loc5)), %out: i16 loc("x"(#loc5))): - %4 = arith.muli %in, %in_1 : i16 loc(#loc6) - linalg.yield %4 : i16 loc(#loc6) - } -> tensor<1024xi16> loc(#loc6) - bufferization.materialize_in_destination %3 in writable %reinterpret_cast_0 : (tensor<1024xi16>, memref<1024xi16, strided<[1], offset: ?>>) -> () loc(#loc7) - return loc(#loc) - } loc(#loc) -} loc(#loc) -#loc1 = loc(unknown) -#loc2 = 
loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:15) -#loc3 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:14) -#loc4 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:17) -#loc6 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:32) -#loc7 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:5) -#loc10 = loc("offsets"(#loc2)) -#loc11 = loc("x"(#loc4)) - diff --git a/examples/elementwise_arith/air_project/div_kernel_0.pdi b/examples/elementwise_arith/air_project/div_kernel_0.pdi deleted file mode 100644 index 3681781c43b9bf80414d1863d6cae3111aa21810..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 15904 zcmeHOZ)_aJ6@UB3z9V(Gy~Iuz+aVj<5JN6-i49F4fpzRwF16ANA(h)k#Yjzowy3UD zT7OA7h$;UPVbUTEP=t$ws-k{4S8WuzUyc$GAZSeqQmR69g;Z^&qMi|zOmJ|2v%9nN z?))!lrHT(X(%QfGn>TOXdow$J-rIYdi2AzP()a)18+%{Wn%1rZf_?HnI&8HuX{If`lUJQe^C%8!V5?8JhHoenb+v z_TA2LSV)rCQG}$pDebw;DOU$@Lj-rTM>Wm~ga%PU`heI3bLMT8RHpP=x zN~}$!#E#CCXu2dNu3DcGd)KDKofoCVqfM!6(zK;$P~qx#zVA@s4I}^N%0v$VXWupQ z|Gpq$CNr>AnsbwFYXH#PGv{`FkUHIS`9#h~e7@Y{B;$j}VG~U+p?~AVjr~VM!lU8DoVfOb zD2btEqGJcG=3@sBHjAcLm$5f>Zuahly?d<|(R2d(nI-%?iT+K?`*$Dg-FFc7-h%!W zOV}%+|H|dn&WR#ED&|cVren+g6=Dv8%1)$k~w%>sv-PY;GOd(6e@A z!|U#eU8~wJoEh2hSo6q+U$l(;e6n@qZ?B-|&udTr{^-U*ckHpeh@yS%l7YKtD7t&xHLyy$*7 zDlJ2VnX_W$=Nz#z@9gt_Gja8Y`@C(1$r-UG@0_ulD8Kj3gjdQKKdJ94iGhLiHMfxS zOKK=wH*?J+bN`xMM}@>J?)FZzw65^@8K@bJa@Rh?X1{ze2pl-=m-y2{uM`j`K49J&B@4Fgd%eA&DQ++VxOaB+QSLZvfXHe#eF}ieC+|l5O{mFEu z!!g&BxM(1aTnt&+(4JT`>WEFYGdVW*9b^|i_0hORdC^^;78_rJu0Mto9w+ptNv!nK zq7E@WQ*@S^-%XGs^WU-B4r6E;kTGmkbIO^Q8ZYYd!T95y$zVLj7A%~=VUzI}??MbE zE2+3;>^0rJ>d0>2!(V7yd}{^vf`8J&DuykL*XXR z@(pQm$#Lvh$a|5kAS;xVSqWQ|4@7*t-H!GX<|t{O=liGr{)xulIl>{CPdUER)Z@vH zW5yjO3Yz6#_?8(2ULG#GFZpMB6g@n7UlyE0r##!=O!u{^+33vCK 
z33v0K5^nnK#QeXC97yjKs&{z|)R6(2*9Y0Y_Zfmlo@K9g)g6iggb(iUZ?mw{R#evRHS= zY!p6z9~g4n!cIrfPDkb`W2`V%n_@=~dcf`{Xx=0J^+{GRQ9dTa^AFL9C!bH5!{uLD zh5Kg2t+R=YY5SJEAK)LxMEYuf$V9o@j_aa}+{d0@V10PqC&x%x)qP4`t)9%F{;SoKwR-aZT~CJfovh-*da_bh^`Gij^`DZ->#eFMd#dWmN|{bo%1I_K zT~F?;)swY)vQ|&l>PZ|uwda$y=aaSPleOoQwda$cyyuf)eW&U}H>&THtm;1{tNKsL z#Vu~tvk>d9I?S*s^;^wjFfT0L2-Cu{X&t)Bek)sta; zr|LsDs_&Gn>OUo``cKOI-AVo#w|tjUeR6Hd&#=qur8-IMFzhj$W!Psp$M6ut!wlyc z9%Fc%;R3@&hNl=VF@)31y#lvPYr-pu0D*kgQ_VV~g~!$S-YGn{95jNx&H z3k(+-o?^Jfa5=(Ze7#lig|d#XhsCF{SJpVou+MOg;UR{H8O}32#_%}91%`_ZPcd9# zxE$dyzCBg(g|d!sH;YeWudH#FVV~g~!$S-YGn{95jNx&H3k(+-o?^JfaG7C;T21+G zb8xQ#tGK|+>QnJLMrySgFJYjQ!=5)H?6Z3z%Xp9R4&#aOIqWTMN8b-w#(QipI*cdA z=dgy_PUT)xyvNquVfw`Q9CD@feU`T@<2}YZjHigN{9jjElkay&FKRecSR zgL4hz+7Uh60Ieau?f7LPBmzeIfKM+@T%v5qgim%{ie`x?}{VnKYmZ+?91&oe%$ z?*~5dAB)Eq*k2+(^rMCGh1h@`u3vcOgz;7OHK^}Y1o7!U9iMzTQ?_P1IOyaK?$XfjvX2+ILM9wAE{{D=DIo_`$V zGGoi;Kd5b@{U2bJW8~jt`LF0h{)6w7$#KYkMIZ8Sviw){A^)Z%qI;^MZ+qFoHy9`{ z_eJA0{zrDvY0ijt?05ApjJU1h9G1w6ILJ0%#~X|%*+v*48-V!RI?aR&M}<1^qf zOTZ@?zZE<@3it-bcb6@b`7GlJ$AV`*OTHNR`MzZ`pG_1vfxc(7#AD%ejNx;P;j_$n zpbwv89zM&QhVwEWKAX(vSl=mI@)iC--=DVRE(yniXFf~582M}~pF5}})`xEzgU6TZ z0z2CoZ-d9%K>@#&@k#J2BfgvQ4d7$n7;l%`3EyK3?6}kIa4p0>_;}qJ-w$4|i`MT0 zALpFuC&A+@LV^83#t(pxbIy3X+{Ns4#r|}I*ZydJJNOGC`yJDoPl9iX_?sBt4}NvT z4={cJy!J=ivCF+I-d^TUFX1KqAghmpK2z>xc6#G^UGUnD=CLkj zeBVuLzO~$LH$-t|r`zNE*9Fhwjrm6K+K$%u!0UDAd>6Cd#q4zH`+8dQ@SoZ54fKP* u4Q1>GI7(n5+2pcaHta0Ak*kVlW*8;+Q8*_0-iVgN_x`jTer_ve)&2uCkqQU^ diff --git a/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_elfs.bin deleted file mode 100644 index 3542f70d63d78dc642ddc7dc29acd12d643307f5..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 10704 zcmeHLZEO@p7=Cy6?)GRXH{15=aM#*xDfF;gC?6spDejfVAT||HOx4EJL?ZDc)I`kv zsT{P(R|$nkG6ad0gcuVOw?>=hepM+12q=V-)QHUqP1HnUO)xo1%lXdTZo7EbmVfeN 
zCz;;vGc(V;?>jT^J{=bTtgCw-Km^#H1MuuuMp6*~WWj|kSk!rduc=~TWT=#PPgM(} zSun_gd2JY<$1Nn`El!pyW#AHkxu6c>vt$c|1w{ZiH%KasXb{r@J_7zkUAss>GI)X2 z^!WgW!(GO1@2}RVR|OSRIY9x41p!b9nQwe#A_?$eb0y5d4%izx*-6RAie(?G3Hg|R zzK=aw>SJ4qeQd{EA3NsvtyW-Zp8&>+n;#G`t_jiWxpElrx%!q6{b!aedNt&VC#kor z5FfBmZ>{;H5m&BsUt1IX8|P7hB-ALS-@r<`9G*`1Mc>gnwxiD{Ug>iS#_k0EhK1(B zBwcpLHynPVd!vs2=>WJk+&{E9U6}towpVsiM9)Ri@9-u5}ltPeyXmCo-pJAnERT7fTyR~ zJU!Igg-`7O&3q*Ff{CMl1iT`s=6UGh%=n5l<1F_X!g8aeN&U8a#oZ=#l@Uv@!YCPa z`60UHx~#@E;bTu-oYmJWt6vB5CCoO~CRQIw{xwzthCGI+JtW3Uj8pgUX}zd|vEW|Q zA05V`(b4x0K=jyU*#|w5)0ioX(%IuVAnlkPCej(2lCp1~5HGRp)6sgt@{!HIKw<m)b^``9&1;|CEl@?VU66xIjsrlSGp{fAd ziB4EB#@?Mx*jBHil@qZP*xY&rW6>;Cr}b0EJ_PHz4=b%X`OMReXYOcfe1FKB8joWQ z2{UlIaR2*uq7QM&Yhg3+`FG_V-K47k9v6E%2ls+LDB&umOWI!=>XNn3Is^|eA*!&N z1BBIJ1ajL69oe9;`Te+K@qG(MD-{*+@~EVX84vWhf1b(&kM2tnVKy_?|EV zJf3`fNSquGt_h#$sW|1>ZNi9*jTQ9~C^N{Y-mG(ar^-Ou%>&I@+$# z(Z?}%VIKV!(d4NE)f3&xs?L!fMr|2&- zWItbKJGakmtq(OsMfJo7cF2V`wLiWmqLvWg;nFa z6S08l_}kh{s8LH8)Dj*iO~?`QMOG8W7DTrK_T0_w)qyJ*Gaj*Y{zLEB$umB&W#ntg z*qdN)j>($n(j|T#&^{bv_P6#24-{S1KPkF^9zT<1eJs6aX+S>C!YmDF$1DwK$1DwK z$3R^M0tc{Id}L?|@D=3M!e|x@vS3~trU;xNT^`h%OC7!G=uJm&X8b>qP#wML=uJm& zqVk4G{p{#X4fUa;HyyqCf9TD$7UU}Kzgp0aSz6GJSz6GJfx0e>-VA5ao0%94Wnx|% zHnrZYb@ZmAHyyp{=uJ#T&i$ryzvI`^B-{pO>2znRu}cFjmljc3O!EojFqEojF; zeIbk9+>}LcW@0duiFs|<)OvHXqcF7;IZ(=HP^roXX9lh!3O-FA&ntC&>@$8zB Rni|iJSz6GJSz3@|{{p#ZAFu!b diff --git a/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_enable.bin deleted file mode 100644 index 7cc1818bce4d6ce1226fc5fda519967a4842b99e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 104 zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>)) B1+xGE diff --git a/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_init.bin deleted file mode 100644 index 
d4549ba181167c6e6d7475f23335e34fea2c3376..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6032 zcmb7{&rak<5XQ@%hINw3u1Uy>EJ*QcI7AT#M?)kINHH2BaX`vMPg)mN>ymUudP@3&^d;#l($}PKNY6;mN#BybCv6+u_35mqPjyW9=@;sg z)`iu&Bwdl7lD;5)N&1TPHR&7DGtzU?x1{e$Thh7gr0z}C-Y@95Kj1=K_e*vRasT2A z`KGw%NtC~$_drR!Af6K!;#1rYonOBXO5z3Gubj9LpW;_N?bzRq9a$@;UpUZ4Mwdf~w*)RN!T zTGw0O+2D~L&k!+PZVspqp^>qxyLCZUYYv{=)g)M&yk)HtOi1`aSl`pz$Z~{VHC> z{wm~8zhkS{>@Pz8!5=I0ho0@PV=wrGo|TPu3xmfX=5+8#k7tP3R^-WgJe!Ecz0Z3* zJBWQ9<=Kinq2IpGzOohbc{t*JyYq{${(e{F$2@C)+CR@F|Df@&=r{W@PpQw>ig|vl z-i6*{KG=BId(6k{cs_@S#l1i1@f;%-&#>z8oFW$Y{-no)v!Lz|&O*DNXg+CuAD%p) z+W++V)c&W>r}jU6KIt>&-QrcKx@>wrfeNOYm3=@d_NZ zw|+x>4;)3bJR^PpjtW{nB))G4Db?$G()v+vO7%KEZ|nOoNU7cwf84d}i)35nL%m9) zUZqj5n{~S$^(u~f-3(ifdKE{#Db=fZZU-ih)>n&xd&hVAz^Pux=TW^Tu6I|q)q2#U zHTbJ2{~>V${xaeR#53^EBfd|32(IcOZtO_bqxD;h5z6W6t-#g!Bz^*}&xc-r46f!) z>of4z(euuUpMtA-6E}8D`NwMi``~*2njeCH5^6!G{ z`D%{yOZ(S+3a%8O|3dRUJE8m&wZAQJonLdDk5s*%7Mkzakr_t)DHkL4{@Vhl z{wcl*uJh~l1-L$+%*XWnG36iY_v1oy)KAZ!`1S4I2o|Ro1qz~3EajN7i78>jEaZQL C^4EL- diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.elf deleted file mode 100755 index 33f5143237b2608b0fcdae4c633076f2bc60cb03..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4132 zcma)9eN0=|6+h3<_&Hgquld?7gz`d2a2f+PI0>esHH=kJ=AdL-BDO^}#x}tU7={gV zHf4DYv}u||B~c^Ys*TDjbB~+YJ8r|;nZee ztx1e(p=~m8%KUeI)DHz2WGPM65Hu+T8xIJHk4$F(e%8Gc9z!f-y|c@rg;h!x)@rk` z(rp&DYm0>)sM~@IsJUI^?8I0o-m^cY+_3|bfuv=H(%(Tdb-^>LE#{8Vv5}<3a6MTdBu=HVZxg!9 zGPLP4%R<|eQwKblp9iq!!1Kc@sKiRIp?>GoclJ)J0vU#+414M`t%#~;fSEC|d}eH{ zoRz-6j=$@V=I;gc_kvi#O6SquzD9hvP+zxxe5cXh=`r;8G1}L!;V*^yhV}iOL4Rj> zz6)r-R{j2k`bVhW*|GAOvm(a#U$nPReeb}-*CQ=Yl=FNKcdfG?=f#Sd^O$cz8R7YM zb$^vVQo-~6LEk$5UKy*Hc||N|1?A|^Hg#m^EB#@7JEoqC&uaUn=nHz`eTiPUj%^;> zEo>iSOG)#c2KnML{#F;=o$su!xp+5TyQLyt>)af#ZL5se-k|fHMOzPRuK*V^I-@@v?Xcskv3A@jfGDo9Anc-n1-R8``w 
zFOl18kRh@C%On3jm#AM_ioFcVr4J+vOvm3AjIdd~J-!WO9B&5$4^g9Rd@l*9Dm^|A zMla|&vTg5@SpqgeMss1d3;OzDBiI&ayYBVX+4v1=-|;D)a{C|8o2u`!y16p8c$F|2 zoAMrE-@jIbwY-Iq~frn680Q>6`uzi^gY$7aVw%TimVGFRwy;iJ6x9Cv)v9fW(>WpM|A~T59#h5x^ zQUry$Ojfq@8rpg$@P-EpUk6s`v9bcp@$Dp8Ykn#L;q|X6HmW&PczF)Z`kZ{`wZ^jo zB|Co3X3mbsu?Z1xV3m0O$&;8vN;K=Tb^Mh+Uv%-HM+Pt%*|Ckd7d*E_Tt$`Ce5q|x zvj6;?U;-v6R@VA5VXat!!f`?u_gUGtIoz>`4`H>kRRJ@vib`5NFypzcMr}Xg9I2I< zAB<1`{;`rj5k`R5laIFp%dmouI)o^Ni(2mRf*pD))gYzsESg2Ujykhz}U=uRq4G2qtYlX5VAA zU#);@0qaTV(zul_J%P1r3Mtw`ufm>!D@f=oFLs}C*C6LM2(eMyo5D$SXtFK39~3A ztm;e&J27dciE~!wz;!2v66V4FL^q*{cEX^Y@H*LrjY6@3wYH%Kr6EAeQ#@V|@E43$ zk6HEp!|d4QcPxgX#ovmF(`j~eS+X099+961j1R|X_WJ(dg`)1cl}8uQ*WP)^`gp^U zW1-;|+3W3)o$h)^J)ZACzdv$R9t{mg!-Ek`D%<*Ev7u;7L&MQ$RBKSxOcmngQ38lpexwq>}l$81)T0s(CzAV@8Hqh zPeL9v!ktVp;$%)HKn3!tJiIl>^Vidz41YOw^9O2{1bok;JwB^c65t!iB=Ynfh?BYM z2(TZqBlmz6;4s(cVIN{S&%O^a$D=T;jVG`rxKDa+oTR>UEU(4bZ`x?YNKHa_nG3Pn)@h-%27V!sw0Bx)A zcUR%(R^iAh{L@wVMZ{oK7l`L~mSfKfp5Rzn!B8Iy#bdy?`9LUwO!W9b%-@53Uohbja%0=>WnUF)Fq8B-CwiLxOoV{Fb^Rx9COwOcqqLik$|r! zbTk}sg3pJCxpBxB4fy;0!@k~Ncob!7+z=QX4*49uM(7#o&Hgw!=THsMD>BiTe>eud zD0<+WZ`9vE67qHI=P+;#d;|V)#21c)W57A^1!9B4QGFDb`~1Vh{^PntRfi%$$PUt! zSsctCsRAy?fOhd87yoI>?mG`I@1qjm3cRVi0k;M6{Qs{`L}U2w=5`JZ%lO^=vBvXG z_rvWCd2)t!lPKWV5r6jLk<9+D#=9HvIOeFi5MWOZss1>u#+Gk4h<2CwWE$!0lZZL5 PXmTU|tLAg+dcFTY7d$5= diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script deleted file mode 100644 index fc4f0cf..0000000 --- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script +++ /dev/null @@ -1,72 +0,0 @@ - -MEMORY -{ - program (RX) : ORIGIN = 0, LENGTH = 0x0020000 - data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 -} -ENTRY(__start) -SECTIONS -{ - . = 0x0; - .text : { - /* the __start symbol has to come at address zero. 
*/ - *crt0.o(.text*) - _ctors_start = .; - _init_array_start = .; - KEEP(SORT(*.init_array)) - _ctors_end = .; - _init_array_end = .; - _dtors_start = .; - _dtors_end = .; - *(.text*) - } > program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -/* No tile with memory exists to the south. */ -. = 0x40000; -. += 0x10000; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -. = 0x60400; -buf5 = .; -. += 0x400; -. = 0x64000; -buf4 = .; -. += 0x400; -. = 0x68000; -buf3 = .; -. += 0x400; -. = 0x70400; -buf2 = .; -. += 0x400; -. = 0x74000; -buf1 = .; -. += 0x400; -. = 0x78000; -buf0 = .; -. += 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_2); diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll deleted file mode 100644 index bf98238..0000000 --- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll +++ /dev/null @@ -1,158 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -; Function Attrs: noinline -define float @__aie2p_scalar_fdiv(float %0, float 
%1) #0 { - %3 = call float @llvm.aie2p.inv(float %1) - %4 = fmul float %0, %3 - ret float %4 -} - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_2() { - br label %1 - -1: ; preds = %76, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %75, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %76 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf2, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = getelementptr float, ptr @buf1, i32 %3 - %9 = load <16 x float>, ptr %8, align 4 - %10 = extractelement <16 x float> %7, i64 0 - %11 = extractelement <16 x float> %9, i64 0 - %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) - %13 = insertelement <16 x float> poison, float %12, i64 0 - %14 = extractelement <16 x float> %7, i64 1 - %15 = extractelement <16 x float> %9, i64 1 - %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) - %17 = insertelement <16 x float> %13, float %16, i64 1 - %18 = extractelement <16 x float> %7, i64 2 - %19 = extractelement <16 x float> %9, i64 2 - %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) - %21 = insertelement <16 x float> %17, float %20, i64 2 - %22 = extractelement <16 x float> %7, i64 3 - %23 = extractelement <16 x float> %9, i64 3 - %24 = call float @__aie2p_scalar_fdiv(float %22, 
float %23) - %25 = insertelement <16 x float> %21, float %24, i64 3 - %26 = extractelement <16 x float> %7, i64 4 - %27 = extractelement <16 x float> %9, i64 4 - %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) - %29 = insertelement <16 x float> %25, float %28, i64 4 - %30 = extractelement <16 x float> %7, i64 5 - %31 = extractelement <16 x float> %9, i64 5 - %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) - %33 = insertelement <16 x float> %29, float %32, i64 5 - %34 = extractelement <16 x float> %7, i64 6 - %35 = extractelement <16 x float> %9, i64 6 - %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) - %37 = insertelement <16 x float> %33, float %36, i64 6 - %38 = extractelement <16 x float> %7, i64 7 - %39 = extractelement <16 x float> %9, i64 7 - %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) - %41 = insertelement <16 x float> %37, float %40, i64 7 - %42 = extractelement <16 x float> %7, i64 8 - %43 = extractelement <16 x float> %9, i64 8 - %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) - %45 = insertelement <16 x float> %41, float %44, i64 8 - %46 = extractelement <16 x float> %7, i64 9 - %47 = extractelement <16 x float> %9, i64 9 - %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) - %49 = insertelement <16 x float> %45, float %48, i64 9 - %50 = extractelement <16 x float> %7, i64 10 - %51 = extractelement <16 x float> %9, i64 10 - %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) - %53 = insertelement <16 x float> %49, float %52, i64 10 - %54 = extractelement <16 x float> %7, i64 11 - %55 = extractelement <16 x float> %9, i64 11 - %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) - %57 = insertelement <16 x float> %53, float %56, i64 11 - %58 = extractelement <16 x float> %7, i64 12 - %59 = extractelement <16 x float> %9, i64 12 - %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) - %61 = insertelement <16 x float> %57, float %60, i64 12 - %62 = extractelement 
<16 x float> %7, i64 13 - %63 = extractelement <16 x float> %9, i64 13 - %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) - %65 = insertelement <16 x float> %61, float %64, i64 13 - %66 = extractelement <16 x float> %7, i64 14 - %67 = extractelement <16 x float> %9, i64 14 - %68 = call float @__aie2p_scalar_fdiv(float %66, float %67) - %69 = insertelement <16 x float> %65, float %68, i64 14 - %70 = extractelement <16 x float> %7, i64 15 - %71 = extractelement <16 x float> %9, i64 15 - %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) - %73 = insertelement <16 x float> %69, float %72, i64 15 - %74 = getelementptr float, ptr @buf0, i32 %3 - store <16 x float> %73, ptr %74, align 4 - %75 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -76: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare float @llvm.aie2p.inv(float) - -attributes #0 = { noinline } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.o deleted file mode 100644 index 728d41f24858dcddcc40661dd9b21afbd5fbd4eb..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2048 zcma)7VQ3R)7=AC8G*_738AWV00Qk?Tg4MC=qDWzIhp- z5}hqfNRLb!561-+RBx35C^iJZpp|!YX0-ZI+GPhqCdLU*59`I#68bemtf?L4@(YH+Z9fbqCh38@(TFwBaycj6u^8v$ur;gvk`uxtL z-+3`;_{*pduQ9*-$hX&@-xT^y<a{ie7Lot7zTYMEyF}})pnk3U_YC=U+`r5D zK~Jg$hr(iL2Y>gpdMMhghwia*%=gj8svf!?&_llj^`Gaq=uiJd=0S7i?(McdHdosG zBGA>KUaVGPg&QK96e8AD@yL{j`Tk(}m#ZNPJ^;S28NT^U>gR6{lV-7+n9G}{3LR`TUgP@v9|RW9QdpQ|HpwZIq+?+ z6`u&!w$GRYpK;*7JMcvZp2hnl&<7l25_3CW@2`ZDjzSwvrIT7VrDwH7A~}@m8cAdZ zlIi4VV(|3P7?=!bXNR*X?Tgsa_DnW8@D23#9y@$sD49+TWDgIY&ZSeJWwN8$, ptr %5, align 64 - %7 = 
getelementptr float, ptr @buf1, i20 %4 - %8 = load <16 x float>, ptr %7, align 64 - %9 = extractelement <16 x float> %6, i64 0 - %10 = extractelement <16 x float> %8, i64 0 - %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10) - %12 = insertelement <16 x float> poison, float %11, i64 0 - %13 = extractelement <16 x float> %6, i64 1 - %14 = extractelement <16 x float> %8, i64 1 - %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14) - %16 = insertelement <16 x float> %12, float %15, i64 1 - %17 = extractelement <16 x float> %6, i64 2 - %18 = extractelement <16 x float> %8, i64 2 - %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18) - %20 = insertelement <16 x float> %16, float %19, i64 2 - %21 = extractelement <16 x float> %6, i64 3 - %22 = extractelement <16 x float> %8, i64 3 - %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22) - %24 = insertelement <16 x float> %20, float %23, i64 3 - %25 = extractelement <16 x float> %6, i64 4 - %26 = extractelement <16 x float> %8, i64 4 - %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26) - %28 = insertelement <16 x float> %24, float %27, i64 4 - %29 = extractelement <16 x float> %6, i64 5 - %30 = extractelement <16 x float> %8, i64 5 - %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30) - %32 = insertelement <16 x float> %28, float %31, i64 5 - %33 = extractelement <16 x float> %6, i64 6 - %34 = extractelement <16 x float> %8, i64 6 - %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34) - %36 = insertelement <16 x float> %32, float %35, i64 6 - %37 = extractelement <16 x float> %6, i64 7 - %38 = extractelement <16 x float> %8, i64 7 - %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38) - %40 = insertelement <16 x float> %36, float %39, i64 7 - %41 = extractelement <16 x float> %6, i64 8 - %42 = extractelement <16 x float> %8, i64 8 - %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42) - %44 = insertelement <16 x 
float> %40, float %43, i64 8 - %45 = extractelement <16 x float> %6, i64 9 - %46 = extractelement <16 x float> %8, i64 9 - %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46) - %48 = insertelement <16 x float> %44, float %47, i64 9 - %49 = extractelement <16 x float> %6, i64 10 - %50 = extractelement <16 x float> %8, i64 10 - %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50) - %52 = insertelement <16 x float> %48, float %51, i64 10 - %53 = extractelement <16 x float> %6, i64 11 - %54 = extractelement <16 x float> %8, i64 11 - %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54) - %56 = insertelement <16 x float> %52, float %55, i64 11 - %57 = extractelement <16 x float> %6, i64 12 - %58 = extractelement <16 x float> %8, i64 12 - %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58) - %60 = insertelement <16 x float> %56, float %59, i64 12 - %61 = extractelement <16 x float> %6, i64 13 - %62 = extractelement <16 x float> %8, i64 13 - %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62) - %64 = insertelement <16 x float> %60, float %63, i64 13 - %65 = extractelement <16 x float> %6, i64 14 - %66 = extractelement <16 x float> %8, i64 14 - %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66) - %68 = insertelement <16 x float> %64, float %67, i64 14 - %69 = extractelement <16 x float> %6, i64 15 - %70 = extractelement <16 x float> %8, i64 15 - %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70) - %72 = insertelement <16 x float> %68, float %71, i64 15 - %73 = getelementptr float, ptr @buf0, i20 %4 - store <16 x float> %72, ptr %73, align 64 - %74 = add nuw nsw i32 %3, 16 - %75 = icmp ult i32 %3, 240 - br i1 %75, label %2, label %76, !llvm.loop !1 - -76: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 53, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: nofree nosync nounwind 
memory(none) -declare float @llvm.aie2p.inv(float) #3 - -attributes #0 = { nofree noinline nosync nounwind memory(none) } -attributes #1 = { nounwind } -attributes #2 = { noreturn nounwind } -attributes #3 = { nofree nosync nounwind memory(none) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll deleted file mode 100644 index 61bace1..0000000 --- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll +++ /dev/null @@ -1,158 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -; Function Attrs: noinline -define float @__aie2p_scalar_fdiv(float %0, float %1) #0 { - %3 = call float @llvm.aie2p.inv(float %1) - %4 = fmul float %0, %3 - ret float %4 -} - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void 
@llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_2() { - br label %1 - -1: ; preds = %76, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %75, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %76 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf2, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = getelementptr float, ptr @buf1, i32 %3 - %9 = load <16 x float>, ptr %8 - %10 = extractelement <16 x float> %7, i64 0 - %11 = extractelement <16 x float> %9, i64 0 - %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) - %13 = insertelement <16 x float> poison, float %12, i64 0 - %14 = extractelement <16 x float> %7, i64 1 - %15 = extractelement <16 x float> %9, i64 1 - %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) - %17 = insertelement <16 x float> %13, float %16, i64 1 - %18 = extractelement <16 x float> %7, i64 2 - %19 = extractelement <16 x float> %9, i64 2 - %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) - %21 = insertelement <16 x float> %17, float %20, i64 2 - %22 = extractelement <16 x float> %7, i64 3 - %23 = extractelement <16 x float> %9, i64 3 - %24 = call float @__aie2p_scalar_fdiv(float %22, float %23) - %25 = insertelement <16 x float> %21, float %24, i64 3 - %26 = extractelement <16 x float> %7, i64 4 - %27 = extractelement <16 x float> %9, i64 4 - %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) - %29 = insertelement <16 x float> %25, float %28, i64 4 - %30 = extractelement <16 x float> %7, i64 5 - %31 = extractelement <16 x float> %9, i64 5 - %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) - %33 = insertelement <16 x float> %29, float %32, i64 5 - %34 = extractelement <16 x float> 
%7, i64 6 - %35 = extractelement <16 x float> %9, i64 6 - %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) - %37 = insertelement <16 x float> %33, float %36, i64 6 - %38 = extractelement <16 x float> %7, i64 7 - %39 = extractelement <16 x float> %9, i64 7 - %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) - %41 = insertelement <16 x float> %37, float %40, i64 7 - %42 = extractelement <16 x float> %7, i64 8 - %43 = extractelement <16 x float> %9, i64 8 - %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) - %45 = insertelement <16 x float> %41, float %44, i64 8 - %46 = extractelement <16 x float> %7, i64 9 - %47 = extractelement <16 x float> %9, i64 9 - %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) - %49 = insertelement <16 x float> %45, float %48, i64 9 - %50 = extractelement <16 x float> %7, i64 10 - %51 = extractelement <16 x float> %9, i64 10 - %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) - %53 = insertelement <16 x float> %49, float %52, i64 10 - %54 = extractelement <16 x float> %7, i64 11 - %55 = extractelement <16 x float> %9, i64 11 - %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) - %57 = insertelement <16 x float> %53, float %56, i64 11 - %58 = extractelement <16 x float> %7, i64 12 - %59 = extractelement <16 x float> %9, i64 12 - %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) - %61 = insertelement <16 x float> %57, float %60, i64 12 - %62 = extractelement <16 x float> %7, i64 13 - %63 = extractelement <16 x float> %9, i64 13 - %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) - %65 = insertelement <16 x float> %61, float %64, i64 13 - %66 = extractelement <16 x float> %7, i64 14 - %67 = extractelement <16 x float> %9, i64 14 - %68 = call float @__aie2p_scalar_fdiv(float %66, float %67) - %69 = insertelement <16 x float> %65, float %68, i64 14 - %70 = extractelement <16 x float> %7, i64 15 - %71 = extractelement <16 x float> %9, i64 15 - %72 = call float 
@__aie2p_scalar_fdiv(float %70, float %71) - %73 = insertelement <16 x float> %69, float %72, i64 15 - %74 = getelementptr float, ptr @buf0, i32 %3 - store <16 x float> %73, ptr %74 - %75 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -76: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare float @llvm.aie2p.inv(float) - -attributes #0 = { noinline } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.elf deleted file mode 100755 index a92e4ae85366d8708c15091ef67871349c6409af..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4192 zcma)9eQZp-&G1j-|P#1tp7V*@cAtwXa4F@sLfIp>~x&bc4He*WvhXC+AjgG^v6A|DF3*#XSh`#=E`I6(#rkhO}y>gFAn*QIQa@WHn`|!hm7C?Z z`Wj@AxNWP>xtOKWeAo4)dfNp;0hU)(&VL83+y_rnwwIKE@(@7NIf`v~ou=JA(9ee=Tpj-$Wh zl4vuD>4vNP+Rmy~kLJ@2uh*s< zepQ$L<;1e|Ki@^oUzgwd^M&;td}4gbC$%kB<>maXotZbxd{_$f>l+t13()zYQ}uz# z+6Gks`o88M;pyqrPY<_^A~y`sNh7%pbUEh7z-9tPKMxnkGeUH8M^rw`M5W61DZd}v z`1wA?pP9&u8kJ33szKdzT~=}~={0M6P6UIF&0%1_hDS0_^OW63&MxU%VgihA7<<7u71%kIepMX4Z)3Omv$b`_854iO*3O9DN0^A&9In;Ov=YQ)K?l~N)9lv#iJD<18ZB&MWVR2h zi!rsss2Nn@vpU53%V--I!5bbpd=*4xz#%Fy$G5U4#FfiE#R;Eg{m{V0}6oEA`ULby%4xz##Piv z%a=PxW!KM6N>&h(>JVFAVPXqbpmLb;sqGH2W)gQS;yqZcLRG*L)VxBeKvDp~dU zglm(-b*T>OC9Ego)58uv{W#XHjjG>o(*3l9VpX3PjhEO3^!-yhBe(;(JK*3So0*WZ zOT_Z+4&Lziy!$P$RCCN1Jv{h)>3WKv~;F*&S;l_1mrllf){gF<_Gh3N}t&HmAk`_x#OkztLYET*jv^+}j z27q2LdOaqz`wz1d=iauP`lo(t&K%2&12eMAWC@t*JYak{Mz=Ti2NjC@Crghm;IF^^ zfb|K+69;1ho0VX&UGeyv+)a4CBR%270p(C^AQ|sV=u*Ymol5m5H#avQh^M*-yP6_> zz0Do*o_HeN+|zTYw=o=#DP7)3)a{RYy|HlA>-KN-wMJu+O;KOe>*?}#`63>FEb8~| 
z_phVq?j@-VTHyBV62#e}OoBS(b7go{k*C+w?OA#`b*FpKe#5AtW$7f{jhR|>dfzZx;cXV^C*CZ34K`w%MyL=ON7 z+UDSI&B4#j!HGHeM|1E?h{2+52+HpS;XnzG5LQe0Y{6bzP_q9?!M=pg7wk(Ino_ZJ z3g}q$#uCUR5BH|RUD$UgQv(`T?8A|Rp=A7ru_Qn+xOYc;JlqqDq;~X02YX`B6x_Pi z6Y}VN3-k{5B=>upA_ajvv_bE^dcR5UTMU86txG(HsmHMK8=4k_F(fU9q{V0WYc&`@ zg%FKX2$6x5rzt|A>>o@-LS3-~@q`CLAw2vo{h?$e+!G!M?T^L}p-hV#B7Fm~kUQjs zuEG5q=;Nc0S2IAb$RtzYffR(2=z+M

WS&EY!Y(VB{c#dc*NVD4vL?fH(+6QhftS zV-(3l;emnhVMC&+V~Hpf2Id?T nz@{S7{1L3jR&IA5?S4*^>141^ASPbb< program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -. = 0x40400; -buf2 = .; -. += 0x400; -. = 0x44000; -buf1 = .; -. += 0x400; -. = 0x48000; -buf0 = .; -. += 0x400; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -. = 0x60400; -buf8 = .; -. += 0x400; -. = 0x64000; -buf7 = .; -. += 0x400; -. = 0x68000; -buf6 = .; -. += 0x400; -. = 0x70400; -buf5 = .; -. += 0x400; -. = 0x74000; -buf4 = .; -. += 0x400; -. = 0x78000; -buf3 = .; -. += 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_3); diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll deleted file mode 100644 index 666390f..0000000 --- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll +++ /dev/null @@ -1,158 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -; Function Attrs: noinline -define float @__aie2p_scalar_fdiv(float %0, float %1) #0 { - %3 = call float @llvm.aie2p.inv(float %1) - %4 = 
fmul float %0, %3 - ret float %4 -} - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_3() { - br label %1 - -1: ; preds = %76, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %75, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %76 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf5, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = getelementptr float, ptr @buf4, i32 %3 - %9 = load <16 x float>, ptr %8, align 4 - %10 = extractelement <16 x float> %7, i64 0 - %11 = extractelement <16 x float> %9, i64 0 - %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) - %13 = insertelement <16 x float> poison, float %12, i64 0 - %14 = extractelement <16 x float> %7, i64 1 - %15 = extractelement <16 x float> %9, i64 1 - %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) - %17 = insertelement <16 x float> %13, float %16, i64 1 - %18 = extractelement <16 x float> %7, i64 2 - %19 = extractelement <16 x float> %9, i64 2 - %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) - %21 = insertelement <16 x float> %17, float %20, i64 2 - %22 = extractelement <16 x float> %7, i64 3 - %23 = extractelement <16 x float> %9, i64 3 - %24 = call float @__aie2p_scalar_fdiv(float %22, float %23) - %25 = insertelement <16 x float> %21, float 
%24, i64 3 - %26 = extractelement <16 x float> %7, i64 4 - %27 = extractelement <16 x float> %9, i64 4 - %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) - %29 = insertelement <16 x float> %25, float %28, i64 4 - %30 = extractelement <16 x float> %7, i64 5 - %31 = extractelement <16 x float> %9, i64 5 - %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) - %33 = insertelement <16 x float> %29, float %32, i64 5 - %34 = extractelement <16 x float> %7, i64 6 - %35 = extractelement <16 x float> %9, i64 6 - %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) - %37 = insertelement <16 x float> %33, float %36, i64 6 - %38 = extractelement <16 x float> %7, i64 7 - %39 = extractelement <16 x float> %9, i64 7 - %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) - %41 = insertelement <16 x float> %37, float %40, i64 7 - %42 = extractelement <16 x float> %7, i64 8 - %43 = extractelement <16 x float> %9, i64 8 - %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) - %45 = insertelement <16 x float> %41, float %44, i64 8 - %46 = extractelement <16 x float> %7, i64 9 - %47 = extractelement <16 x float> %9, i64 9 - %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) - %49 = insertelement <16 x float> %45, float %48, i64 9 - %50 = extractelement <16 x float> %7, i64 10 - %51 = extractelement <16 x float> %9, i64 10 - %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) - %53 = insertelement <16 x float> %49, float %52, i64 10 - %54 = extractelement <16 x float> %7, i64 11 - %55 = extractelement <16 x float> %9, i64 11 - %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) - %57 = insertelement <16 x float> %53, float %56, i64 11 - %58 = extractelement <16 x float> %7, i64 12 - %59 = extractelement <16 x float> %9, i64 12 - %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) - %61 = insertelement <16 x float> %57, float %60, i64 12 - %62 = extractelement <16 x float> %7, i64 13 - %63 = extractelement <16 x 
float> %9, i64 13 - %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) - %65 = insertelement <16 x float> %61, float %64, i64 13 - %66 = extractelement <16 x float> %7, i64 14 - %67 = extractelement <16 x float> %9, i64 14 - %68 = call float @__aie2p_scalar_fdiv(float %66, float %67) - %69 = insertelement <16 x float> %65, float %68, i64 14 - %70 = extractelement <16 x float> %7, i64 15 - %71 = extractelement <16 x float> %9, i64 15 - %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) - %73 = insertelement <16 x float> %69, float %72, i64 15 - %74 = getelementptr float, ptr @buf3, i32 %3 - store <16 x float> %73, ptr %74, align 4 - %75 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -76: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare float @llvm.aie2p.inv(float) - -attributes #0 = { noinline } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.o deleted file mode 100644 index 0cbf9dffbcb51eefa1ba59e76899a9a17c429aac..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2048 zcma)7VQ3R)7=AC8G*_73EhFVyV&Uwj>DDxBYb`R7CB<$2s38oMGB>Yjjayk z@AEwO-1psm_uc1j4!tEw61X9OB2)lY3xJGdzb)v5R#?*#v=Fz6l_)Up7uq!>6|nX# zssOdbeCd>O+orKQD(Rs6>kL41Api!QyrL`X4S>%^x?n2`P>r6}53*{t`&DaSr)mXv zsMhZ7s&%+swF)n)*0(|R_2;1n#ifBe(;AdynE7LiKLMEcVE%Wj-=jp4UT!d@qYV=b zrVR2ds(H7_|3>%DJi`0vpw;ym#;(8~)(>wuTCcrStFWkgmKJ?d=~NxZgf!CDV7nfx zwZyCCR2Hs?A{0(dGW4}{6&H$C79I6zHAuD*jiQm%Z z{7$3aX*p~KtEi7|Fu$9~cQ>Ek8T30-M88|8k8R+$jQnfO{m!D_Sz2!m^&36E`^azN z`JF3Cj$cDg-4CAu}-aGKHDKD)X(vpGyyz$K3L_PgX2?PFXEvT03Qsm(sBKKh4tvL98%3GbB<|}!fyhH=f?+UnmByxN}TGOX6clBQi_`HDsDc~yt 
zzSFbj6UW^3nHKOl0smRRmjyhJ&q<;K9Ag&8Zob~12qzsyY%-Hg$MP97A2W>fc&0CB zOpT?p=}F_n@$qS}8Hjy6kUM!)oGfIgFcEGyHZhqodJM#d1_yhMK0fw3^V&pz05yDhw%UvT zt8aws=#f(B*eFNZjyNCPLE{PW>vgmj-EeiJp;sQ~#0nO)cWDdJe2AmEfKBw9!J;, ptr %5, align 64 - %7 = getelementptr float, ptr @buf4, i20 %4 - %8 = load <16 x float>, ptr %7, align 64 - %9 = extractelement <16 x float> %6, i64 0 - %10 = extractelement <16 x float> %8, i64 0 - %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10) - %12 = insertelement <16 x float> poison, float %11, i64 0 - %13 = extractelement <16 x float> %6, i64 1 - %14 = extractelement <16 x float> %8, i64 1 - %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14) - %16 = insertelement <16 x float> %12, float %15, i64 1 - %17 = extractelement <16 x float> %6, i64 2 - %18 = extractelement <16 x float> %8, i64 2 - %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18) - %20 = insertelement <16 x float> %16, float %19, i64 2 - %21 = extractelement <16 x float> %6, i64 3 - %22 = extractelement <16 x float> %8, i64 3 - %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22) - %24 = insertelement <16 x float> %20, float %23, i64 3 - %25 = extractelement <16 x float> %6, i64 4 - %26 = extractelement <16 x float> %8, i64 4 - %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26) - %28 = insertelement <16 x float> %24, float %27, i64 4 - %29 = extractelement <16 x float> %6, i64 5 - %30 = extractelement <16 x float> %8, i64 5 - %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30) - %32 = insertelement <16 x float> %28, float %31, i64 5 - %33 = extractelement <16 x float> %6, i64 6 - %34 = extractelement <16 x float> %8, i64 6 - %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34) - %36 = insertelement <16 x float> %32, float %35, i64 6 - %37 = extractelement <16 x float> %6, i64 7 - %38 = extractelement <16 x float> %8, i64 7 - %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38) - %40 = 
insertelement <16 x float> %36, float %39, i64 7 - %41 = extractelement <16 x float> %6, i64 8 - %42 = extractelement <16 x float> %8, i64 8 - %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42) - %44 = insertelement <16 x float> %40, float %43, i64 8 - %45 = extractelement <16 x float> %6, i64 9 - %46 = extractelement <16 x float> %8, i64 9 - %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46) - %48 = insertelement <16 x float> %44, float %47, i64 9 - %49 = extractelement <16 x float> %6, i64 10 - %50 = extractelement <16 x float> %8, i64 10 - %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50) - %52 = insertelement <16 x float> %48, float %51, i64 10 - %53 = extractelement <16 x float> %6, i64 11 - %54 = extractelement <16 x float> %8, i64 11 - %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54) - %56 = insertelement <16 x float> %52, float %55, i64 11 - %57 = extractelement <16 x float> %6, i64 12 - %58 = extractelement <16 x float> %8, i64 12 - %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58) - %60 = insertelement <16 x float> %56, float %59, i64 12 - %61 = extractelement <16 x float> %6, i64 13 - %62 = extractelement <16 x float> %8, i64 13 - %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62) - %64 = insertelement <16 x float> %60, float %63, i64 13 - %65 = extractelement <16 x float> %6, i64 14 - %66 = extractelement <16 x float> %8, i64 14 - %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66) - %68 = insertelement <16 x float> %64, float %67, i64 14 - %69 = extractelement <16 x float> %6, i64 15 - %70 = extractelement <16 x float> %8, i64 15 - %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70) - %72 = insertelement <16 x float> %68, float %71, i64 15 - %73 = getelementptr float, ptr @buf3, i20 %4 - store <16 x float> %72, ptr %73, align 64 - %74 = add nuw nsw i32 %3, 16 - %75 = icmp ult i32 %3, 240 - br i1 %75, label %2, label %76, !llvm.loop !1 - 
-76: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 53, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: nofree nosync nounwind memory(none) -declare float @llvm.aie2p.inv(float) #3 - -attributes #0 = { nofree noinline nosync nounwind memory(none) } -attributes #1 = { nounwind } -attributes #2 = { noreturn nounwind } -attributes #3 = { nofree nosync nounwind memory(none) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll deleted file mode 100644 index 0c167b0..0000000 --- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll +++ /dev/null @@ -1,158 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -; Function Attrs: noinline -define float @__aie2p_scalar_fdiv(float %0, float %1) #0 { - %3 = call float @llvm.aie2p.inv(float %1) - %4 = fmul float %0, %3 - ret float %4 -} - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic 
-declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_3() { - br label %1 - -1: ; preds = %76, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %75, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %76 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf5, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = getelementptr float, ptr @buf4, i32 %3 - %9 = load <16 x float>, ptr %8 - %10 = extractelement <16 x float> %7, i64 0 - %11 = extractelement <16 x float> %9, i64 0 - %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) - %13 = insertelement <16 x float> poison, float %12, i64 0 - %14 = extractelement <16 x float> %7, i64 1 - %15 = extractelement <16 x float> %9, i64 1 - %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) - %17 = insertelement <16 x float> %13, float %16, i64 1 - %18 = extractelement <16 x float> %7, i64 2 - %19 = extractelement <16 x float> %9, i64 2 - %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) - %21 = insertelement <16 x float> %17, float %20, i64 2 - %22 = extractelement <16 x float> %7, i64 3 - %23 = extractelement <16 x float> %9, i64 3 - %24 = call float @__aie2p_scalar_fdiv(float %22, float %23) - %25 = insertelement <16 x float> %21, float %24, i64 3 - %26 = extractelement <16 x float> %7, i64 4 - %27 = extractelement <16 x float> %9, i64 4 - %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) - %29 = insertelement <16 x float> %25, float %28, i64 4 - %30 = 
extractelement <16 x float> %7, i64 5 - %31 = extractelement <16 x float> %9, i64 5 - %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) - %33 = insertelement <16 x float> %29, float %32, i64 5 - %34 = extractelement <16 x float> %7, i64 6 - %35 = extractelement <16 x float> %9, i64 6 - %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) - %37 = insertelement <16 x float> %33, float %36, i64 6 - %38 = extractelement <16 x float> %7, i64 7 - %39 = extractelement <16 x float> %9, i64 7 - %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) - %41 = insertelement <16 x float> %37, float %40, i64 7 - %42 = extractelement <16 x float> %7, i64 8 - %43 = extractelement <16 x float> %9, i64 8 - %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) - %45 = insertelement <16 x float> %41, float %44, i64 8 - %46 = extractelement <16 x float> %7, i64 9 - %47 = extractelement <16 x float> %9, i64 9 - %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) - %49 = insertelement <16 x float> %45, float %48, i64 9 - %50 = extractelement <16 x float> %7, i64 10 - %51 = extractelement <16 x float> %9, i64 10 - %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) - %53 = insertelement <16 x float> %49, float %52, i64 10 - %54 = extractelement <16 x float> %7, i64 11 - %55 = extractelement <16 x float> %9, i64 11 - %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) - %57 = insertelement <16 x float> %53, float %56, i64 11 - %58 = extractelement <16 x float> %7, i64 12 - %59 = extractelement <16 x float> %9, i64 12 - %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) - %61 = insertelement <16 x float> %57, float %60, i64 12 - %62 = extractelement <16 x float> %7, i64 13 - %63 = extractelement <16 x float> %9, i64 13 - %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) - %65 = insertelement <16 x float> %61, float %64, i64 13 - %66 = extractelement <16 x float> %7, i64 14 - %67 = extractelement <16 x float> %9, i64 14 - 
%68 = call float @__aie2p_scalar_fdiv(float %66, float %67) - %69 = insertelement <16 x float> %65, float %68, i64 14 - %70 = extractelement <16 x float> %7, i64 15 - %71 = extractelement <16 x float> %9, i64 15 - %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) - %73 = insertelement <16 x float> %69, float %72, i64 15 - %74 = getelementptr float, ptr @buf3, i32 %3 - store <16 x float> %73, ptr %74 - %75 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -76: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare float @llvm.aie2p.inv(float) - -attributes #0 = { noinline } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.elf deleted file mode 100755 index b9ef327d81371a030c01aa887037ee689da34b56..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4196 zcma)9eQZp-&G1e!P`05uj%|oV;$*SI z%qGSwNlPi922`Y5G1RnDTTPlYXWCelzq~adTD2RfAW^N8Jj%39TeTpylA8vybNt@3 zO`r)!y6>LfIp>~x&bc4HKJrZ9SxJ(>AQRXM$cw^&9l(se7Zfmo6J)ReS*zF#8eIfH ztqcVW+pd2kBZjWDx0V8(w@4r?&<@!}azVRh8NkhLW(B5PFyRMy35esC7uY0DZ3D46 zgHhdT8%-az{oNS#V@Ux;&T%aSUCO~p0U^=I+$_K^I@iIYh^4%Dao%hf%VoRR?6ix@ z)poIdgu^|*;#I@ zt3n2e+qUSOi&@IecU@1aw_P97zO9+oaB_ zS>E!6{Z`u(V>|trpC7R1K>1-6G!m7UQNM2Nds{DP0tNczS@Gl-dJ#3x0OLdE>hYnW zYEk*&0{*T%oWB>*-;3rNQF#sRUGv0u1N99H$9D|<9UDS_pQ3%^JpQt%Z(7*jarAeb z^1X%j^VRQPsDFt1ofxVfKVioB{)_gmu^;Sw@OtEwk!s3!Psal5amrjXehTv~t74RI zN9Q;BV>OiTkGdD|_v%p1_^ak>QC5xqoMU_YztJDYw|4CL)P%lYD!$^UKa~0DE7-1K zyMgUfY+3o*4wG_b2ETO`cdqTITXp76s(wXHs@}6KRo}KeRezOV+flLd(Ojzj_3Bjp zuWC~7PAp6P^8?iUb@{D7Us%_{C&rh2Ufpa}Ue4Xxkv?bU!;-&W-?+e;pUw}Rs*g?9 
zHmCs5_caFzPfw?Qdbn*AxnY1#8p&;-%P~I&HWMiNdALBH5u%$rqWoDV%2l>k`Tf|2 zul6dw^h8cnschO(3F_|avXXU4uUXr(A`ozF3IY2yJd%DYx9Rlk|7L0-EzjU-cbc-b z>9@W{ZmUUw^qQ~t{`+LQae6xW3aDp4lI?II^|oYzW!mlW9I#=$9ZVFWPWkv=ma?@* ze142xGIA8F-)9pHoRWg((nJSz_rPLsPEB;&?QU?=4eDC^Ii7OYA5U59?udrTDlv75 z2?d+_0TVyGT!FQ`fsqvm=R=e$+z)s|6mN-(Y6a>?X8$rXYs7OVvQ>|L$5Qmg&8>#3 zc68*2dHt378CN85og0BQGotqqCZaZnYZWuC0P$GBfwkx~yET8JYM6-z3!503?ZfI~ zOzkjg232^i4zcbs+WJTEh6fJc22t*Jh%(Ia?F^f5el7!}`mb7I(Q>E>P!6reoHX-% z<3*X8A3y1|<;UaLv>9(;jVS-j5zHZLwi&Vo{8heCab}la0kB%cfyKBN{5Qgp^q# zmTzL;d-(uqMDu=KKm}qKuh}!Qm zVON)m+T%;b;ZX-qpL7T}t~)a=6@KgwcQT&d!USw#R413TSXyEdo7+%>(kP(iQHs|O z^n%grF`?amn4LKPp54?x^;>iLSWfJpkzFQ>-%RHLWKBk;;E*do`by&p;%PuY6(Z&zDP?;G!$uZ`!;x6Bhm22h&R&W>1yfnhCRM$#OK}T zTT9X1OHv86!0p*3h%*J51U1NKOYq79Pp_xjv-EQ6qz7uABz(`JJwB^6lHfbYq)YVe zh%<%iNU$BTyYPUOU=Qg_a1gOlV&9FJmFQ`i`b+R3#GNJfM-it>@Cn5D)YVCX*N{_7 z@Oi|eCHNh~6no0|H*@eG5WDe>u8{;6kx{Vi#8v?ke1g3j>oJ0j@J+-NbH=RU{~!+* zEkFiVm?G>i;x7rSMLdl#gYl6M@@Li;P|@+X^0;Wf3NgiJ*f$|2o`}YK5i5B_4*&_; z=HTzo!OzdZ@j3YCbMQ-u!J=&l%I^eWe-V!mR*U#--dIn@5_eEj{QKrQW;l6=r&>i$b z*WkWZ`fQ}n2F(DyB9lml29gj=paaR9AEYbuI8r)N10=_Q*3%~+WD9W~PyXwsuP?uG{kT6Kl4xsC z({=}LE9B|_vq4N}=yZ^sTgM9i?*3HgsWbeLy{SYl(C!Kf=sKfkGwoWk!Mh*uI2Wk9 n9AIMsY5oY-VJo$J1MS|S$#gQhfpsUpdWb=#BpWGCVKz diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script deleted file mode 100644 index ddda3c2..0000000 --- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script +++ /dev/null @@ -1,78 +0,0 @@ - -MEMORY -{ - program (RX) : ORIGIN = 0, LENGTH = 0x0020000 - data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 -} -ENTRY(__start) -SECTIONS -{ - . = 0x0; - .text : { - /* the __start symbol has to come at address zero. 
*/ - *crt0.o(.text*) - _ctors_start = .; - _init_array_start = .; - KEEP(SORT(*.init_array)) - _ctors_end = .; - _init_array_end = .; - _dtors_start = .; - _dtors_end = .; - *(.text*) - } > program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -. = 0x40400; -buf5 = .; -. += 0x400; -. = 0x44000; -buf4 = .; -. += 0x400; -. = 0x48000; -buf3 = .; -. += 0x400; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -. = 0x60400; -buf11 = .; -. += 0x400; -. = 0x64000; -buf10 = .; -. += 0x400; -. = 0x68000; -buf9 = .; -. += 0x400; -. = 0x70400; -buf8 = .; -. += 0x400; -. = 0x74000; -buf7 = .; -. += 0x400; -. = 0x78000; -buf6 = .; -. += 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_4); diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll deleted file mode 100644 index 678847a..0000000 --- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll +++ /dev/null @@ -1,158 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -; Function Attrs: noinline -define float 
@__aie2p_scalar_fdiv(float %0, float %1) #0 { - %3 = call float @llvm.aie2p.inv(float %1) - %4 = fmul float %0, %3 - ret float %4 -} - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_4() { - br label %1 - -1: ; preds = %76, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %75, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %76 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf8, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = getelementptr float, ptr @buf7, i32 %3 - %9 = load <16 x float>, ptr %8, align 4 - %10 = extractelement <16 x float> %7, i64 0 - %11 = extractelement <16 x float> %9, i64 0 - %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) - %13 = insertelement <16 x float> poison, float %12, i64 0 - %14 = extractelement <16 x float> %7, i64 1 - %15 = extractelement <16 x float> %9, i64 1 - %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) - %17 = insertelement <16 x float> %13, float %16, i64 1 - %18 = extractelement <16 x float> %7, i64 2 - %19 = extractelement <16 x float> %9, i64 2 - %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) - %21 = insertelement <16 x float> %17, float %20, i64 2 - %22 = extractelement <16 x float> %7, i64 3 - %23 = extractelement <16 x float> %9, i64 3 - %24 = call 
float @__aie2p_scalar_fdiv(float %22, float %23) - %25 = insertelement <16 x float> %21, float %24, i64 3 - %26 = extractelement <16 x float> %7, i64 4 - %27 = extractelement <16 x float> %9, i64 4 - %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) - %29 = insertelement <16 x float> %25, float %28, i64 4 - %30 = extractelement <16 x float> %7, i64 5 - %31 = extractelement <16 x float> %9, i64 5 - %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) - %33 = insertelement <16 x float> %29, float %32, i64 5 - %34 = extractelement <16 x float> %7, i64 6 - %35 = extractelement <16 x float> %9, i64 6 - %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) - %37 = insertelement <16 x float> %33, float %36, i64 6 - %38 = extractelement <16 x float> %7, i64 7 - %39 = extractelement <16 x float> %9, i64 7 - %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) - %41 = insertelement <16 x float> %37, float %40, i64 7 - %42 = extractelement <16 x float> %7, i64 8 - %43 = extractelement <16 x float> %9, i64 8 - %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) - %45 = insertelement <16 x float> %41, float %44, i64 8 - %46 = extractelement <16 x float> %7, i64 9 - %47 = extractelement <16 x float> %9, i64 9 - %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) - %49 = insertelement <16 x float> %45, float %48, i64 9 - %50 = extractelement <16 x float> %7, i64 10 - %51 = extractelement <16 x float> %9, i64 10 - %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) - %53 = insertelement <16 x float> %49, float %52, i64 10 - %54 = extractelement <16 x float> %7, i64 11 - %55 = extractelement <16 x float> %9, i64 11 - %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) - %57 = insertelement <16 x float> %53, float %56, i64 11 - %58 = extractelement <16 x float> %7, i64 12 - %59 = extractelement <16 x float> %9, i64 12 - %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) - %61 = insertelement <16 x float> %57, 
float %60, i64 12 - %62 = extractelement <16 x float> %7, i64 13 - %63 = extractelement <16 x float> %9, i64 13 - %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) - %65 = insertelement <16 x float> %61, float %64, i64 13 - %66 = extractelement <16 x float> %7, i64 14 - %67 = extractelement <16 x float> %9, i64 14 - %68 = call float @__aie2p_scalar_fdiv(float %66, float %67) - %69 = insertelement <16 x float> %65, float %68, i64 14 - %70 = extractelement <16 x float> %7, i64 15 - %71 = extractelement <16 x float> %9, i64 15 - %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) - %73 = insertelement <16 x float> %69, float %72, i64 15 - %74 = getelementptr float, ptr @buf6, i32 %3 - store <16 x float> %73, ptr %74, align 4 - %75 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -76: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare float @llvm.aie2p.inv(float) - -attributes #0 = { noinline } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.o deleted file mode 100644 index 75208a26da516d5325930c6a9cff6520d51ab84d..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2048 zcma)7Z)h837=JF8G*@uDGm`R}tZ=KkRGXe{S8H($lQMN5x)5X=#edhdOSPEUk|yOu z(7JTaF~_>0a3Vs#4ZlbNGD1J~!w``n6{8H?hblq+pwPjD2%GVF?%r!(ifj77`#yhu zzx&<$+`aeSd(VwNFG&)3CV?Vk0CpRIjNqU{=!SOK;!j)}fA6``f z>dEEu8RZuT#_45A2R+ze0Gd|(0V1EgRP5t4DU9-dAAl_4X@#rLupjwh6)lw;9hi|mdTic!9 zS=2i#N9}M8`SBgrcN6j6*6TZmdgn^0cMJK(4tnc|KiOLEJnEgNe(T8J>G}PE_%5E` zg;Jz?LB{p{iTrrwc>4d}M?+tVP`@X~TD*^oaicr?T=cq{(V*~*=fbTXSG!5k-ZHo_G!e5eG#>;uI#gZ{}z#-JL}(F>ltG! 
z)rbCy47BMVHtMPJXEIxq67F4bW?I7eelYyO*Or730N>XPzkH^Q^RI^~%NQ;4>zD$( zzA40mA)(-h+s%(KPjA)&j>4>vKfq(+gz)5fXA^S|jy%s-@lXmu0LHfAuWrjHw&j%i z0&dYj*TcWaVT^kfw|<1;b~mQ=N`5ykljHGkLfttMd3{LQ;-@k8_#X)MWug9)P~Q;h z2Yg#P35-3Rd7-`})V~($>q5PN&q<;K9BmPOFYotDsuPdQ$Y*mIqmZ=kTqUNo#>q_WG3H)(b1E~j!b8A*~!AOnN!7F7L2()OS96gj*-hwaDz z)yKkh^hn8c926t$Sb}@cX*?i&JCF9FXC99@^vV+)*uaGLE^Q%N4`Gz&aEN+KnA6_% v%+q@v3!>g#OccD3^k}, ptr %5, align 64 - %7 = getelementptr float, ptr @buf7, i20 %4 - %8 = load <16 x float>, ptr %7, align 64 - %9 = extractelement <16 x float> %6, i64 0 - %10 = extractelement <16 x float> %8, i64 0 - %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10) - %12 = insertelement <16 x float> poison, float %11, i64 0 - %13 = extractelement <16 x float> %6, i64 1 - %14 = extractelement <16 x float> %8, i64 1 - %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14) - %16 = insertelement <16 x float> %12, float %15, i64 1 - %17 = extractelement <16 x float> %6, i64 2 - %18 = extractelement <16 x float> %8, i64 2 - %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18) - %20 = insertelement <16 x float> %16, float %19, i64 2 - %21 = extractelement <16 x float> %6, i64 3 - %22 = extractelement <16 x float> %8, i64 3 - %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22) - %24 = insertelement <16 x float> %20, float %23, i64 3 - %25 = extractelement <16 x float> %6, i64 4 - %26 = extractelement <16 x float> %8, i64 4 - %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26) - %28 = insertelement <16 x float> %24, float %27, i64 4 - %29 = extractelement <16 x float> %6, i64 5 - %30 = extractelement <16 x float> %8, i64 5 - %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30) - %32 = insertelement <16 x float> %28, float %31, i64 5 - %33 = extractelement <16 x float> %6, i64 6 - %34 = extractelement <16 x float> %8, i64 6 - %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34) - %36 = insertelement <16 x float> %32, 
float %35, i64 6 - %37 = extractelement <16 x float> %6, i64 7 - %38 = extractelement <16 x float> %8, i64 7 - %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38) - %40 = insertelement <16 x float> %36, float %39, i64 7 - %41 = extractelement <16 x float> %6, i64 8 - %42 = extractelement <16 x float> %8, i64 8 - %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42) - %44 = insertelement <16 x float> %40, float %43, i64 8 - %45 = extractelement <16 x float> %6, i64 9 - %46 = extractelement <16 x float> %8, i64 9 - %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46) - %48 = insertelement <16 x float> %44, float %47, i64 9 - %49 = extractelement <16 x float> %6, i64 10 - %50 = extractelement <16 x float> %8, i64 10 - %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50) - %52 = insertelement <16 x float> %48, float %51, i64 10 - %53 = extractelement <16 x float> %6, i64 11 - %54 = extractelement <16 x float> %8, i64 11 - %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54) - %56 = insertelement <16 x float> %52, float %55, i64 11 - %57 = extractelement <16 x float> %6, i64 12 - %58 = extractelement <16 x float> %8, i64 12 - %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58) - %60 = insertelement <16 x float> %56, float %59, i64 12 - %61 = extractelement <16 x float> %6, i64 13 - %62 = extractelement <16 x float> %8, i64 13 - %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62) - %64 = insertelement <16 x float> %60, float %63, i64 13 - %65 = extractelement <16 x float> %6, i64 14 - %66 = extractelement <16 x float> %8, i64 14 - %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66) - %68 = insertelement <16 x float> %64, float %67, i64 14 - %69 = extractelement <16 x float> %6, i64 15 - %70 = extractelement <16 x float> %8, i64 15 - %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70) - %72 = insertelement <16 x float> %68, float %71, i64 15 - %73 = 
getelementptr float, ptr @buf6, i20 %4 - store <16 x float> %72, ptr %73, align 64 - %74 = add nuw nsw i32 %3, 16 - %75 = icmp ult i32 %3, 240 - br i1 %75, label %2, label %76, !llvm.loop !1 - -76: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 53, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: nofree nosync nounwind memory(none) -declare float @llvm.aie2p.inv(float) #3 - -attributes #0 = { nofree noinline nosync nounwind memory(none) } -attributes #1 = { nounwind } -attributes #2 = { noreturn nounwind } -attributes #3 = { nofree nosync nounwind memory(none) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll deleted file mode 100644 index 9a0f789..0000000 --- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll +++ /dev/null @@ -1,158 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -; Function Attrs: noinline -define float @__aie2p_scalar_fdiv(float %0, float %1) #0 { - %3 = call float @llvm.aie2p.inv(float %1) - %4 = fmul float %0, %3 - ret float 
%4 -} - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_4() { - br label %1 - -1: ; preds = %76, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %75, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %76 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf8, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = getelementptr float, ptr @buf7, i32 %3 - %9 = load <16 x float>, ptr %8 - %10 = extractelement <16 x float> %7, i64 0 - %11 = extractelement <16 x float> %9, i64 0 - %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) - %13 = insertelement <16 x float> poison, float %12, i64 0 - %14 = extractelement <16 x float> %7, i64 1 - %15 = extractelement <16 x float> %9, i64 1 - %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) - %17 = insertelement <16 x float> %13, float %16, i64 1 - %18 = extractelement <16 x float> %7, i64 2 - %19 = extractelement <16 x float> %9, i64 2 - %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) - %21 = insertelement <16 x float> %17, float %20, i64 2 - %22 = extractelement <16 x float> %7, i64 3 - %23 = extractelement <16 x float> %9, i64 3 - %24 = call float @__aie2p_scalar_fdiv(float %22, float %23) - %25 = insertelement <16 x float> %21, float %24, i64 3 - %26 = extractelement <16 x float> %7, 
i64 4 - %27 = extractelement <16 x float> %9, i64 4 - %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) - %29 = insertelement <16 x float> %25, float %28, i64 4 - %30 = extractelement <16 x float> %7, i64 5 - %31 = extractelement <16 x float> %9, i64 5 - %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) - %33 = insertelement <16 x float> %29, float %32, i64 5 - %34 = extractelement <16 x float> %7, i64 6 - %35 = extractelement <16 x float> %9, i64 6 - %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) - %37 = insertelement <16 x float> %33, float %36, i64 6 - %38 = extractelement <16 x float> %7, i64 7 - %39 = extractelement <16 x float> %9, i64 7 - %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) - %41 = insertelement <16 x float> %37, float %40, i64 7 - %42 = extractelement <16 x float> %7, i64 8 - %43 = extractelement <16 x float> %9, i64 8 - %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) - %45 = insertelement <16 x float> %41, float %44, i64 8 - %46 = extractelement <16 x float> %7, i64 9 - %47 = extractelement <16 x float> %9, i64 9 - %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) - %49 = insertelement <16 x float> %45, float %48, i64 9 - %50 = extractelement <16 x float> %7, i64 10 - %51 = extractelement <16 x float> %9, i64 10 - %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) - %53 = insertelement <16 x float> %49, float %52, i64 10 - %54 = extractelement <16 x float> %7, i64 11 - %55 = extractelement <16 x float> %9, i64 11 - %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) - %57 = insertelement <16 x float> %53, float %56, i64 11 - %58 = extractelement <16 x float> %7, i64 12 - %59 = extractelement <16 x float> %9, i64 12 - %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) - %61 = insertelement <16 x float> %57, float %60, i64 12 - %62 = extractelement <16 x float> %7, i64 13 - %63 = extractelement <16 x float> %9, i64 13 - %64 = call float 
@__aie2p_scalar_fdiv(float %62, float %63) - %65 = insertelement <16 x float> %61, float %64, i64 13 - %66 = extractelement <16 x float> %7, i64 14 - %67 = extractelement <16 x float> %9, i64 14 - %68 = call float @__aie2p_scalar_fdiv(float %66, float %67) - %69 = insertelement <16 x float> %65, float %68, i64 14 - %70 = extractelement <16 x float> %7, i64 15 - %71 = extractelement <16 x float> %9, i64 15 - %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) - %73 = insertelement <16 x float> %69, float %72, i64 15 - %74 = getelementptr float, ptr @buf6, i32 %3 - store <16 x float> %73, ptr %74 - %75 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -76: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare float @llvm.aie2p.inv(float) - -attributes #0 = { noinline } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.elf deleted file mode 100755 index 8162c282e6181e1320623e660451ca081a79c550..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4132 zcma)9eQZJArh@=bU@)Ip==(`s7#rFG!LE2ARN?Lk%e0UzS2wF zR#}D&61VNuITy1On(w=wRqwh$D8Mp`%K2}gnK!^!G+WFaqZ30ZyXktWKuVspXx=_JOfORn#-q0 zN6SU&du#Z+{&@agLVqurD@5rW+PhYX?-uH-*N*Qr`a3;}{yswcnpOOzQD3{Zzq9D? 
zEaiI#?N_VcKT!V|^*c9OK7G!N@%DC6an?~(R3*5iV?V)_E+TTsR* z-}a6#^2aJD-*0uV;qR5vis@I(<)WY*{W&L(4t$|MjBn@UcM>!DekuBlUwlvI7q4TR z!*&bXN7&NxT$@R`w20rzqI+{~mD?`eOH^&CNK|M5`OL<|-`_>epEuq4oY?`sYco}Lc<^l;k*a$^7;G?Lpumt%emY$j0j^Dsr85u%%WqVNSK3RQMg`R!!m zXGaxpawa3nRJLF#1@-VvSxLL3SFNpS;rBas2Z8+(HYA_R?7q14pT*6Plo#=|J5A}$ z$u~boZl6hk=zGN`Y;FWX@%@s?zPjoR(;HDF_SJD4a$owD(rkWaKEezr$u2I3)$mrI~ibI&3!52V>c{G0 zOsz0s230gz9b(rtwDpbS4G$c?1ftOA5CxdyTPe2M{9Fb`_1{)((Q>HpQx46>oHX-l z<3)j*9Y5=|Wyj;#q#18ujVS-rNz5T_wi&WD{FS~~bm@>!0kB%c@%6YDe7DTFiW+J8 za_5BX`ssPe3PMsHV$aJ=?7<2Yjxj#J-yyco;*Ldp7^{`73fQPB8tL`GjAyr6^!S#{n77fmK0E) z%kUdlWbRGNeDWQPb<3l}(}VAJZAZ9mw522mjd2gp?{4 zoAx_+)l;kPw~SKCF<(FOeVw;!=S z{%GIv$lz|p?{8H+-Wqoep6^gkuv3xVu!0IF*x0P=S0p4{y!!^m@9xL@%cfdZ1=W!uKrNGEd)% zIF+l81P2hia}QVvj*vbN2M{ZH_T7kCo}QLzAP=8F+>vL03UM+IpF@mKU7aL&6*)Bz zUq(EUhks_6Vo&-0Y6bosVmH3gHIm>8G77eX*or`c53zS+J;t#SzKxh-PMJ0QFXX|Z z1<1e(cM1EJ@&5>`%XkrC3gaUmN19!cq9=A+UC8HK4fAidgH-P?7L&} zL5<7x!O-zQEc*RO48ZR{a-cOD?1_Zp2l~T9JrSt!@7?POc=WywdWU*qUG+7gtiT<3 zM(^wOzDe)v41ve3OFV|D$FT7lnmU6qBz1{Zu)vP6ZDHrEFK(; zLm-Aehzkq{dxj!`)&m4X$05)gjP?bhebG1&2Z2z$e=uf@B6%PqomE(PTOq?9+&eS9Q4+ M|5ekRhTiD^55T1-w*UYD diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script deleted file mode 100644 index 51c13db..0000000 --- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script +++ /dev/null @@ -1,72 +0,0 @@ - -MEMORY -{ - program (RX) : ORIGIN = 0, LENGTH = 0x0020000 - data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 -} -ENTRY(__start) -SECTIONS -{ - . = 0x0; - .text : { - /* the __start symbol has to come at address zero. 
*/ - *crt0.o(.text*) - _ctors_start = .; - _init_array_start = .; - KEEP(SORT(*.init_array)) - _ctors_end = .; - _init_array_end = .; - _dtors_start = .; - _dtors_end = .; - *(.text*) - } > program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -. = 0x40400; -buf8 = .; -. += 0x400; -. = 0x44000; -buf7 = .; -. += 0x400; -. = 0x48000; -buf6 = .; -. += 0x400; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -/* No tile with memory exists to the north. */ -. = 0x60000; -. += 0x10000; -. = 0x70400; -buf11 = .; -. += 0x400; -. = 0x74000; -buf10 = .; -. += 0x400; -. = 0x78000; -buf9 = .; -. += 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_5); diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll deleted file mode 100644 index e652b65..0000000 --- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll +++ /dev/null @@ -1,158 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -; Function Attrs: noinline -define float @__aie2p_scalar_fdiv(float %0, float 
%1) #0 { - %3 = call float @llvm.aie2p.inv(float %1) - %4 = fmul float %0, %3 - ret float %4 -} - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_5() { - br label %1 - -1: ; preds = %76, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %75, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %76 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf11, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = getelementptr float, ptr @buf10, i32 %3 - %9 = load <16 x float>, ptr %8, align 4 - %10 = extractelement <16 x float> %7, i64 0 - %11 = extractelement <16 x float> %9, i64 0 - %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) - %13 = insertelement <16 x float> poison, float %12, i64 0 - %14 = extractelement <16 x float> %7, i64 1 - %15 = extractelement <16 x float> %9, i64 1 - %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) - %17 = insertelement <16 x float> %13, float %16, i64 1 - %18 = extractelement <16 x float> %7, i64 2 - %19 = extractelement <16 x float> %9, i64 2 - %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) - %21 = insertelement <16 x float> %17, float %20, i64 2 - %22 = extractelement <16 x float> %7, i64 3 - %23 = extractelement <16 x float> %9, i64 3 - %24 = call float @__aie2p_scalar_fdiv(float 
%22, float %23) - %25 = insertelement <16 x float> %21, float %24, i64 3 - %26 = extractelement <16 x float> %7, i64 4 - %27 = extractelement <16 x float> %9, i64 4 - %28 = call float @__aie2p_scalar_fdiv(float %26, float %27) - %29 = insertelement <16 x float> %25, float %28, i64 4 - %30 = extractelement <16 x float> %7, i64 5 - %31 = extractelement <16 x float> %9, i64 5 - %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) - %33 = insertelement <16 x float> %29, float %32, i64 5 - %34 = extractelement <16 x float> %7, i64 6 - %35 = extractelement <16 x float> %9, i64 6 - %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) - %37 = insertelement <16 x float> %33, float %36, i64 6 - %38 = extractelement <16 x float> %7, i64 7 - %39 = extractelement <16 x float> %9, i64 7 - %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) - %41 = insertelement <16 x float> %37, float %40, i64 7 - %42 = extractelement <16 x float> %7, i64 8 - %43 = extractelement <16 x float> %9, i64 8 - %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) - %45 = insertelement <16 x float> %41, float %44, i64 8 - %46 = extractelement <16 x float> %7, i64 9 - %47 = extractelement <16 x float> %9, i64 9 - %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) - %49 = insertelement <16 x float> %45, float %48, i64 9 - %50 = extractelement <16 x float> %7, i64 10 - %51 = extractelement <16 x float> %9, i64 10 - %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) - %53 = insertelement <16 x float> %49, float %52, i64 10 - %54 = extractelement <16 x float> %7, i64 11 - %55 = extractelement <16 x float> %9, i64 11 - %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) - %57 = insertelement <16 x float> %53, float %56, i64 11 - %58 = extractelement <16 x float> %7, i64 12 - %59 = extractelement <16 x float> %9, i64 12 - %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) - %61 = insertelement <16 x float> %57, float %60, i64 12 - %62 = 
extractelement <16 x float> %7, i64 13 - %63 = extractelement <16 x float> %9, i64 13 - %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) - %65 = insertelement <16 x float> %61, float %64, i64 13 - %66 = extractelement <16 x float> %7, i64 14 - %67 = extractelement <16 x float> %9, i64 14 - %68 = call float @__aie2p_scalar_fdiv(float %66, float %67) - %69 = insertelement <16 x float> %65, float %68, i64 14 - %70 = extractelement <16 x float> %7, i64 15 - %71 = extractelement <16 x float> %9, i64 15 - %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) - %73 = insertelement <16 x float> %69, float %72, i64 15 - %74 = getelementptr float, ptr @buf9, i32 %3 - store <16 x float> %73, ptr %74, align 4 - %75 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -76: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare float @llvm.aie2p.inv(float) - -attributes #0 = { noinline } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.o deleted file mode 100644 index 78cc76657161c0618a24de8080e832261028cff6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2052 zcma)7VQ5=b6h1F6Xy+ihh!hDn*aA6*FIMsezE+NG?R+1e)M z=?|?<=bUq_8z@hl(0{T&l7NiRU;QB>GNfWA1NWm!P=6?NFd@QbJmUd)n7nRKPyG ztOC@Mi={c`7pIKVNl6DiSZ4s52LUkX7B)h;}$+8>A2XC8(gB$xVsozb8q!|cr+!6abZgYn<3L7x)G@@j)A9c>t( zF=c>9am~L&{x^PT_747z4q9EGW$Y_B$o9jtF4yn0Y8RGO-^y}8D$Ui=C#B)G2HSU6 zttFoBKeIY{qo)~KKJ}%19%0=uq(56wpL&Fc!)1uVx%cq;@ zT`EQ^m*j{Yh@f6)`IX5B>f!#LD8FVc@$(Y;hh4rAWS3WPRB_zGaT~{Ku$uDeSMK55 z5xQSZbv%CMz7^XYwPL-ytXQ($iv7TsIX3h!wjQwZ2%|W&QpgR(@`;eSf`c 
zh%Hqf`76@bqJL1YrAnX4Y+g#Zzlt-{63+L7;g7zSB!mF?xn}s~Gi8{6Jxp1|Xr8ZQ z3UGW~hzI=w;D!0sODO~a_;6EwVN*V| zDW}vIa6toI7ylv;V%)8`_z{X*-S{JPlHZL>fIC+r$NQxXei~zs|Gt1P3iwX~ zz9!&%eH%Inj6Izh0bdaCZv=c*!1MT=B)Y)S=FxZae!nD~cx1*@HkUE-Su1atW@bFw zH)&3fW^$P+bL`ak3^)}q-k8W|jTchKyQlM+(br*c@Winr&;T z&y1jd>kRpk!r0TI_UOb^*6cA4fN7p7, ptr %5, align 64 - %7 = getelementptr float, ptr @buf10, i20 %4 - %8 = load <16 x float>, ptr %7, align 64 - %9 = extractelement <16 x float> %6, i64 0 - %10 = extractelement <16 x float> %8, i64 0 - %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10) - %12 = insertelement <16 x float> poison, float %11, i64 0 - %13 = extractelement <16 x float> %6, i64 1 - %14 = extractelement <16 x float> %8, i64 1 - %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14) - %16 = insertelement <16 x float> %12, float %15, i64 1 - %17 = extractelement <16 x float> %6, i64 2 - %18 = extractelement <16 x float> %8, i64 2 - %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18) - %20 = insertelement <16 x float> %16, float %19, i64 2 - %21 = extractelement <16 x float> %6, i64 3 - %22 = extractelement <16 x float> %8, i64 3 - %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22) - %24 = insertelement <16 x float> %20, float %23, i64 3 - %25 = extractelement <16 x float> %6, i64 4 - %26 = extractelement <16 x float> %8, i64 4 - %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26) - %28 = insertelement <16 x float> %24, float %27, i64 4 - %29 = extractelement <16 x float> %6, i64 5 - %30 = extractelement <16 x float> %8, i64 5 - %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30) - %32 = insertelement <16 x float> %28, float %31, i64 5 - %33 = extractelement <16 x float> %6, i64 6 - %34 = extractelement <16 x float> %8, i64 6 - %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34) - %36 = insertelement <16 x float> %32, float %35, i64 6 - %37 = extractelement <16 x float> %6, i64 7 - %38 = extractelement 
<16 x float> %8, i64 7 - %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38) - %40 = insertelement <16 x float> %36, float %39, i64 7 - %41 = extractelement <16 x float> %6, i64 8 - %42 = extractelement <16 x float> %8, i64 8 - %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42) - %44 = insertelement <16 x float> %40, float %43, i64 8 - %45 = extractelement <16 x float> %6, i64 9 - %46 = extractelement <16 x float> %8, i64 9 - %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46) - %48 = insertelement <16 x float> %44, float %47, i64 9 - %49 = extractelement <16 x float> %6, i64 10 - %50 = extractelement <16 x float> %8, i64 10 - %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50) - %52 = insertelement <16 x float> %48, float %51, i64 10 - %53 = extractelement <16 x float> %6, i64 11 - %54 = extractelement <16 x float> %8, i64 11 - %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54) - %56 = insertelement <16 x float> %52, float %55, i64 11 - %57 = extractelement <16 x float> %6, i64 12 - %58 = extractelement <16 x float> %8, i64 12 - %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58) - %60 = insertelement <16 x float> %56, float %59, i64 12 - %61 = extractelement <16 x float> %6, i64 13 - %62 = extractelement <16 x float> %8, i64 13 - %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62) - %64 = insertelement <16 x float> %60, float %63, i64 13 - %65 = extractelement <16 x float> %6, i64 14 - %66 = extractelement <16 x float> %8, i64 14 - %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66) - %68 = insertelement <16 x float> %64, float %67, i64 14 - %69 = extractelement <16 x float> %6, i64 15 - %70 = extractelement <16 x float> %8, i64 15 - %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70) - %72 = insertelement <16 x float> %68, float %71, i64 15 - %73 = getelementptr float, ptr @buf9, i20 %4 - store <16 x float> %72, ptr %73, align 64 - %74 = add nuw 
nsw i32 %3, 16 - %75 = icmp ult i32 %3, 240 - br i1 %75, label %2, label %76, !llvm.loop !1 - -76: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 53, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: nofree nosync nounwind memory(none) -declare float @llvm.aie2p.inv(float) #3 - -attributes #0 = { nofree noinline nosync nounwind memory(none) } -attributes #1 = { nounwind } -attributes #2 = { noreturn nounwind } -attributes #3 = { nofree nosync nounwind memory(none) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll deleted file mode 100644 index 5ef9373..0000000 --- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll +++ /dev/null @@ -1,158 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -; Function Attrs: noinline -define float @__aie2p_scalar_fdiv(float %0, float %1) #0 { - %3 = call float @llvm.aie2p.inv(float %1) - %4 = fmul float %0, %3 - ret float %4 -} - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) 
- -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_5() { - br label %1 - -1: ; preds = %76, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %75, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %76 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf11, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = getelementptr float, ptr @buf10, i32 %3 - %9 = load <16 x float>, ptr %8 - %10 = extractelement <16 x float> %7, i64 0 - %11 = extractelement <16 x float> %9, i64 0 - %12 = call float @__aie2p_scalar_fdiv(float %10, float %11) - %13 = insertelement <16 x float> poison, float %12, i64 0 - %14 = extractelement <16 x float> %7, i64 1 - %15 = extractelement <16 x float> %9, i64 1 - %16 = call float @__aie2p_scalar_fdiv(float %14, float %15) - %17 = insertelement <16 x float> %13, float %16, i64 1 - %18 = extractelement <16 x float> %7, i64 2 - %19 = extractelement <16 x float> %9, i64 2 - %20 = call float @__aie2p_scalar_fdiv(float %18, float %19) - %21 = insertelement <16 x float> %17, float %20, i64 2 - %22 = extractelement <16 x float> %7, i64 3 - %23 = extractelement <16 x float> %9, i64 3 - %24 = call float @__aie2p_scalar_fdiv(float %22, float %23) - %25 = insertelement <16 x float> %21, float %24, i64 3 - %26 = extractelement <16 x float> %7, i64 4 - %27 = extractelement <16 x float> %9, i64 4 - %28 = call float 
@__aie2p_scalar_fdiv(float %26, float %27) - %29 = insertelement <16 x float> %25, float %28, i64 4 - %30 = extractelement <16 x float> %7, i64 5 - %31 = extractelement <16 x float> %9, i64 5 - %32 = call float @__aie2p_scalar_fdiv(float %30, float %31) - %33 = insertelement <16 x float> %29, float %32, i64 5 - %34 = extractelement <16 x float> %7, i64 6 - %35 = extractelement <16 x float> %9, i64 6 - %36 = call float @__aie2p_scalar_fdiv(float %34, float %35) - %37 = insertelement <16 x float> %33, float %36, i64 6 - %38 = extractelement <16 x float> %7, i64 7 - %39 = extractelement <16 x float> %9, i64 7 - %40 = call float @__aie2p_scalar_fdiv(float %38, float %39) - %41 = insertelement <16 x float> %37, float %40, i64 7 - %42 = extractelement <16 x float> %7, i64 8 - %43 = extractelement <16 x float> %9, i64 8 - %44 = call float @__aie2p_scalar_fdiv(float %42, float %43) - %45 = insertelement <16 x float> %41, float %44, i64 8 - %46 = extractelement <16 x float> %7, i64 9 - %47 = extractelement <16 x float> %9, i64 9 - %48 = call float @__aie2p_scalar_fdiv(float %46, float %47) - %49 = insertelement <16 x float> %45, float %48, i64 9 - %50 = extractelement <16 x float> %7, i64 10 - %51 = extractelement <16 x float> %9, i64 10 - %52 = call float @__aie2p_scalar_fdiv(float %50, float %51) - %53 = insertelement <16 x float> %49, float %52, i64 10 - %54 = extractelement <16 x float> %7, i64 11 - %55 = extractelement <16 x float> %9, i64 11 - %56 = call float @__aie2p_scalar_fdiv(float %54, float %55) - %57 = insertelement <16 x float> %53, float %56, i64 11 - %58 = extractelement <16 x float> %7, i64 12 - %59 = extractelement <16 x float> %9, i64 12 - %60 = call float @__aie2p_scalar_fdiv(float %58, float %59) - %61 = insertelement <16 x float> %57, float %60, i64 12 - %62 = extractelement <16 x float> %7, i64 13 - %63 = extractelement <16 x float> %9, i64 13 - %64 = call float @__aie2p_scalar_fdiv(float %62, float %63) - %65 = insertelement <16 x float> %61, float 
%64, i64 13 - %66 = extractelement <16 x float> %7, i64 14 - %67 = extractelement <16 x float> %9, i64 14 - %68 = call float @__aie2p_scalar_fdiv(float %66, float %67) - %69 = insertelement <16 x float> %65, float %68, i64 14 - %70 = extractelement <16 x float> %7, i64 15 - %71 = extractelement <16 x float> %9, i64 15 - %72 = call float @__aie2p_scalar_fdiv(float %70, float %71) - %73 = insertelement <16 x float> %69, float %72, i64 15 - %74 = getelementptr float, ptr @buf9, i32 %3 - store <16 x float> %73, ptr %74 - %75 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -76: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare float @llvm.aie2p.inv(float) - -attributes #0 = { noinline } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_design.bif b/examples/elementwise_arith/air_project/div_kernel_0_design.bif deleted file mode 100644 index 11c5e21..0000000 --- a/examples/elementwise_arith/air_project/div_kernel_0_design.bif +++ /dev/null @@ -1,10 +0,0 @@ -all: -{ - id_code = 0x14ca8093 - extended_id_code = 0x01 - image - { - name=aie_image, id=0x1c000000 - { type=cdo file=air_project/div_kernel_0_aie_cdo_elfs.bin file=air_project/div_kernel_0_aie_cdo_init.bin file=air_project/div_kernel_0_aie_cdo_enable.bin } - } -} diff --git a/examples/elementwise_arith/air_project/div_kernel_0_div_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/div_kernel_0_div_kernel_0_sequence.bin deleted file mode 100644 index f2eb383b7eb903e8ad8809b7d44fb0a4d660bfb6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3248 zcmcJQ-Aw~A5QNuneg4w5;eLtEZyo^(@j}krmd|WcZ&8OQJf_lyu+9e|Yukmso`>#0IXQ!}T>04{- 
zdWRz5+iJIiZ@khs7OO7qzIV<49O}M3Vd35Et*zr4+n!k}H6?s5ZHHP2b8iNd9f!uq z2%ELeWFG=0l9+H#Mw;SV&vYiNQGA=m?@TyH^<^fmv5ZNwd1Or5@6ws1SwdvOIT;Fy zFEe2cjS1(dzRbimmN6++jgb*5y%`xL$~=|+C!AAeiZ3%^4UGxssJ_g^HI^|MPc*|& zYb`xk4J-t?`*yLPCTa8!jZ|yI(P^-T#)NY+)Rg;`nXrb&gmYA1X5t#lnB*HUVGWH5 W=cv75Caj?`;T+YMnYhL+nEU|x-i@&U diff --git a/examples/elementwise_arith/air_project/empty_0.pdi b/examples/elementwise_arith/air_project/empty_0.pdi deleted file mode 100644 index a2347424a644d017f5e8ac814673b9061a6becd0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 368 zcmcc1z`)>QtR&b`o)&s$_o|r;3@i+cKrs~{2GJ%!3<8rIPKiKR5Yp2nAV5Jtfx!S- z5XuJWBL`Gy9h?QS4_SX=W@>z9Zen^W!ax}W6O}bv!%~(5CLQVqjndvbXQt@Lv$f001PR1P=fJ diff --git a/examples/elementwise_arith/air_project/empty_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/empty_0_aie_cdo_enable.bin deleted file mode 100644 index cba6b8778c42200ab6ec35c68cb3586f8fb4e055..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ diff --git a/examples/elementwise_arith/air_project/empty_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/empty_0_aie_cdo_init.bin deleted file mode 100644 index cba6b8778c42200ab6ec35c68cb3586f8fb4e055..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ diff --git a/examples/elementwise_arith/air_project/empty_0_design.bif b/examples/elementwise_arith/air_project/empty_0_design.bif deleted file mode 100644 index b22ae3c..0000000 --- a/examples/elementwise_arith/air_project/empty_0_design.bif +++ /dev/null @@ -1,10 +0,0 @@ -all: -{ - id_code = 0x14ca8093 - extended_id_code = 0x01 - image - { - name=aie_image, id=0x1c000000 - { type=cdo file=air_project/empty_0_aie_cdo_elfs.bin file=air_project/empty_0_aie_cdo_init.bin file=air_project/empty_0_aie_cdo_enable.bin } - } -} diff --git 
a/examples/elementwise_arith/air_project/full_elf_config.json b/examples/elementwise_arith/air_project/full_elf_config.json deleted file mode 100644 index eab4fdb..0000000 --- a/examples/elementwise_arith/air_project/full_elf_config.json +++ /dev/null @@ -1,134 +0,0 @@ -{ - "xrt-kernels": [ - { - "PDIs": [ - { - "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/empty_0.pdi", - "id": 1 - }, - { - "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/square_kernel_0.pdi", - "id": 2 - }, - { - "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/main.pdi", - "id": 3 - } - ], - "arguments": [ - { - "name": "arg_0", - "offset": "0x0", - "type": "char *" - }, - { - "name": "arg_1", - "offset": "0x8", - "type": "char *" - }, - { - "name": "arg_2", - "offset": "0x10", - "type": "char *" - }, - { - "name": "arg_3", - "offset": "0x18", - "type": "char *" - }, - { - "name": "arg_4", - "offset": "0x20", - "type": "char *" - }, - { - "name": "arg_5", - "offset": "0x28", - "type": "char *" - }, - { - "name": "arg_6", - "offset": "0x30", - "type": "char *" - }, - { - "name": "arg_7", - "offset": "0x38", - "type": "char *" - } - ], - "instance": [ - { - "TXN_ctrl_code_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin", - "id": "square_kernel_0_sequence" - } - ], - "name": "square_kernel_0" - }, - { - "PDIs": [ - { - "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/empty_0.pdi", - "id": 1 - }, - { - "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/square_kernel_0.pdi", - "id": 2 - }, - { - "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/main.pdi", - "id": 3 - } - ], - "arguments": [ - { - "name": "arg_0", - "offset": "0x0", - "type": "char *" - }, - { - "name": "arg_1", - "offset": "0x8", - "type": "char *" - }, - { - "name": 
"arg_2", - "offset": "0x10", - "type": "char *" - }, - { - "name": "arg_3", - "offset": "0x18", - "type": "char *" - }, - { - "name": "arg_4", - "offset": "0x20", - "type": "char *" - }, - { - "name": "arg_5", - "offset": "0x28", - "type": "char *" - }, - { - "name": "arg_6", - "offset": "0x30", - "type": "char *" - }, - { - "name": "arg_7", - "offset": "0x38", - "type": "char *" - } - ], - "instance": [ - { - "TXN_ctrl_code_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/main_square_kernel.bin", - "id": "square_kernel" - } - ], - "name": "main" - } - ] -} diff --git a/examples/elementwise_arith/air_project/input_with_addresses.mlir b/examples/elementwise_arith/air_project/input_with_addresses.mlir deleted file mode 100644 index f2c48f0..0000000 --- a/examples/elementwise_arith/air_project/input_with_addresses.mlir +++ /dev/null @@ -1,328 +0,0 @@ -#loop_annotation = #llvm.loop_annotation -module { - aie.device(npu2) @square_kernel_0 { - %shim_noc_tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info} - %shim_noc_tile_1_0 = aie.tile(1, 0) {controller_id = #aie.packet_info} - %mem_tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info} - %mem_tile_1_1 = aie.tile(1, 1) {controller_id = #aie.packet_info} - %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info} - %tile_0_3 = aie.tile(0, 3) {controller_id = #aie.packet_info} - %tile_0_4 = aie.tile(0, 4) {controller_id = #aie.packet_info} - %tile_0_5 = aie.tile(0, 5) {controller_id = #aie.packet_info} - %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32} - %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32} - %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32} - %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32} - %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32} - %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32} - %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32} - %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32} - %lock_0_3 = 
aie.lock(%tile_0_3, 3) {init = 1 : i32} - %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32} - %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32} - %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32} - %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32} - %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32} - %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32} - %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32} - %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32} - %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32} - %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32} - %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32} - %buf9 = aie.buffer(%mem_tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf9"} : memref<1024xi16, 1 : i32> - %buf8 = aie.buffer(%mem_tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf8"} : memref<1024xi16, 1> - %buf7 = aie.buffer(%tile_0_5) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf7"} : memref<256xi16, 2> - %buf6 = aie.buffer(%tile_0_5) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf6"} : memref<256xi16, 2> - %buf5 = aie.buffer(%tile_0_4) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<256xi16, 2> - %buf4 = aie.buffer(%tile_0_4) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf4"} : memref<256xi16, 2> - %buf3 = aie.buffer(%tile_0_3) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<256xi16, 2> - %buf2 = aie.buffer(%tile_0_3) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf2"} : memref<256xi16, 2> - %buf1 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<256xi16, 2> - %buf0 = aie.buffer(%tile_0_2) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf0"} : memref<256xi16, 2> - %mem_0_5 = aie.mem(%tile_0_5) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1) - aie.dma_bd(%buf6 
: memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_5_12, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1) - aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_5_11, Release, 1) - aie.next_bd ^bb4 - } - %core_0_5 = aie.core(%tile_0_5) { - %c0_i32 = arith.constant 0 : i32 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb4 - aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1) - cf.br ^bb2(%c0 : index) - ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 - %1 = arith.cmpi slt, %0, %c256 : index - cf.cond_br %1, ^bb3, ^bb4 - ^bb3: // pred: ^bb2 - %2 = vector.load %buf7[%0] : memref<256xi16, 2>, vector<32xi16> - %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> - %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> - vector.store %4, %buf6[%0] : memref<256xi16, 2>, vector<32xi16> - %5 = arith.addi %0, %c32 : index - cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} - ^bb4: // pred: ^bb2 - aie.use_lock(%lock_0_5, Release, 1) - aie.use_lock(%lock_0_5_13, Release, 1) - cf.br ^bb1 - } - %mem_0_4 = aie.mem(%tile_0_4) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1) - aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_4_9, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1) - aie.dma_bd(%buf5 
: memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_4_8, Release, 1) - aie.next_bd ^bb4 - } - %core_0_4 = aie.core(%tile_0_4) { - %c0_i32 = arith.constant 0 : i32 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb4 - aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1) - cf.br ^bb2(%c0 : index) - ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 - %1 = arith.cmpi slt, %0, %c256 : index - cf.cond_br %1, ^bb3, ^bb4 - ^bb3: // pred: ^bb2 - %2 = vector.load %buf5[%0] : memref<256xi16, 2>, vector<32xi16> - %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> - %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> - vector.store %4, %buf4[%0] : memref<256xi16, 2>, vector<32xi16> - %5 = arith.addi %0, %c32 : index - cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} - ^bb4: // pred: ^bb2 - aie.use_lock(%lock_0_4, Release, 1) - aie.use_lock(%lock_0_4_10, Release, 1) - cf.br ^bb1 - } - %mem_0_3 = aie.mem(%tile_0_3) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1) - aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_3_6, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1) - aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_3_5, Release, 1) - aie.next_bd ^bb4 - } - %core_0_3 = aie.core(%tile_0_3) { - %c0_i32 = arith.constant 0 : i32 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: 
// 2 preds: ^bb0, ^bb4 - aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1) - cf.br ^bb2(%c0 : index) - ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 - %1 = arith.cmpi slt, %0, %c256 : index - cf.cond_br %1, ^bb3, ^bb4 - ^bb3: // pred: ^bb2 - %2 = vector.load %buf3[%0] : memref<256xi16, 2>, vector<32xi16> - %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> - %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> - vector.store %4, %buf2[%0] : memref<256xi16, 2>, vector<32xi16> - %5 = arith.addi %0, %c32 : index - cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} - ^bb4: // pred: ^bb2 - aie.use_lock(%lock_0_3, Release, 1) - aie.use_lock(%lock_0_3_7, Release, 1) - cf.br ^bb1 - } - %mem_0_2 = aie.mem(%tile_0_2) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) - aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_2_3, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) - aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_2_2, Release, 1) - aie.next_bd ^bb4 - } - %core_0_2 = aie.core(%tile_0_2) { - %c0_i32 = arith.constant 0 : i32 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb4 - aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1) - cf.br ^bb2(%c0 : index) - ^bb2(%0: index): // 2 preds: ^bb1, ^bb3 - %1 = arith.cmpi slt, %0, %c256 : index - cf.cond_br %1, ^bb3, ^bb4 - ^bb3: // pred: ^bb2 - %2 = vector.load %buf1[%0] : memref<256xi16, 2>, 
vector<32xi16> - %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32> - %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16> - vector.store %4, %buf0[%0] : memref<256xi16, 2>, vector<32xi16> - %5 = arith.addi %0, %c32 : index - cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation} - ^bb4: // pred: ^bb2 - aie.use_lock(%lock_0_2, Release, 1) - aie.use_lock(%lock_0_2_4, Release, 1) - cf.br ^bb1 - } - aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0) - aie.flow(%mem_tile_1_1, DMA : 0, %shim_noc_tile_1_0, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0) - aie.flow(%tile_0_2, DMA : 0, %mem_tile_1_1, DMA : 0) - aie.flow(%tile_0_3, DMA : 0, %mem_tile_1_1, DMA : 1) - aie.flow(%tile_0_4, DMA : 0, %mem_tile_1_1, DMA : 2) - aie.flow(%tile_0_5, DMA : 0, %mem_tile_1_1, DMA : 3) - %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1, Release, 4) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) 
- aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb10 - } - %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - 
aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32} - aie.use_lock(%lock_0_1_0, Release, 4) - aie.next_bd ^bb10 - } - aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0) - aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0) - aie.runtime_sequence @square_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { - %0 = aiex.dma_configure_task_for @air_channel_0 { - aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [, ]) - aie.end - } - aiex.dma_start_task(%0) - %1 = aiex.dma_configure_task_for @air_channel_3 { - aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [, ]) - aie.end - } {issue_token = true} - aiex.dma_start_task(%1) - aiex.dma_free_task(%0) - aiex.dma_await_task(%1) - } - aie.packet_flow(15) { - aie.packet_source<%shim_noc_tile_0_0, TileControl : 0> - aie.packet_dest<%shim_noc_tile_0_0, South : 0> - } {keep_pkt_header = true, priority_route = true} - aie.packet_flow(15) { - aie.packet_source<%shim_noc_tile_1_0, TileControl : 0> - aie.packet_dest<%shim_noc_tile_1_0, South : 0> - } {keep_pkt_header = true, priority_route = true} - } {dlti.dl_spec = #dlti.dl_spec} - aie.device(npu2) { - aie.runtime_sequence @square_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { - aiex.configure @square_kernel_0 { - aiex.run @square_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32) - } - } - } -} diff --git a/examples/elementwise_arith/air_project/main.pdi b/examples/elementwise_arith/air_project/main.pdi deleted file mode 100644 index 
a2347424a644d017f5e8ac814673b9061a6becd0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 368 zcmcc1z`)>QtR&b`o)&s$_o|r;3@i+cKrs~{2GJ%!3<8rIPKiKR5Yp2nAV5Jtfx!S- z5XuJWBL`Gy9h?QS4_SX=W@>z9Zen^W!ax}W6O}bv!%~(5CLQVqjndvbXQt@Lv$f001PR1P=fJ diff --git a/examples/elementwise_arith/air_project/main_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/main_aie_cdo_enable.bin deleted file mode 100644 index cba6b8778c42200ab6ec35c68cb3586f8fb4e055..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ diff --git a/examples/elementwise_arith/air_project/main_aie_cdo_init.bin b/examples/elementwise_arith/air_project/main_aie_cdo_init.bin deleted file mode 100644 index cba6b8778c42200ab6ec35c68cb3586f8fb4e055..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 24 dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ diff --git a/examples/elementwise_arith/air_project/main_design.bif b/examples/elementwise_arith/air_project/main_design.bif deleted file mode 100644 index 27149ca..0000000 --- a/examples/elementwise_arith/air_project/main_design.bif +++ /dev/null @@ -1,10 +0,0 @@ -all: -{ - id_code = 0x14ca8093 - extended_id_code = 0x01 - image - { - name=aie_image, id=0x1c000000 - { type=cdo file=air_project/main_aie_cdo_elfs.bin file=air_project/main_aie_cdo_init.bin file=air_project/main_aie_cdo_enable.bin } - } -} diff --git a/examples/elementwise_arith/air_project/main_div_kernel.bin b/examples/elementwise_arith/air_project/main_div_kernel.bin deleted file mode 100644 index e44b65c166f6fc2f297dc1df988f5b8b540f6252..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 22460 zcmeHPZ)_aJ6@PoT_8qCs?KyV3*bZJN1{-pL3o+C{0(*&DS=5Rg{jAV{f7qXSa4m5RDXR62r#`(}3L zc4v3S-XbFU;YPZ<-QWD?&71d|nSHn3GZvvxU$O0aON?IH{Tg4G$^x~PQ zvrhN3x)i?CSGdkzqYl=n%PQ&u&$dA20szwhV!|lYF37O{>D#5)`P(<89qp^{Tmpuhv7z_i(^Oc 
z$eq<(YN^^{RPL_hXUUk_v}*5sKX;~d`ND*Y<8v`e0Carlv0q5n3s}Ex;pVYpKH$J~ zW|dt3J`bXA8Q}QgbocSYhr3DF%j?)%N@njKw0BRshjg7peqjy&PGSA__5HgS?cIAA z?Y)WoE7q`A!}=@Nw|76FilObi7}^V;y@BAMQE+aJrkcbfdkYg#*)Cd+6U~ z3tyjE(an!jv5^T%V`Chf)J8{h4|s@1u# zer8|giL^ayxYK;%f^E5U{m@nQdaAVq9RPe@+hE}8nc}O*&d%e}EWi|PX=j1sI6e!q zDRB6DI7ThwDVjM)+P`3t_Of-r`R&3r9~^M5tt{5brm}U`?1J)bZ)BWW!FaTFv_{6q z^VfY7tY1M}<%arokF5TCr4K5Z6(^GM~2vuA(!0F)nl zC6j|=GtU?%^!mHU7r~mv-GT0xqA>sNGirUpKNr0>#2CpXFItNh3>gmcjm1g0ghiCQ1 z{(w6~uuD8w?!fxF)jzGQ3V)7%YmEE8uM=(A=@DUS6b-qsro9%|vK<2oPt3ukE984^ z7P&o}FAP|zejpc(=P?#j>7s9sY?`&mcGFs%Tm3#p7kwIq`81TtaBH4ydjYxb9PaSA zLw^=XyPGF1IL2qH)>{4B8L(*l2R54i7<$I(7)C^&wCAF8h2p7 z(D7IA!ZFm+S%Ix%uj^39(fzIi&}x$JY{0qTo=T&Ne(}d^56x!^KR;r$0x`;Yvikvx z?8XSR&sw&3Q=VLM0%t6q--gj*QGqOtieGp_vt5esNM zY5Sp6&>qajm~n;C@iuyl+mq%Kj+fe-+kpOYd-QtBiUZCKb^F!lGWNB#jJ@z8?j^5e z>>s?EvG>1`v3LAAW9R>xiC>Sge;3tk6g;0W<@lrjWuuk%M+%?I7oP8dEe6KZvd_-u z?XwqS>`FBH(ASy^9GuVF zl_PmlMBS}aCvoxf;FM)o_E-e=STs%rV}r3VMRu342GQFAnRiovU7#5ZkH@0F|KaG! zlh5W-)84Psm3!;t+bfwu%5>9oJ)nQshV#YzpnLZ4`UNZZjUtMWrt zoyZSWbs|4h)q%5%hO2W`eI1W&x77!Yu;CF-dBBVjr&;5T(z02 zHt(x~`{~?Q_47QMyTx-=eVxb;RqaH6sHy|!77DXMb4K~T>VAf+b5(sEoYvQ66?L>< zzuer+RhzkLGgocqs?A)r`3ykw8Gz<904yzNJ_GRo`V4@d=TVX3xvIWS-u z;57~IMR4(~hU9?r8W$4eo022@O7_!KXENS%c4M@Oceh z(co1LzNEox8r+NEfo}faG1R&Rf8&Vj7JN^YbyMSxr^bsK+|}R{8hlEFPiye92A|X5 z^BTOO!K)g4NrTrkxEH|#-Td$LYTbhG`*GcZJzLgIjXRziFKTdCgHLGiDGffY!OI$a zPJ_>D@QMbnYVaiuUen;72DhM>NrQt(_Py1(OX+N;XpAGzxPNdXsG+PDF zY4F?#3*en^m=_A3+u*S&EOGz#l*xY)0MNdp)B2SI-+d}s+!}dk) zD)_>0SSlrqgW$Q+*yxEI2f~dBfmCj)UN>7l;0afv}(Aw|>G;K{EO8^`f|VhC7P4g?`@`wlALR7JT~Ymu{2& zJcM0+=#L2E#a$`k*MGk_!1)a5j*}57t-6NI8KVOrxj%y+`} z(?8YP7mbHoyOr13BFCwLaNJA{4jR0T+*u3$~+wPWw=6LhAZ?{@=;$Uj`}Kb)R*B3eU*IFm*Mm+yxvh?C4ZOaf1_*xD2Mr- z%Z+cE5}xJ!&*M+e!+Z$%GX9=#=W3%bC-I?&%{SaJB>O{#k7K2T7dbzQgNE;NKI$de zF*W=G=VOp0epbUD;d~sO#BbB^hrKk5zaq{IkB$rD%y31V1-~lA^F5!hh%@73axVGl z7(K#{gkzkQIL29tW1JbTh_jN9ab`HaN&RA+m3)jd!xeE>@+~jTR=O3j?=JheBF>Dj 
zh%>{HAB{7^qj4U99>JH!g>h5koR9z1K;rMx@J-IgC#xm?h=!l#{PqZcSi^7Q{8-#H zeAC;Cp}}9lM(o?mK8{tApDyRe*1Lv3#`*E}qP9QE`LTJfv7hC9{6Z=DIjP}~bAD`| zYxt(;-}8?9)3_Hd^*hY@@%X6uyEq^JorYv*U|G%2a(-8Yf2)Q+#`zl~{BaF`ob%)H zQTt+9pu`7_M<_~TV+yuHh6ewOp&@l^B2I6odAHGiD*nveSfvmcL-nm@t$@$stp z4(G@A|7A75;O*7;fie`&GD(9<#D!_}}fRGjgfo zsXqq%JcK7jxt^Vw*nX|j`C3p{kjBsoMOZbe&uim$hxYCDKH0 zQ^m$f`emGkCbA7kq>0=nX}_{2+{Ss+qzvIw#ginl3Ae!+AfG1EL~c_>3rYHwHIZ#3 z(?o8Qv|m{hZsWXZLcIz~6%UidCTKyAs`6|Zr2Wd8a2ub5CjSM>GL3!! diff --git a/examples/elementwise_arith/air_project/main_mul_kernel.bin b/examples/elementwise_arith/air_project/main_mul_kernel.bin deleted file mode 100644 index 48ac55f27234b0be60abf7239927bde8dd3fe95a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 14460 zcmeHNO>87r5w7-h>{)PjCy9v$uTgq`gjSIcV{9ZNS)!dak_ASZ5Du0&V7PGMU~;b$RHtn;XdM!!=6S;4xG>^r<_(!fO}6)IjmUfy;raM)q7Rm$11+8_Q{^^ z{_3lG)%8{X&OlKv_6vY1{{M0tpbxkbjGkYwZuvFKSi>!E8$7&WZ43Bm{zKdF0vlXl z>kGCnvzu^^8)X3#fIT?BvSjBsA797GKmBg6f@U2p|9~ZUe*lwN0Ttlig8_^_LC;Sf zKR7HtzH$@by`=|-@bUL*fEW7jzw@_ZI@qwjCKkYO_{OVlZGBX}xcJ}G@BjX#)A2uF z`uFbAPkys;`W1lFKZFGvzFPN(f2^+n?C+khp8o0=U)wt}^X=82F5;{JIK$81Uj9*G z=WyHi-o6@a&vu7D{oP`5)8vnKH;jF<5N*F)t%G6t!RIz8^LO!Z{u(xEKFofx_!91A z{|xu?9>5*rHS7cJGy2z9WdPzoI7ah=ne+57A6#2FZvNhz{QC3LpFLVyUnuJy?H~FO ze+V!z5zQEW>%&eY82;&lr6JrQ>xTuv=a18VGJXsAFZGjT^Zg{*d_PGxs5T932d2X2 z8$8^wxXB~UMYq8Petp4a=5^y6&wG3FEcdqS8KbqvtS6N86uT3qgJ z-3Ax<^#xnk+dp{T+s|}wyPh#kmG-u7^Zg|G`F@gYK=<2H|9V^KXV&d#Zum98`>%Ek z6ueUKk%HF>K34D}1wU5s69vDo;AaXxQShmP&lLPz!J7#8&qDrQQr4}z&-6Xi3e;@0 zt6Vo3uN3}B!D|H{EBKLuA1nBYg5Ov0GXeijJ zxo$FEH8MU@@LIvg3Vx*E#|nO;;P(~$Ou;7#K2`9Uf}bmRqu>KrGm@CM3+y_DcX1Xs zHl~ZcyA%0_YDdI3p{4j1H@1*Jrt^6b;dOMLH-2V}6n>@f2MQk){+P~FX+HAxV5IP? 
z=zQn0#}_jS9~AzWj=waYi_d2s=7Yi?Q(wjWTIsiu!mkwmK;c8gH}l&u zd(*u>EASO+G|skSCeD6;C1l4C)GhdKlt~UK2JYON$(D}pXg^&Ga^Njle`;Fxv=)3TL>f7=2^zHb0`gZ(0 zeH}kvU-3R1^|iR5uf+v@MLy~);;65PqrMgw^cDH2uf@$rc)X*&B7diGPn0hJoP_+9 zdHbZv@ILXsgI}G8eCYTb|IEetX7SoJ#)s>+-|&pV;@>G8jz)%$h#%vi@N43uUMx@0pgceqez%NOcw zapX^=5FdYX!{XeY%ltm^@uxeC|3ihpPyAOR{(-_jAbwgOIiIq*Yx^hF z*Wq#hY!M%SOvCbDo6Gz@@zZ+B{C(o5^^y4p#83BE<{uJ2Io_50%jPYmJ`RuT(g+&d2V**WKwUKOPQ`mVhFZ*20{OG%V6WRQ8 z+a@9oPi^j;D*L(fRcJHocHqxnM`pz&!^adi-9F&*UEHud&VdsBFB&T!b#`-)*lx`V z_sse9^V;0ncu^8Kug@K#dEL9Vr=Qnq*9NC~+n)HyX$#oY@S4-lr&f9!9e*ciaA{z) zq6ll#X;OP{yPV>upE_TY+WNL1S(dh8HAUCj?i$a1a_8lm@ECcTaG%_Hxh53j(lj~g>W78Q%@L`= z;4$(v;XYH0mea3X6CNW^6Yi5cFV}=(T$(1+FF+F>BTp0VlUp0E36GJd3HQmJmuo^X Iz6eeJ2joH%+5i9m diff --git a/examples/elementwise_arith/air_project/main_square_kernel.bin b/examples/elementwise_arith/air_project/main_square_kernel.bin deleted file mode 100644 index 8ba56366c72a88bc1322e4d03447a2fe49afc7c7..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 11048 zcmeHNzi%8x6n=ZRu{U6n4LYKYW0Z9gp$J@@6C(vF5_`l*78v0qO(Y5o4Go1!MaM;0 z0U?B38YCn)^e2iGsa8sAbV`-Vx&;1!b*dB*=FQBT+xhj(#)^wb8OgV^-}~mxoA&0$7C6;Ym5aczFvafAmp1hiVQU0;F&rnV&^DKn}N47z`uqz{A};*P{oAPXl~E zdFMJj__zQ#l-&OIkEl%NLN=|J zkPY$+0+tO!vGkd4&r4jy;m5+;V2^Q0z$WHZ?fgsa%0#@V+EtnmOW^bpvT419Y+5fN z8|0S-ESs<0p2Wl2LArfK;$iLbHrQia60p_Uz4}tSGUG3*c9kZ?5`69QHm#S?IIWkE4d}cZ(xm%dT4(&#N%rW$=Q*R~USi!PgkP&)^#jzQy1J1}_g*>>H+cwU9^E`t{gzQW+E48F$TeFooP@GS-(FnGz}Lk1r)c*WqU zJdMe3|JLV}I;E+t(5ef}h!TpY1ukFW!e;Wjv_D4?ZhxhB0 z;e)dusb~PYq<7zcc>hQlJ~;22+P~7X??0^blG{(;e5k*HLsw7^~H{t(Jeho((~)lF@x*p44PlZJKHc`&HKN2 zXLTr!9u%(U*QA)9=eoFlrl)anRc*S}6&&< z!bMOfKV|qcgr9KunZvIWF%7c55@71P+cWD`^O<@T1|RiM^IKs(Sp2rZXX<70Q4h+$ z)F-2dr^Py^LpH3h?ppcm67JUb65(#WE;9T@1}_+VnQ*sWOAH_NCgXSYfO?a-7k{Nr zh=&0BrhYm*C&d2{pfveV^KJaG^7C$V>ZrwslQJGul;!_iq;ND^c$fGt4h+8_ zKCajD6Epld;$z+{e!}oC5+7G*@uwJmTD9fAfj_la{9;Gp?z%9~k{`&k#F1ZG`FyPS zZl2G=NzJ$B#k_@a;^TqB;$LL=G4Z{1G5m!1uQ~n~82%LTk2?HI3_q^UU}^yHXFV(a 
zY)9c}vhafVo(>uQ67l`_OW6M+@$sieYu=pkzf62jw+#O}@xA;o{J2ujR{VIz&s?lH z7l`lYC(OS@eEccX@^f}4%wHzHzuqvvLwtX|VSZd)mU{5(rEpgd3&h7CNv-&2cEbE+ z;`{N3`5of>@rU_wb&ZK%;comL;`{N0`B`;F?gOtn>z&(Cxa&VBzW07-_#NWE?)YC~ z_%p=!_8*3yRTr4}74FBca6kSqKdUY?{uQqMx3MEzg_##koHwTI-%{c#lD=?mc_uM` zXVK59#c1(o+myD$Hl>@eP1W4AiN0+34;Ed~5y9B4#9JAsxZcDU$-}GJVjRC$>)fyH z;&WuW}6d27^=PQxYtRwviV5NKs*+w%OM~GhEzHO8ndrXNQn#^nGhI|}Y zZbVddE3w-o#(IS4mG*6;+}LBpU6Xn3+>nn0%T4`G!|>!Po5U!%MzeQ-M!DJFymoHL z$ARUhu+~-Lf}cG%wvUGU&A#Tfb3^kUSZ@0JiqX&vc5NTca$}EaS`#}r T*gl%&#vZf3+>npwk(>Vj59vP! diff --git a/examples/elementwise_arith/air_project/main_sub_kernel.bin b/examples/elementwise_arith/air_project/main_sub_kernel.bin deleted file mode 100644 index 32bcee0ccb77c74fa42e783e1359dfb1a7806953..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 14396 zcmeHN&ub*t5w4zTOXFm`J9d;X*;R0_cL|HpVPq+wfWR%U0wO3dj&oSZ!JAy{Lk_|x zALX!+5JK2XKnaBC639{Hki(uD>~k3W2Q0Fa+*>~Rutcf%UcK&D?^S7^7wfX@gPHFB z>Z^Ly^;Q3zfudX*6#xx>wzmLAfGfcm`t|CjU$cx=-14@`qZ`&Xz;E*jZPN>EaDi@8jg3|9DtIy9S>DOyCXNv!53g!0GuvCvbv(xLyn% zPGI&K#`yK}!@I@%do{odqlX{0>eipLxo$FEwK6_a@LIv=3Vu() z4;1`R!5=92v4Wo{c%$G81wU2rR>3E*Y9ujl3)poE@7gSGY)lt>Hz)E9)sBd7LQC;2 zZfqfcPUrI?!t3ZfZ~V-dDf~*|PZT~V{5hSc(tPCW!A#*-(fQ70k1tvjJ}CS-9e-&) z7oX2!eia?ZeLgNu%m;-(r@o5$wbE}hg)TE_q(n z@#=^L@b7oaE1p+(Jln#I``^Eo^u`Tfg`PnSA?#=Wyc5@_Bz}zh`_vd82hTWRe1Gri z`DSUyVkff>o>!A&zuD^+C`}avPJ`UbR@;bPAJ>M+tSlr00gXh)c z*e`e;Jnu<$xX}d&^Yp(p_&hzSq~8l-+`l7^@p16`LKxq_H}-t9bmLk%Gb{G$gD=sA zQ9s_f@A(xF-vZYJHo(q#{gLOxFyI%z>)_Ba?)0Abxq1qTHm6K-OJaYhP%g>Z9o5%MPrf0gj>M*L~S?+bAatiBdd^gZ7R_3HVG zUUk67e(3oFxgQvR81NOnLO%9`i*Hp^`r&bXAIo6}>g!ioJyBna$NFv)9_zKG@OKrw zR`7koW4-niKK7fo3&V;_-fAoh>fZ+4CD#&o3NbsI%qc%{j}nDz5uD497Z)IM!LjvCbA3>MZiH z&KAcnHZRs${7r>FCVo;kgG#15kIYu%-<(|y1z32F7cD&UCF;}-%;x0@VGuB;-~v7^H+$U<}dSS?M-{W6^?iD zJiJBxG#?iy&VQTuX+OyPP2#7=m&~6N|C>?&tewgHyTr#oh_Qa#Q}}D`hEgAg$NPJW z_-Q}Od^|tc_-TD){+#&f{>uD{_{sVIOy;k(ca(e_9`Emz_-Q^eACDIsKe^tW$^6y! 
zroGONBfVy44o~xOxS+3iyu>Qie~0_{ zLtL?=#WRB$#tVxY{-*@m!fGSg%C@lcU|#mQn)%Uh`zEsaaN7nV4o_|FoGSad^Hpdw z>vrIuR%d3#B*RC8n{FR)`4Mhd9_K)5{w#aJSoyTKn|s7=Yf-po&ZnQ(=GMiFa=>|g z?hwuE-nBjbyjHs|IL+Jj#HUVMz@~;*oqj&G(%b0xJ4Ay^1EUp1Sessx+I!n&gWrDY zd`)WW+kIte!hITa&Y9P8I!$;C$kK%SJVu@-+$VQlt_j7sG)+ua{b+DCr#8KZS!?)amL}ZCY~{?$HQ_PxG~qtE^Kwln z#-(Y}pV0tEZjOcf5{q-OQZ&C%Ar`{FT7 zZ8}qQt?jSz+$VQlt_hEkrwRATotJAuF)mG$!@ho4$lM%}8VnvIPZRFbV6>cm<(lvq td75ya+7DIJ042kN406>6<}6eA>Btx)wQyxiPco%Md%H$e1Z z*Xk_&@?rwd;kAq3cx#nWwO1b!MJEnxG*S)rcmBQn(x=_a&7Zq}uXmlfG`jpa(eme6 znlOwGCt>tOvX5x0zS+C{{yRq|=lHzIevp(*e%NE|Bccj>po*O$Syocw;&WV-noLJe;iIpj?Wew4DH>zhY^Ou)tBQJ+S&d=M^59jTHoHyNn zJ3o=>zm@HLSlP~pl__o%=B-|sx4cXnc{vnve%?+zoVN#Z-gN)%{6wbzR<`qDWjh~M zrg)|>Zzl`$mX~QGFNZ?T&)X{x=k0-3A`roy1<(Prvg`~-{RJP~d4cCo=azRlQhy+LhRUXinfDA%#I;Zr zd@T5i;7RaJ%$D<$zXmKHi&?A)dlGyTJ+$_T=xtT-vEVC$C&4#SE9WQQQ_IJ)p2D6C z-_7?iv+VcH4lmTqI(&L$1ZzdK`E1ZB_gVW`*(c7>px;gh2X?Ygw13qnx#^FugTvhO z9^A8e@vK)2ws{4R8LR7{pesY>tXRkQbzW?{J-_-M^JDvj-R0Kq`6>VR$Paofou9@Y zWX?Un=Oe~Czr3z&KRc`ZoNjx5qkGJc?Gv1uU%Th0{NE!#=&^Ku8oT`TwWRaQ>&o`K zsLIdjw&!3fdOFRv@xXFMuDr*rw?;e=Yf zfx6L=l?`p)lDqZAZXDRb4>Pc0f<0uSUjDOf)whxlq&=04z4x)&RoqatPpp6cSkY9; zMf=$LFJ0NyKi6uk6LDa1t*Qp|231GMj8z>Ax-w)uYf-wepzDEb?*Wy9t{?IfW?v+9 zJJg@&3u~A9Z`F(S)b_j7Gbz}kU)tW^xnJAtOU8bedKT@`FZCFUXqSH7O%_f4IZ0c8 zg>C&OI(y01cpR9Q)lEXSy-5nXX~?$5Qw3cB+4hEnZio8&d#t~Yvl0{=TSxPKWyYOqgpbr<+Ji)Iqes zbJ5ercOHX7@XrygF;AM;_;!h&@t!=Wcla3BC+Ii%sxBI>^7Ps5qncAVM?ogU|Xd_$lzNFV21vd{%Q|AA-jt zc;$aq@YCS4nhQQiYr;=0i)ReHi^uU3;PDfM@;}sedy zI6pz!5cxJlJPrIFqy6`{9UsEp#p(Dd@Gc(5PlI>oI(`;>)_37QNM}SmGg&+#cz3Sj zd%!#Yj<2SJzUX^4!{gwcpTUfu05AHU@nhgc-!r}m{!vq(k+$lR&vjY0>mMR!jRXJy diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_elfs.bin deleted file mode 100644 index 9f902fee90c41225fe0fdf079a639fbbc0bb5606..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2656 
zcmeH_KT8}z7>9qex8ClBnCuCWK@!-UBv@SGX=S&kSBMglm_|sG*w~~n79m}V2wH?R zlCTznjs23sz*PY)ti;mlP{{|dvndHhs^2 zC*7(5JmFG=RyYpYrpvOR^Him$pJlRup@8{}RfX<6BtDr5HMjx^(877eGYokC7`R+0 zg}ACyn`u(>eouT&|>kruev?*7 zzAWz1sr@~=R))~&MGn^ruzI`OyFX%K!bF9J0*<+oLN^mv#xN4aIV0|W`U1+hEAUOjAcH~4t(reAGHJA zDQO(w%(}vvb%is8Zwh>TUEpJ$p_6AmW0{Xj10Va=N9_Q2N*V_^v#xMvT~X`??RlZC diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_enable.bin deleted file mode 100644 index 7cc1818bce4d6ce1226fc5fda519967a4842b99e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 104 zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>)) B1+xGE diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_init.bin deleted file mode 100644 index d4549ba181167c6e6d7475f23335e34fea2c3376..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6032 zcmb7{&rak<5XQ@%hINw3u1Uy>EJ*QcI7AT#M?)kINHH2BaX`vMPg)mN>ymUudP@3&^d;#l($}PKNY6;mN#BybCv6+u_35mqPjyW9=@;sg z)`iu&Bwdl7lD;5)N&1TPHR&7DGtzU?x1{e$Thh7gr0z}C-Y@95Kj1=K_e*vRasT2A z`KGw%NtC~$_drR!Af6K!;#1rYonOBXO5z3Gubj9LpW;_N?bzRq9a$@;UpUZ4Mwdf~w*)RN!T zTGw0O+2D~L&k!+PZVspqp^>qxyLCZUYYv{=)g)M&yk)HtOi1`aSl`pz$Z~{VHC> z{wm~8zhkS{>@Pz8!5=I0ho0@PV=wrGo|TPu3xmfX=5+8#k7tP3R^-WgJe!Ecz0Z3* zJBWQ9<=Kinq2IpGzOohbc{t*JyYq{${(e{F$2@C)+CR@F|Df@&=r{W@PpQw>ig|vl z-i6*{KG=BId(6k{cs_@S#l1i1@f;%-&#>z8oFW$Y{-no)v!Lz|&O*DNXg+CuAD%p) z+W++V)c&W>r}jU6KIt>&-QrcKx@>wrfeNOYm3=@d_NZ zw|+x>4;)3bJR^PpjtW{nB))G4Db?$G()v+vO7%KEZ|nOoNU7cwf84d}i)35nL%m9) zUZqj5n{~S$^(u~f-3(ifdKE{#Db=fZZU-ih)>n&xd&hVAz^Pux=TW^Tu6I|q)q2#U zHTbJ2{~>V${xaeR#53^EBfd|32(IcOZtO_bqxD;h5z6W6t-#g!Bz^*}&xc-r46f!) 
z>of4z(euuUpMtA-6E}8D`NwMi``~*2njeCH5^6!G{ z`D%{yOZ(S+3a%8O|3dRUJE8m&wZAQJonLdDk5s*%7Mkzakr_t)DHkL4{@Vhl z{wcl*uJh~l1-L$+%*XWnG36iY_v1oy)KAZ!`1S4I2o|Ro1qz~3EajN7i78>jEaZQL C^4EL- diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.elf deleted file mode 100755 index c7c58092079ae00dffa136a058d3c0d77b6c8d5b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1672 zcma)6O^6$17=9)jldSlYRxE=*FsvY}tl3Eut?6lYNu}5*= z9yVZ6LGh-_4LK~Lh&&{Fz>(o59Wt1TGRIqg#%>my<-)8;?Yx2! z(8TTe;w1XwbHMSNa~kd}F>-@Fua+@l2MzokmZ2Z9+tJD4hH`6u0r57aSK}=uNl?!iCG~4# z9=N!2`y{C+OB>GDD#EibVqS{{+lm%Ho;|U}_%Kb~9 z)|Mby86NH}-iPXw`Pkn;E$zqE7nDZ`{5UuI^xkm$pW;$3Z_K6pV@;Z`CRZwxLTy~1UJ}%At`$`*%7qix72w_p(DH*_ zFFdccT1~CmsOS|PaN1U|t?hYXu|!c*-awy^N`TTjGPM zepfW1aY(%_f^WJ&>OUfHa_$~k;-8Q;4uR;;$(jhFOg$uPA_&nxkTns6=%2`%2txEP zWK9Gi`ZuyBf)M??82RKME9*HHZA|rFq6brrO6={&kZbIC0gdQ#C$?sdS|WaTW!yY*Sc=M?V-|IUDeHM%4f-D>q6k_aKgA+aU43f`hjEG-nJi9 z;dDdK)XlT7`&;8jRV*E*8O2r@!;E@~W$sz+zGpVqL^``LJC+}qe&ELt3#Joy!zil~ zziEY`bvg4SVJ~noE=q%A?ao#vLi}9EtMaJJqc*-yLyqnLKw*-M z#)w-_^?DBYAhm_Ec_g2q32smf7?f|e%)u`N@16f~4*A|^xX8e{OpvoBEYP0KJ7C`5 U|B?4{S~EyvyKE>uvw1Rp0HPHLbpQYW diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script deleted file mode 100644 index fc4f0cf..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script +++ /dev/null @@ -1,72 +0,0 @@ - -MEMORY -{ - program (RX) : ORIGIN = 0, LENGTH = 0x0020000 - data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 -} -ENTRY(__start) -SECTIONS -{ - . = 0x0; - .text : { - /* the __start symbol has to come at address zero. 
*/ - *crt0.o(.text*) - _ctors_start = .; - _init_array_start = .; - KEEP(SORT(*.init_array)) - _ctors_end = .; - _init_array_end = .; - _dtors_start = .; - _dtors_end = .; - *(.text*) - } > program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -/* No tile with memory exists to the south. */ -. = 0x40000; -. += 0x10000; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -. = 0x60400; -buf5 = .; -. += 0x400; -. = 0x64000; -buf4 = .; -. += 0x400; -. = 0x68000; -buf3 = .; -. += 0x400; -. = 0x70400; -buf2 = .; -. += 0x400; -. = 0x74000; -buf1 = .; -. += 0x400; -. = 0x78000; -buf0 = .; -. += 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_2); diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll deleted file mode 100644 index 19c8134..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void 
@llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_2() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %17, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %16, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %17 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf2, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = getelementptr float, ptr @buf1, i32 %3 - %9 = load <16 x float>, ptr %8, align 4 - %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) - %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> - %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> - %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) - %15 = getelementptr float, ptr @buf0, i32 %3 - store <16 x float> %14, ptr %15, align 4 - %16 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -17: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown 
intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.o deleted file mode 100644 index 43b8281695feaa671e97fd88c5e79951f78aeaed..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1000 zcmaJ5L(5Psh7b~lwOX~7xET}-o3PZjJz zM5GrbM=#!c>S0g)1=3@W75oAE2dvNJtLYvbGMRa1=ACEWNw!v;J4z{JqM#KJa4-dw zn7k1eTt*cqwGwJ$pYJ^#yEHX<#yTvi`FT97K@;C*%6#o>J$nAY|-zpH({QN!;g z)`%EJYa#KLtn%5)->CC>E_(Q5?pL$werQDJf#^$-)L5`JMq6(#wQ)p3 zzAIz}$lwRF&s5Bj;O*dC0UtDt{-=sslo^;$MDn{+6L`MHDf3A=>nUzIc5A=0ffPW&Anr5nFkzgh%vu3fLE4Wq}gzrLN7`Jipw{ zEJdqRpHAQx6IfuKdlV6~T#a4!0MbnkQF4TxCI~BQ2ovf2@ xkV#f|ivd}eohEehuF{wOGLPhDsj~LMal9*nLasrdEpm~%9U>{8rb<8I{s-KramN4v diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll deleted file mode 100644 index 0eee48f..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll +++ /dev/null @@ -1,72 +0,0 @@ -; ModuleID = 'air_project/mul_kernel_0_core_0_2.peanohack.ll' -source_filename = "LLVMDialectModule" -target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" -target triple = "aie2p" - -@buf0 = external local_unnamed_addr global [256 x float] -@buf1 = external local_unnamed_addr global [256 x float] -@buf2 = external local_unnamed_addr global [256 x float] - -; Function Attrs: nounwind -declare void @llvm.aie2p.acquire(i32, i32) #0 - -; Function Attrs: nounwind -declare void @llvm.aie2p.release(i32, i32) #0 - -; Function Attrs: nounwind memory(inaccessiblemem: write) -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1 - -; Function Attrs: 
noreturn nounwind -define void @core_0_2() local_unnamed_addr #2 { - tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %17, %0 - tail call void @llvm.aie2p.acquire(i32 49, i32 -1) - tail call void @llvm.aie2p.acquire(i32 50, i32 -1) - tail call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %1, %2 - %3 = phi i32 [ 0, %1 ], [ %15, %2 ] - %4 = trunc nuw i32 %3 to i20 - %5 = getelementptr float, ptr @buf2, i20 %4 - %6 = load <16 x float>, ptr %5, align 64 - %7 = getelementptr float, ptr @buf1, i20 %4 - %8 = load <16 x float>, ptr %7, align 64 - %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) - %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8) - %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> - %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> - %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60) - %14 = getelementptr float, ptr @buf0, i20 %4 - store <16 x float> %13, ptr %14, align 64 - %15 = add nuw nsw i32 %3, 16 - %16 = icmp ult i32 %3, 240 - br i1 %16, label %2, label %17, !llvm.loop !1 - -17: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 53, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: nofree nounwind memory(inaccessiblemem: read) -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 - -; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 - -attributes #0 = { nounwind } -attributes #1 = { nounwind memory(inaccessiblemem: write) } -attributes #2 = { noreturn nounwind } -attributes #3 = { nofree 
nounwind memory(inaccessiblemem: read) } -attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll deleted file mode 100644 index 7de74b2..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_2() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void 
@llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %17, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %16, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %17 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf2, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = getelementptr float, ptr @buf1, i32 %3 - %9 = load <16 x float>, ptr %8 - %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) - %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> - %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> - %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) - %15 = getelementptr float, ptr @buf0, i32 %3 - store <16 x float> %14, ptr %15 - %16 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -17: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.elf deleted file mode 100755 index f1be4eefad7b89482558fe9d6e292826c0748801..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1736 zcma)7O^6$17=9)jldSlYRxE=*FsvYJS+g@u>^42kE~$vk!g^2^JZQrV_9w>RX<+chXiISt)Kg|b 
z{nDHTUR%EV%;567&(}7jU(Gz^1`YsrXoioyql|jQ{7U%56u-5sHa^QKbFzNMV@Q0M z`4>N_EkLqTJltEn57o!>vA>CO(vPdpDUSkb;T!Jq3V20?T0pXA)|=p+A<5tGFr1N| zgRieYnd#jBerEXbgTdBc`Gri@oJsb_nj~M1uT&<5TD>v3B&c6q&#Rc13ny+Uz=LC; z?gu+wcwVd5Yno9l=_MU-ns%_I?RsI?ZwHB2o8OM2PWSxc;+7w6_ZlUq-CA7tn|`pj z*lg~$UbcNtYm^;Vue#;3XS-#+dbV=T^_(TQ;+Bm@xlwV9s^?ZKo7I=%HB3t`XN)Y4 zj-Mi18+#NyOM5uQU!bjxixf=mL{9JMSbhr|Q}Gqa=P>Cz2?ZBvBO4!25xKaRIpJ?g zyg%V@3nw%Vsc#73OBYD}2hk>$G~WIcysm=pfH@cx9~6P}HK;gWwO+??<~g!d;LrO4Zh zAlKgV0vg>*t;lXrZ+D|Gw#M3acC4=dzSrgZs=u{b^X;bRM5}GL*Yr@TudL{nk?1qj zR{cWY>TtryC^-%to4vrX8s3&47;xI5XX#cMjo#)fA}S)vu&i!mhY_r9C$_9zyV>)s z+Nw}z2Ug4W1IrKm2x7r program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -. = 0x40400; -buf2 = .; -. += 0x400; -. = 0x44000; -buf1 = .; -. += 0x400; -. = 0x48000; -buf0 = .; -. += 0x400; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -. = 0x60400; -buf8 = .; -. += 0x400; -. = 0x64000; -buf7 = .; -. += 0x400; -. = 0x68000; -buf6 = .; -. += 0x400; -. = 0x70400; -buf5 = .; -. += 0x400; -. = 0x74000; -buf4 = .; -. += 0x400; -. = 0x78000; -buf3 = .; -. 
+= 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_3); diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll deleted file mode 100644 index 79b2ca7..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_3() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %17, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi 
i32 [ %16, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %17 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf5, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = getelementptr float, ptr @buf4, i32 %3 - %9 = load <16 x float>, ptr %8, align 4 - %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) - %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> - %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> - %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) - %15 = getelementptr float, ptr @buf3, i32 %3 - store <16 x float> %14, ptr %15, align 4 - %16 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -17: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.o deleted file mode 100644 index 4343d406be95fbbfffcd92a0f42bf02db06255c2..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1000 zcmaJ9W#}>r3!R-d>se(O- zi1ecD(Tn$NW?5%scbU%sbD#v)Ng5?klB`ih`Cyz|j;? 
zU~)AoxQ;T;Y9-XfAw1o!29yJjpK(lFs&VIe^-Zkt%Bc6 ztPwGc#zO2ZS;dQ`zhUe1T=@9M+^>4M`=J(I2Es3SQe(l^jIZ_#uwFm88t%Ng*2D=3 zd6viukiqk3pR0%?#=HKd96qWWy-y{zC^Il0i{$sGCh&ZPQ)ZKL)^pr)@!b8mANg&p zBz&B{q%QEL_KmOHjk$=Byg3PE=1S%PMf^D(5L;QSfEV=la@Z4JrGWyTB(BZaJipwX zG)1dZ?@r)*6IfuCdlV6~T#j7!0&Z%tFR!S|pMvs{S}jH{OmZszUm`IBcRPT&ISgF$ z!K0_kTS2$~3Xb!v)qL6Y+h_4Fe3~l#g!><@G;zKF diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll deleted file mode 100644 index ce97114..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll +++ /dev/null @@ -1,72 +0,0 @@ -; ModuleID = 'air_project/mul_kernel_0_core_0_3.peanohack.ll' -source_filename = "LLVMDialectModule" -target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" -target triple = "aie2p" - -@buf3 = external local_unnamed_addr global [256 x float] -@buf4 = external local_unnamed_addr global [256 x float] -@buf5 = external local_unnamed_addr global [256 x float] - -; Function Attrs: nounwind -declare void @llvm.aie2p.acquire(i32, i32) #0 - -; Function Attrs: nounwind -declare void @llvm.aie2p.release(i32, i32) #0 - -; Function Attrs: nounwind memory(inaccessiblemem: write) -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1 - -; Function Attrs: noreturn nounwind -define void @core_0_3() local_unnamed_addr #2 { - tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %17, %0 - tail call void @llvm.aie2p.acquire(i32 49, i32 -1) - tail call void @llvm.aie2p.acquire(i32 50, i32 -1) - tail call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %1, %2 - %3 = phi i32 [ 0, %1 ], [ %15, %2 ] - %4 = trunc nuw i32 %3 to i20 - %5 = getelementptr float, ptr @buf5, i20 %4 - %6 = load <16 x float>, ptr %5, align 64 - %7 = getelementptr float, ptr @buf4, i20 %4 - %8 = load <16 x 
float>, ptr %7, align 64 - %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) - %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8) - %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> - %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> - %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60) - %14 = getelementptr float, ptr @buf3, i20 %4 - store <16 x float> %13, ptr %14, align 64 - %15 = add nuw nsw i32 %3, 16 - %16 = icmp ult i32 %3, 240 - br i1 %16, label %2, label %17, !llvm.loop !1 - -17: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 53, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: nofree nounwind memory(inaccessiblemem: read) -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 - -; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 - -attributes #0 = { nounwind } -attributes #1 = { nounwind memory(inaccessiblemem: write) } -attributes #2 = { noreturn nounwind } -attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } -attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll deleted file mode 100644 index c86e34d..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 
'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_3() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %17, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %16, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %17 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf5, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = getelementptr float, ptr @buf4, i32 %3 - %9 = load <16 x float>, ptr %8 - %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %11 = call <16 x bfloat> 
@llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) - %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> - %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> - %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) - %15 = getelementptr float, ptr @buf3, i32 %3 - store <16 x float> %14, ptr %15 - %16 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -17: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.elf deleted file mode 100755 index 2158287344d4e58ea0bb86865df912b73324cb82..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1740 zcma)7O=u%!7=9*=NhRM5e%Yx=t5Xt=vT&vn+kQpAuIH1Kb~2>pQ74vzQNm0PoOzSyLG@bdEQCkB_^db+kD^=jrJH*f&3LNk2mO=VOg=9j}qrudCzwefLInUntMk0JhH z=3o4zwg8Draer;G-j^S*$NDD9Nj-L-Qyv7=!k66TW$=m!wSZ*L%s0V1LlVE=p*tfr z2VY!$G}F2J-OTWVqrujn`Gri@oJsb_oFrb2uT&O=TD>;8B&eTX&8wJ~3n#8Az|k>K z_k$fTJg3#`HO;7&^pXxZO*`1qcD=Ccw}Zs0&2L9hr+aR3am$akdySIQZY{3+O+VOM zY&LgW&)dGIHOh{wSKV^iv)!^@JySXBdd`wtamz-d+^9H4)pM(r&FXV;57RT-86%5( z$B&V%jV%hEq&}SD&rsLKNeU)+BByunSbhr|Q~ni+=P+qI2?ZCZBO4!25xKaQIpME~ zzdzw`2q!cS$!`eZOBYD~yObMTyF)4d_bD|Ff$)zgH6cWid_buQA%uTTsR<#3|3Ik; zA%y=#sR<#3-=x%p5W;^ECChh1NI!?7m=pe+@cxAVE<79m!X^F>;pT)N3GYuhN|Co0 zL9V^!1ys5hTan$M-0ntUtc|7Z>{wm@9k0vxReyc8=G#rriB{Wguj!#wUs=&DBatsq 
zTJ`gRtHTK+qvSX=Z1w`jYIs|IV8CgIo~2tAGp$X7CprDZ_YDh5&2*hq{sTMI5=sC7 diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script deleted file mode 100644 index ddda3c2..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script +++ /dev/null @@ -1,78 +0,0 @@ - -MEMORY -{ - program (RX) : ORIGIN = 0, LENGTH = 0x0020000 - data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 -} -ENTRY(__start) -SECTIONS -{ - . = 0x0; - .text : { - /* the __start symbol has to come at address zero. */ - *crt0.o(.text*) - _ctors_start = .; - _init_array_start = .; - KEEP(SORT(*.init_array)) - _ctors_end = .; - _init_array_end = .; - _dtors_start = .; - _dtors_end = .; - *(.text*) - } > program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -. = 0x40400; -buf5 = .; -. += 0x400; -. = 0x44000; -buf4 = .; -. += 0x400; -. = 0x48000; -buf3 = .; -. += 0x400; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -. = 0x60400; -buf11 = .; -. += 0x400; -. = 0x64000; -buf10 = .; -. += 0x400; -. = 0x68000; -buf9 = .; -. += 0x400; -. = 0x70400; -buf8 = .; -. += 0x400; -. = 0x74000; -buf7 = .; -. += 0x400; -. = 0x78000; -buf6 = .; -. 
+= 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_4); diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll deleted file mode 100644 index 2552e6c..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_4() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %17, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi 
i32 [ %16, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %17 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf8, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = getelementptr float, ptr @buf7, i32 %3 - %9 = load <16 x float>, ptr %8, align 4 - %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) - %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> - %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> - %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) - %15 = getelementptr float, ptr @buf6, i32 %3 - store <16 x float> %14, ptr %15, align 4 - %16 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -17: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.o deleted file mode 100644 index 2fea81b77544b9d8acd307af0a8527ae8aff2af3..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1000 zcmaJse(O- zi1ecD(Tn$NW?5%scbU%sbD#v)Nj;?A0`JS?4h|n!z_hlv`CaW->Q($+ zVvUHdHy2`W$t<2N{SDin=fX!n=6*HG&WCz<9tgkWN%aLw)4y6Xz*^(zVz~9@QVT~U zR3_db=>qRha2ERx@yn!xi_PMJ;0Sx<4x#Z&j=e&n}s zJ>lc@C3S{3wQqdocFaYD%+h` z?mv98yb(D4SFr7;?bb`jbNfMiI2d^@jE%q#oF47l*J6947q{fNDaQ>MzUw(rFQY&7 
z-LBbPfnhhBX7^fR>$F{ab$xKMreuWwnJY{tPl(DooO_hJkgim$XtEMmxf*e0ry_0Y zWSxA2RMNUN24r1UlBkeZV$xsck=!g*)}FhLcX^P@)#, ptr %5, align 64 - %7 = getelementptr float, ptr @buf7, i20 %4 - %8 = load <16 x float>, ptr %7, align 64 - %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) - %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8) - %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> - %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> - %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60) - %14 = getelementptr float, ptr @buf6, i20 %4 - store <16 x float> %13, ptr %14, align 64 - %15 = add nuw nsw i32 %3, 16 - %16 = icmp ult i32 %3, 240 - br i1 %16, label %2, label %17, !llvm.loop !1 - -17: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 53, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: nofree nounwind memory(inaccessiblemem: read) -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 - -; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 - -attributes #0 = { nounwind } -attributes #1 = { nounwind memory(inaccessiblemem: write) } -attributes #2 = { noreturn nounwind } -attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } -attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll 
b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll deleted file mode 100644 index bfe891f..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_4() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %17, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %16, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %17 - -5: ; preds = %2 - %6 = getelementptr 
float, ptr @buf8, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = getelementptr float, ptr @buf7, i32 %3 - %9 = load <16 x float>, ptr %8 - %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) - %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> - %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> - %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) - %15 = getelementptr float, ptr @buf6, i32 %3 - store <16 x float> %14, ptr %15 - %16 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -17: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.elf deleted file mode 100755 index 680e4695811b0567bb930a499c2383fd1f5d7bf8..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1676 zcma)6O^6$17=9)jldSlYR$2ysU|2y`S+g@HcAK7NmsE<)!uFyI7G@_iv%AA4lXNm! 
zw}NcIf<34=B?^KUZyth&3VRY-uqWxkLvMCVJ$NbYtq1A$c{4LfDjs~8?|a|p{rTSS zn|J2hKU;rKQ52-epfF|?8lz``8OjPYWH66BvXJOLM~YiC$Y3an92fqI+|)NMbJHZz zvkHbl4R>bqWA7WU0(ak^(ePk_o|~+Bt%xBjXy9{Dgnr0sherqN%I(=X;IsU}I&Pmg zxr4&By&sidi^eE7UQkt6PR81To_#*|c(D4_vBBE!$L=lX?W@M%1Tgq|bfh>4>PfSp zeq+u8=a%ohJlMbVT5Ut>)y%s*z#+g2&G5O8lu?bCUkP8B;T&luLQ0b4|<1te!?z6tgWN&G>F?u^tN ze1HALOy|MRGsDjx4YvNtFJ!XjOmaTvB=Kt8Qdtyg^~R)0P`|yNS1~UQCvGUfqa&d1 z2RmMPMyuCrno%w3B^_{@cCe-GdSTaZ2Z>di-;SbA_srtrmLF~R8YQRQT3q*=ez3RL zZ0@$+wtY`)lpR;Ey5+KGyJfw4s&d-(oF%v7mW@WaQE`l_=TikOwZiT7+E|# zdWmdpY*FwE_2CqMow_zoQZVU>yuF7<@-J*m`Bx;K!=&vv6r87yZ2WqP$l-!;bHYCq z-kH-^5Y+7;{TNP4X)iu{8uS84uQmfNvR1Tisau>YC;I%KT&Ez2;sj_YC;I% zzfo#J2;qNFYC;I%e~OajcSFc|?ulYf_}{|&6OK~k?M09_w!DB!_i`(;8wn^P`MuOXSgrYX({rNLw%coZDAiY1bjwKOJCs)aY~bo}!pJB&4h@^Vz_A+M zmLC{!+M#FZ*1Kr*HVtDm>oQkiS>4DEBUs%|tXaEuv*%g0RiVxftd{KumLK>LM1$o- z?Xa6>iQTfp(7v2nVz(E#7$qgn5qGCEVqtCZW85U^(<%41tRdZ$82`Q+*?UbKk>C#P->>@r1A+>rU?lE diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script deleted file mode 100644 index 51c13db..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script +++ /dev/null @@ -1,72 +0,0 @@ - -MEMORY -{ - program (RX) : ORIGIN = 0, LENGTH = 0x0020000 - data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 -} -ENTRY(__start) -SECTIONS -{ - . = 0x0; - .text : { - /* the __start symbol has to come at address zero. */ - *crt0.o(.text*) - _ctors_start = .; - _init_array_start = .; - KEEP(SORT(*.init_array)) - _ctors_end = .; - _init_array_end = .; - _dtors_start = .; - _dtors_end = .; - *(.text*) - } > program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -. = 0x40400; -buf8 = .; -. += 0x400; -. 
= 0x44000; -buf7 = .; -. += 0x400; -. = 0x48000; -buf6 = .; -. += 0x400; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -/* No tile with memory exists to the north. */ -. = 0x60000; -. += 0x10000; -. = 0x70400; -buf11 = .; -. += 0x400; -. = 0x74000; -buf10 = .; -. += 0x400; -. = 0x78000; -buf9 = .; -. += 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_5); diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll deleted file mode 100644 index 4ed7251..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_5() { - 
call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %17, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %16, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %17 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf11, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = getelementptr float, ptr @buf10, i32 %3 - %9 = load <16 x float>, ptr %8, align 4 - %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) - %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> - %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> - %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) - %15 = getelementptr float, ptr @buf9, i32 %3 - store <16 x float> %14, ptr %15, align 4 - %16 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -17: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.o deleted file mode 100644 index e70224c9ad756c9bb1acf0ca65fa018ff13a4066..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 
literal 1000 zcmZ`%O=}Zj5Pse!O}0{HTd*#I5PH(sF1xlKdXY^`wc~_4Dt!23>Q}AY`Cx|^f$&S7)R=KJlto2c; zVY>CY)xMEfTMBKPsO>JVty&R0T~so}|IIa~k}E_-EzbQWSDvm^tZ1?lIJp{eWv3#y zsiSrB4N^(#ZZRP1a*~8j-WB@NU*?hABvsa)JC1jGkjvTh*&-9E+a;3najNta?tf@J Bay|e6 diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll deleted file mode 100644 index 80307e8..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll +++ /dev/null @@ -1,72 +0,0 @@ -; ModuleID = 'air_project/mul_kernel_0_core_0_5.peanohack.ll' -source_filename = "LLVMDialectModule" -target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" -target triple = "aie2p" - -@buf9 = external local_unnamed_addr global [256 x float] -@buf10 = external local_unnamed_addr global [256 x float] -@buf11 = external local_unnamed_addr global [256 x float] - -; Function Attrs: nounwind -declare void @llvm.aie2p.acquire(i32, i32) #0 - -; Function Attrs: nounwind -declare void @llvm.aie2p.release(i32, i32) #0 - -; Function Attrs: nounwind memory(inaccessiblemem: write) -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1 - -; Function Attrs: noreturn nounwind -define void @core_0_5() local_unnamed_addr #2 { - tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %17, %0 - tail call void @llvm.aie2p.acquire(i32 49, i32 -1) - tail call void @llvm.aie2p.acquire(i32 50, i32 -1) - tail call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %1, %2 - %3 = phi i32 [ 0, %1 ], [ %15, %2 ] - %4 = trunc nuw i32 %3 to i20 - %5 = getelementptr float, ptr @buf11, i20 %4 - %6 = load <16 x float>, ptr %5, align 64 - %7 = getelementptr float, ptr @buf10, i20 %4 - %8 = load <16 x float>, ptr %7, align 64 - %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) - %10 = tail call 
<16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8) - %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> - %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> - %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60) - %14 = getelementptr float, ptr @buf9, i20 %4 - store <16 x float> %13, ptr %14, align 64 - %15 = add nuw nsw i32 %3, 16 - %16 = icmp ult i32 %3, 240 - br i1 %16, label %2, label %17, !llvm.loop !1 - -17: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 53, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: nofree nounwind memory(inaccessiblemem: read) -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 - -; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 - -attributes #0 = { nounwind } -attributes #1 = { nounwind memory(inaccessiblemem: write) } -attributes #2 = { noreturn nounwind } -attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } -attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll deleted file mode 100644 index 5a9b5b8..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external 
global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_5() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %17, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %16, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %17 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf11, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = getelementptr float, ptr @buf10, i32 %3 - %9 = load <16 x float>, ptr %8 - %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9) - %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> - %13 = shufflevector 
<16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> - %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60) - %15 = getelementptr float, ptr @buf9, i32 %3 - store <16 x float> %14, ptr %15 - %16 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -17: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_design.bif b/examples/elementwise_arith/air_project/mul_kernel_0_design.bif deleted file mode 100644 index 86ba205..0000000 --- a/examples/elementwise_arith/air_project/mul_kernel_0_design.bif +++ /dev/null @@ -1,10 +0,0 @@ -all: -{ - id_code = 0x14ca8093 - extended_id_code = 0x01 - image - { - name=aie_image, id=0x1c000000 - { type=cdo file=air_project/mul_kernel_0_aie_cdo_elfs.bin file=air_project/mul_kernel_0_aie_cdo_init.bin file=air_project/mul_kernel_0_aie_cdo_enable.bin } - } -} diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_mul_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/mul_kernel_0_mul_kernel_0_sequence.bin deleted file mode 100644 index f2eb383b7eb903e8ad8809b7d44fb0a4d660bfb6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3248 zcmcJQ-Aw~A5QNuneg4w5;eLtEZyo^(@j}krmd|WcZ&8OQJf_lyu+9e|Yukmso`>#0IXQ!}T>04{- zdWRz5+iJIiZ@khs7OO7qzIV<49O}M3Vd35Et*zr4+n!k}H6?s5ZHHP2b8iNd9f!uq z2%ELeWFG=0l9+H#Mw;SV&vYiNQGA=m?@TyH^<^fmv5ZNwd1Or5@6ws1SwdvOIT;Fy zFEe2cjS1(dzRbimmN6++jgb*5y%`xL$~=|+C!AAeiZ3%^4UGxssJ_g^HI^|MPc*|& 
zYb`xk4J-t?`*yLPCTa8!jZ|yI(P^-T#)NY+)Rg;`nXrb&gmYA1X5t#lnB*HUVGWH5 W=cv75Caj?`;T+YMnYhL+nEU|x-i@&U diff --git a/examples/elementwise_arith/air_project/npu.asm_air_output.mlir b/examples/elementwise_arith/air_project/npu.asm_air_output.mlir deleted file mode 100644 index a66ce9e..0000000 --- a/examples/elementwise_arith/air_project/npu.asm_air_output.mlir +++ /dev/null @@ -1,300 +0,0 @@ -#loop_annotation = #llvm.loop_annotation -module { - aie.device(npu2) @square_kernel_0 { - %shim_noc_tile_0_0 = aie.tile(0, 0) - %shim_noc_tile_1_0 = aie.tile(1, 0) - %mem_tile_0_1 = aie.tile(0, 1) - %mem_tile_1_1 = aie.tile(1, 1) - %tile_0_2 = aie.tile(0, 2) - %tile_0_3 = aie.tile(0, 3) - %tile_0_4 = aie.tile(0, 4) - %tile_0_5 = aie.tile(0, 5) - %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32} - %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32} - %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32} - %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32} - %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32} - %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32} - %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32} - %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32} - %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32} - %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32} - %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32} - %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32} - %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32} - %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32} - %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32} - %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32} - %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32} - %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32} - %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32} - %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32} - %buf9 = aie.buffer(%mem_tile_0_1) {sym_name = "buf9"} : memref<1024xi16, 1 : i32> - %buf8 = aie.buffer(%mem_tile_1_1) {sym_name = 
"buf8"} : memref<1024xi16, 1> - %buf7 = aie.buffer(%tile_0_5) {sym_name = "buf7"} : memref<256xi16, 2> - %buf6 = aie.buffer(%tile_0_5) {sym_name = "buf6"} : memref<256xi16, 2> - %buf5 = aie.buffer(%tile_0_4) {sym_name = "buf5"} : memref<256xi16, 2> - %buf4 = aie.buffer(%tile_0_4) {sym_name = "buf4"} : memref<256xi16, 2> - %buf3 = aie.buffer(%tile_0_3) {sym_name = "buf3"} : memref<256xi16, 2> - %buf2 = aie.buffer(%tile_0_3) {sym_name = "buf2"} : memref<256xi16, 2> - %buf1 = aie.buffer(%tile_0_2) {sym_name = "buf1"} : memref<256xi16, 2> - %buf0 = aie.buffer(%tile_0_2) {sym_name = "buf0"} : memref<256xi16, 2> - %mem_0_5 = aie.mem(%tile_0_5) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1) - aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_5_12, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1) - aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_5_11, Release, 1) - aie.next_bd ^bb4 - } - %core_0_5 = aie.core(%tile_0_5) { - %0 = ub.poison : i16 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c256 step %c32 { - %subview = memref.subview %buf7[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_14 = memref.subview %buf6[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %2 = arith.muli %1, %1 : vector<32xi16> - 
vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } {loop_annotation = #loop_annotation} - aie.use_lock(%lock_0_5, Release, 1) - aie.use_lock(%lock_0_5_13, Release, 1) - cf.br ^bb1 - } - %mem_0_4 = aie.mem(%tile_0_4) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1) - aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_4_9, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1) - aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_4_8, Release, 1) - aie.next_bd ^bb4 - } - %core_0_4 = aie.core(%tile_0_4) { - %0 = ub.poison : i16 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c256 step %c32 { - %subview = memref.subview %buf5[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_14 = memref.subview %buf4[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %2 = arith.muli %1, %1 : vector<32xi16> - vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } {loop_annotation = #loop_annotation} - aie.use_lock(%lock_0_4, Release, 1) - aie.use_lock(%lock_0_4_10, Release, 1) - cf.br ^bb1 - } - %mem_0_3 = aie.mem(%tile_0_3) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - 
aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1) - aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_3_6, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1) - aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_3_5, Release, 1) - aie.next_bd ^bb4 - } - %core_0_3 = aie.core(%tile_0_3) { - %0 = ub.poison : i16 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c256 step %c32 { - %subview = memref.subview %buf3[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_14 = memref.subview %buf2[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %2 = arith.muli %1, %1 : vector<32xi16> - vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } {loop_annotation = #loop_annotation} - aie.use_lock(%lock_0_3, Release, 1) - aie.use_lock(%lock_0_3_7, Release, 1) - cf.br ^bb1 - } - %mem_0_2 = aie.mem(%tile_0_2) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1) - aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_2_3, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb3 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1) - aie.dma_bd(%buf1 : 
memref<256xi16, 2>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_2_2, Release, 1) - aie.next_bd ^bb4 - } - %core_0_2 = aie.core(%tile_0_2) { - %0 = ub.poison : i16 - %c256 = arith.constant 256 : index - %c32 = arith.constant 32 : index - %c0 = arith.constant 0 : index - cf.br ^bb1 - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1) - aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1) - scf.for %arg0 = %c0 to %c256 step %c32 { - %subview = memref.subview %buf1[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_14 = memref.subview %buf0[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - %2 = arith.muli %1, %1 : vector<32xi16> - vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } {loop_annotation = #loop_annotation} - aie.use_lock(%lock_0_2, Release, 1) - aie.use_lock(%lock_0_2_4, Release, 1) - cf.br ^bb1 - } - aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0) - aie.flow(%mem_tile_1_1, DMA : 0, %shim_noc_tile_1_0, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0) - aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0) - aie.flow(%tile_0_2, DMA : 0, %mem_tile_1_1, DMA : 0) - aie.flow(%tile_0_3, DMA : 0, %mem_tile_1_1, DMA : 1) - aie.flow(%tile_0_4, DMA : 0, %mem_tile_1_1, DMA : 2) - aie.flow(%tile_0_5, DMA : 0, %mem_tile_1_1, DMA : 3) - %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {task_id = 0 : i32} - aie.use_lock(%lock_1_1, Release, 4) - 
aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1) - aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {task_id = 0 : i32} - aie.use_lock(%lock_1_1_1, Release, 1) - aie.next_bd ^bb10 - } - %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) { - %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3) - ^bb1: // 2 preds: ^bb0, ^bb1 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb1 - ^bb2: // pred: ^bb9 - aie.end - ^bb3: // pred: ^bb0 - %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5) - ^bb4: // 2 preds: ^bb3, ^bb4 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb4 - ^bb5: // pred: ^bb3 - %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7) - ^bb6: // 2 preds: ^bb5, ^bb6 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {task_id 
= 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb6 - ^bb7: // pred: ^bb5 - %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9) - ^bb8: // 2 preds: ^bb7, ^bb8 - aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32} - aie.use_lock(%lock_0_1, Release, 1) - aie.next_bd ^bb8 - ^bb9: // pred: ^bb7 - %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2) - ^bb10: // 2 preds: ^bb9, ^bb10 - aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4) - aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32} - aie.use_lock(%lock_0_1_0, Release, 4) - aie.next_bd ^bb10 - } - aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0) - aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0) - aie.runtime_sequence @square_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { - %0 = aiex.dma_configure_task_for @air_channel_0 { - aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [, ]) - aie.end - } - aiex.dma_start_task(%0) - %1 = aiex.dma_configure_task_for @air_channel_3 { - aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [, ]) - aie.end - } {issue_token = true} - aiex.dma_start_task(%1) - aiex.dma_free_task(%0) - aiex.dma_await_task(%1) - } - } {dlti.dl_spec = #dlti.dl_spec} - aie.device(npu2) { - aie.runtime_sequence @square_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { - aiex.configure @square_kernel_0 { - aiex.run @square_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32) - } - } - } -} diff --git a/examples/elementwise_arith/air_project/placed.asm_air_output.mlir b/examples/elementwise_arith/air_project/placed.asm_air_output.mlir deleted file mode 100644 index aa82d2e..0000000 --- a/examples/elementwise_arith/air_project/placed.asm_air_output.mlir +++ 
/dev/null @@ -1,86 +0,0 @@ -module { - air.channel @channel_0 [] - air.channel @channel_1 [4, 1] - air.channel @channel_2 [4, 1] - air.channel @channel_3 [] - func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) { - %c1 = arith.constant 1 : index - %0 = air.launch async (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> attributes {id = 1 : i32} { - %c1024 = arith.constant 1024 : index - %c1_0 = arith.constant 1 : index - %1 = arith.muli %arg8, %c1024 : index - %2 = air.channel.put async @channel_0[] (%arg14[%1] [%c1024] [%c1_0]) {id = 1 : i32} : (memref<*xi16>) - %3 = air.channel.get async @channel_3[] (%arg15[%1] [%c1024] [%c1_0]) {id = 2 : i32} : (memref<*xi16>) - %4 = air.segment @square_kernel_0 async attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 8 : i64, y_loc = 2 : i64, y_size = 6 : i64} { - %c4 = arith.constant 4 : index - %c768 = arith.constant 768 : index - %c3 = arith.constant 3 : index - %c512 = arith.constant 512 : index - %c2 = arith.constant 2 : index - %c256 = arith.constant 256 : index - %c0 = arith.constant 0 : index - %c1_1 = arith.constant 1 : index - %async_token, %results = air.execute -> (memref<1024xi16, 1 : i32>) { - %alloc = memref.alloc() : memref<1024xi16, 1 : i32> - air.execute_terminator %alloc : memref<1024xi16, 1 : i32> - } - %5 = air.channel.get async [%async_token] @channel_0[] (%results[] [] []) {id = 3 : i32} : (memref<1024xi16, 1 : i32>) - %async_token_2, %results_3 = air.execute -> (memref<1024xi16, 1>) { - %alloc = memref.alloc() : memref<1024xi16, 1> - air.execute_terminator %alloc : memref<1024xi16, 1> - } - %6 = air.channel.put async [%5] @channel_1[%c0, %c0] (%results[%c0] [%c256] [%c1_1]) {id = 4 : i32} : (memref<1024xi16, 1 : i32>) - %7 = air.channel.put async [%5] @channel_1[%c1_1, %c0] 
(%results[%c256] [%c256] [%c1_1]) {id = 5 : i32} : (memref<1024xi16, 1 : i32>) - %8 = air.channel.put async [%5] @channel_1[%c2, %c0] (%results[%c512] [%c256] [%c1_1]) {id = 6 : i32} : (memref<1024xi16, 1 : i32>) - %9 = air.channel.put async [%5] @channel_1[%c3, %c0] (%results[%c768] [%c256] [%c1_1]) {id = 7 : i32} : (memref<1024xi16, 1 : i32>) - %10 = air.channel.get async [%async_token_2] @channel_2[%c0, %c0] (%results_3[%c0] [%c256] [%c1_1]) {id = 8 : i32} : (memref<1024xi16, 1>) - %11 = air.channel.get async [%async_token_2] @channel_2[%c1_1, %c0] (%results_3[%c256] [%c256] [%c1_1]) {id = 9 : i32} : (memref<1024xi16, 1>) - %12 = air.channel.get async [%async_token_2] @channel_2[%c2, %c0] (%results_3[%c512] [%c256] [%c1_1]) {id = 10 : i32} : (memref<1024xi16, 1>) - %13 = air.channel.get async [%async_token_2] @channel_2[%c3, %c0] (%results_3[%c768] [%c256] [%c1_1]) {id = 11 : i32} : (memref<1024xi16, 1>) - %14 = air.herd @herd_0 async [%5, %async_token_2] tile (%arg16, %arg17) in (%arg18=%c1_1, %arg19=%c4) attributes {id = 3 : i32, x_loc = 0 : i64, y_loc = 2 : i64} { - %c32 = arith.constant 32 : index - %c256_5 = arith.constant 256 : index - %c0_6 = arith.constant 0 : index - %16 = ub.poison : i16 - %async_token_7, %results_8 = air.execute -> (memref<256xi16, 2>) { - %alloc = memref.alloc() : memref<256xi16, 2> - air.execute_terminator %alloc : memref<256xi16, 2> - } - %17 = air.channel.get async [%async_token_7] @channel_1[%arg17, %c0_6] (%results_8[] [] []) {id = 12 : i32} : (memref<256xi16, 2>) - %async_token_9, %results_10 = air.execute -> (memref<256xi16, 2>) { - %alloc = memref.alloc() : memref<256xi16, 2> - air.execute_terminator %alloc : memref<256xi16, 2> - } - %18 = air.wait_all async [%17, %async_token_9] - %19 = scf.for %arg20 = %c0_6 to %c256_5 step %c32 iter_args(%arg21 = %18) -> (!air.async.token) { - %subview = memref.subview %results_8[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %subview_13 = 
memref.subview %results_10[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2> - %async_token_14, %results_15 = air.execute [%arg21] -> (vector<32xi16>) { - %23 = vector.transfer_read %subview[%c0_6], %16 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16> - air.execute_terminator %23 : vector<32xi16> - } - %21 = arith.muli %results_15, %results_15 : vector<32xi16> - %async_token_16 = air.execute [%arg21] { - vector.transfer_write %21, %subview_13[%c0_6] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2> - } - %22 = air.wait_all async [%async_token_14, %async_token_16] - scf.yield %22 : !air.async.token - } - %20 = air.channel.put async [%async_token_9] @channel_2[%arg17, %c0_6] (%results_10[] [] []) {id = 13 : i32} : (memref<256xi16, 2>) - %async_token_11 = air.execute [%17] { - memref.dealloc %results_8 : memref<256xi16, 2> - } - %async_token_12 = air.execute [%20] { - memref.dealloc %results_10 : memref<256xi16, 2> - } - } - %15 = air.channel.put async [%14] @channel_3[] (%results_3[] [] []) {id = 14 : i32} : (memref<1024xi16, 1>) - %async_token_4 = air.execute [%15] { - memref.dealloc %results_3 : memref<1024xi16, 1> - } - air.wait_all [%6, %7, %8, %9, %10, %11, %12, %13, %async_token_4] {air.segment_end} - } - } - return - } -} diff --git a/examples/elementwise_arith/air_project/square_kernel_0.pdi b/examples/elementwise_arith/air_project/square_kernel_0.pdi deleted file mode 100644 index 1a6b4e2869f47c37579486ca1fbb299d49a2e6c4..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6272 zcmeHLO=w(I6h3d}C7Eh$-^f^A+Z11tBB5mQ(uvSQz&FkmqDGt=g@TKwLM;WA*p1y8 zG1MX=0XNf%kc}HxgF+W|GtzF9BD3tqFs-iTrMT##I-YZX-u>RQSyWerUYMEh`|i2t zo_o%jzxyi@wdzyVOCP@T=FK0zy-ehj$J!x&$m={mWPG{yb4?@t{8Fpkj)I6{6b+Vj z|7Bb_{Nokb57(bQcjnZ&_fDTZ;~Y*nm`7jD96NY(gy<)J$SaPa8E>#SN>su<(C+`> z<9`hQ^4USN^&0O$bc18~^ZV|w%A1wEGEp?0olSVKH7REL!8|N*D)0*Ms;5sNzDK>>)quKvj{)5tUOVaC+BHS= 
z$=J1%bnBIbJy)(?_{Qr6b80QAF&nRskN(|x^^0n!{Y&-t#@OjgbDd}OoP7@WCKaMZ zZ?LcP(FX@x$!(6Z72V;)^ZxAd-SgwJzmA9b`uW|!`;GAJ&o9!Xnb%HbzFU&@j`R9! znlZA&?JY{tKCGrzP`Svw3>cb?tXf& zbN2q&Y{^f`<~)>G`%v<0MtJOoD*}a=uaAW^V`gt~M(z!DU)Vm7WzrY64=lHRV7cuB z%PFcC`f9GwS9wlbd0qjY@24Y=`-vXKe$w@|{Slh_T5juaxvjtD6g3O|)F|{*p3_#I zSAggH>BYzW^hoxTuCMKn(A3v*TYt-K{Vk_xxzJB13jLJlw3X)-;Q4-f{c%4%lKrIX zYx^TK^|jpA-*Q`j;ZlG5TMBzGxjWwp zIG@8&>TnD=0o(?>0(ceh8sH7Un}EB3dw~0Z2Y^$+0ZquSWc^-aY|q7d33S!6@_vlD zw>@JYLC*)E6X-GbinDh;9e_^ITQTR}*}EPIKqshG%x7`-F6RO0WX@Za&l597JpN-N zqt9zRl>*Jp^WX)4^g;HKvQKihLVi2t@0R@3_%l$3^iE)~jn8K;SoIamRatcLG#)Y@ zu}@^n+;H*mJP#d@#vXHt)#vYj!8#r{myd^Q_E7l|`-Hvbhl_{jdFXgFcJb(5(eb#s zd_0p68IRZ}YEOJ}@$ftk9goHybDbaE>pC7cr{j4-ts~mci4Xx#|E1Mjr7%}xE>2bM3v(09_nEm6=InjRh-dT7c~p^G&Wm6q!$vWm+J1+8 z7VWuS+P?JQdi~43Z0vW)XVISPrLHN6cBt3=WZmRnoyz22VVnPj(fwp|e3Y>aJJ(`v zYm*e_mYCZdFBax}=C(Fem|IB8#8+Z#5?5`T@52nPuzsAVCpu|eg-g|bB(&>P_ zQ`FiS??ARTX5IQB&=b}^~IDLV2zMfP2 z9@=*L66JEC1^{0*fBIpyWXZ-_unAX969sJk9zth9C3HD88uRe=Y pj((hv!{1NB{nRVPOgYBf|1Y+ZbL!7;a|h|TzC&_c-vJ26@egb?K~ diff --git a/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_elfs.bin deleted file mode 100644 index 29b57b909b220d0abba36bf749886d055504d85a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2528 zcmeH_ze~eF6vw}pR+B=}cCehF5GzQB4z^Bisa6n0{IwKZ)Wt=l*2T@iq7LF52f@X? 
zI4E=~*jYL{6)XM$jt*kIYwpOy^oQP-Xxzuyda=lvgB0F1+5C6JrH%WKn zHThXE_pWm8n75{XdCIYXW%fSqo*hpXosMXzg-Y Q>I_#ircAp)nRY?4KU-3TT>t<8 diff --git a/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_enable.bin deleted file mode 100644 index 7cc1818bce4d6ce1226fc5fda519967a4842b99e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 104 zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>)) B1+xGE diff --git a/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_init.bin deleted file mode 100644 index ace360fc11f90c98660ab2cdf87c15ccc1146121..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 4300 zcmb7`y>1gh6ov2pBp3*=K`u;UiAAPBC_;-Iq!f`D%Rx$EiJ&xeBnG9QU zs%kD6pNrU!l#}M1^n%zBpAerBpA%1sFNiORXT)>j1@V&D*1XPX+>%pO(>aYOC(XIl zTo4=L6XG-CbK)uS1@R^EjCfAGAYKyNn%6m1A3S*c+Sygb5o&%9 zDQb?rM*@%T4`=_N{G8ulX8q{?;KAe9&aNuf{Z)6+dnEAa{@y&WKj$~)`r$v_A3S*c z+Sygb5o+uFs&~wLB=A)IJt^l9*@>&$=eaJ|ALeHCEA;2i?-r6BZLDc@Ykr1)Nos%e zyBx)X*l+y&@q4zG%>LZZpZvLZexJ$yCVIi+Gbw6TxuNFRd!)s)g<3E0bXq)J)ZAiz zo~*^wN9|SM=>?vR$e#ytE_db`1A@ogjyUhie^1`*$9Y!%xcxlu_>oKMN&`5dAaW`EG)IYKQwL(}4!pcZC-+~P@53(pXD;ch=Rkqwcl z-$Qp*{x)y81O1E0-0M>5Yran8Q}p^pqsy+~q?_^YqFuLU!v*w^g>(ZQdvE+H>3h(z zr$+CPegGXiX!I@8Gn>R&pqo|VB5M5DZ%q4j`n=iSbrOgBeJi_aJocj*`txA^1JYCI zsvpuj(4Ph4_etM^{vyy1Nl)!SHUqu48mRmYbah^&A4AvoLytd#j$Z;zemRYwKv(BU z`giE6f6`Msr1^)c|9$AXf2|)ve;oApcBS1;N=hu4A4&tXle|xKey1zMebw5e(LVp>IKPJ5oUEP1u pdv-|c4{7}&t*`Z-9n<)6Io^K0T<@{sIXlnetpM3{;ac?}PM z2Ar8;ylOTZ%BWZLiViq!J6Ka&UKsh^ zAoZ%b^*G*)&Mz#i`SE(MRdKqVg%!W;2iptn_EzUJ+xJwf>bQE{tyVqTt?Kp9YhSpY zbIz@~RijmH)f}Vlx%Jv={gY%5lTU7Eggjmzze{#?V5&GVNhAXs@PmZUA>0~W)%)gO1p(v97oe=&l0?FT^HrY$YBk`Z86-6L? 
zms$xSO5}H`l@LPsLuw_25I&$*LI~me)QTby{yVi2LI^()BcJ?%vY$VNn`4ej>}|)8 zF?GCvMs%YS+b!z#C=L^Aq-|%ziu@nFh~Go=+vSFDw>>9b?z+9Uhe~s4Nw^53YkpwB>4u)ATZ?G*Rtv4HRQ_>x1Mr62mFxQqOv}c&(H{uNUk#|cdhi`pU?U7ZlK6LouSCU ixr|U}TR2O5GVcQOj{YO>)3oMR8au-UWzDQlrvCwN-uX%Z diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script deleted file mode 100644 index 13a60c2..0000000 --- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script +++ /dev/null @@ -1,66 +0,0 @@ - -MEMORY -{ - program (RX) : ORIGIN = 0, LENGTH = 0x0020000 - data (!RX) : ORIGIN = 0x74400, LENGTH = 0xBC00 -} -ENTRY(__start) -SECTIONS -{ - . = 0x0; - .text : { - /* the __start symbol has to come at address zero. */ - *crt0.o(.text*) - _ctors_start = .; - _init_array_start = .; - KEEP(SORT(*.init_array)) - _ctors_end = .; - _init_array_end = .; - _dtors_start = .; - _dtors_end = .; - *(.text*) - } > program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -/* No tile with memory exists to the south. */ -. = 0x40000; -. += 0x10000; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -. = 0x60400; -buf3 = .; -. += 0x400; -. = 0x64000; -buf2 = .; -. += 0x400; -. = 0x70400; -buf1 = .; -. += 0x400; -. = 0x74000; -buf0 = .; -. 
+= 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_2); diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll deleted file mode 100644 index d193819..0000000 --- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll +++ /dev/null @@ -1,84 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [1024 x float] -@buf9 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_2() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %13, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %12, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %13 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf1, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = call <16 x bfloat> 
@llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> - %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) - %11 = getelementptr float, ptr @buf0, i32 %3 - store <16 x float> %10, ptr %11, align 4 - %12 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -13: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.o deleted file mode 100644 index 57437bb20b770876576b8753e9fb67450fce9b78..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 932 zcmZ`$O-mb56g|(cj1VLWO;czgx~if>M%)Neq9kphlj6oAbrC0VT1r#Xjw9G^5@?Vv zLf7*Lx_9LwtNw!0rOOEZ0Q(2D=f0V7hJY9DyZ5|v-hKDo*Qwgil~RaJ!AK#XKL(7G zT+@Q5$RN?m)+LZbiOHTS0!5tXa7M)Z@^-MRE~XcN@0r0aE?$*^N#p$Fm-?Nvinv~; zOl-N-%TC<{wV$)W_Mh3S&5ZNI3Lb=9ycY6h19+*1kAlvZClw6Y1lK8jKP$GptlZC< z?1#AL&7|<(%|fGpius5Siwb`>Ci(>P{2)Yk-2utnLXGMR#(o|MPBBfTfqXAKfA9J* z#Z2Je?OS3QUuff~G3iD~CjXxKbqZIayvPa6k>9!FrgieuTUBy4wX8P`l5A(3?yXK46J1a-N)BFhAep9P_a9VEDueA@3TQ1Ba-}9XY z, ptr %5, align 64 - %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) - %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> - %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60) - %10 = getelementptr float, ptr @buf0, i20 %4 - store <16 x float> %9, ptr %10, align 64 - %11 = add 
nuw nsw i32 %3, 16 - %12 = icmp ult i32 %3, 240 - br i1 %12, label %2, label %13, !llvm.loop !1 - -13: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: nofree nounwind memory(inaccessiblemem: read) -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 - -; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 - -attributes #0 = { nounwind } -attributes #1 = { nounwind memory(inaccessiblemem: write) } -attributes #2 = { noreturn nounwind } -attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } -attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll deleted file mode 100644 index 055e011..0000000 --- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll +++ /dev/null @@ -1,84 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [1024 x float] -@buf9 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void 
@llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_2() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %13, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %12, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %13 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf1, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> - %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) - %11 = getelementptr float, ptr @buf0, i32 %3 - store <16 x float> %10, ptr %11 - %12 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -13: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.elf deleted file mode 
100755 index a8d3607d6115f3e937af1d24e81c3abb59403ece..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1640 zcma)6J!~UI6#mvRu_K+p2O=^8q~VZIK*8CK4TrDVi+l(XM+yZf5<<&*cayAQufy&- z~cP;43G)R{SL_=}cU4aPS+aKGa;*(~+_r3RK=IzYv z+`7?zSJO1)D4;x64LaAK0%ph!C={@O5{i)Q1GXF^I;1fb6}Aulh}}Fk&9l=Yvx^#T z11&t9D~+SCy#$=UKcnzy5oa>x@`;9zH3cZ#tzh&(qk*%F;|=X}?h5c#>39RDt0pTb z-`&5Xom7luED|j0hnHsm8LoYIdAR=T<=?ND>~D?RIVNk1=fC;Y z@~88_@Cb82wT$^g@DQfJd=osAWPfD-T?PwM$?(T}&lGka{X8@J`pIzX@6uwSXwGEc z!!cR?)%3`c8LQ3vlS4rDhkK zBflSHUbV0t$Gg$BrKK%D-X3&nPQSOb;dlLDf2rHu>%D3Fp6b*cS8uxYx@Ws}z4>CUt+1PBpp4Kqkvz#%Cczp33)%w_@;YHfFr}V3|E8{^LCi^eD z_xM8Yx;YhJp;pr9dlnj2X``5KPX)8-TINK5C~-ouN&Y7y__qioe~)Z(E;$~Fe@Rws z0?|igC4#6>e@|8-2+?=QN(3Q#L{=gQ(I;fZCJ_A#S&1M-pNdiBJt3r?-^4H{`doA| z{X-@GXA+-i)M9Txh8$zh3ur_)d$HXiZ%1*MT4T1I9V_y`@FL!q_D5?i-|l)&yw-OI zT@SVP%8G6o884Hq_Km>R;e@eKa~wK02Z3XCye&U4;PgY!(ycl=gUzH{hGj*u9mcSt z-PE%7?C!v`T5BSm9aufv4=g|MV~7RIiTh!cj}pIShoOBl_oQJjaFGnkf{AvgN2Wsj zEXI&dm+md;8p-jRvTgqd8k3}@JDk3!@`mQ9WQN=zaqF4Z3&7`@EtJnA`3#Nm6U708 z@+Qk1a?+B+|2=unNeLDhxS9)cUcwdH^6uxbnD program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -. = 0x40400; -buf1 = .; -. += 0x400; -. = 0x44000; -buf0 = .; -. += 0x400; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -. = 0x60400; -buf5 = .; -. += 0x400; -. = 0x64000; -buf4 = .; -. += 0x400; -. = 0x70400; -buf3 = .; -. += 0x400; -. = 0x74000; -buf2 = .; -. 
+= 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_3); diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll deleted file mode 100644 index 9d2e115..0000000 --- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll +++ /dev/null @@ -1,84 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [1024 x float] -@buf9 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_3() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %13, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %12, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %13 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf3, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = call <16 x bfloat> 
@llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> - %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) - %11 = getelementptr float, ptr @buf2, i32 %3 - store <16 x float> %10, ptr %11, align 4 - %12 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -13: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.o deleted file mode 100644 index 6b3d34570ca51b7ee784f20a1a0a4907b4e35d5c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 932 zcmZ`$O-mb56g|(cj1VLWO(V3Bc2z}(q;^vfiB{W6C#48}fQxYwr$tPSjw5I{3JubQ zbUlBddsi;9>MxKkT}JQ+*gsHv?wc8B2zcSXd(S)P-FM%8#|ze5r4(XQFj5FO>j8R6 zzS4ph$RN?m)-{kriOISu0!3Wpa8AVh`mVjEt_Gh2-!ttsTz#kj{l>-NFZDZDF5-5A zGO>wLD?9KnsQw%dK3xvq%x0V)<=|1s#cLs7F@X1~Ga4N4J)K8~O>iC1_p@Td%gX(% z$$p4i-cSnvnJv`LPVp?_!=l2U_lQ2hJl_t{eRn`|w@}^sg0YbYf>R7qX&~PU&p)_6 zPBDG>duJ2N_(JQ&DwC54$>hHuP zSlT!Gc%M#@V_n5R*Jk?ep$~JX>AU8;kIR$$zEj_ZWvx`_x15Gs_p8m#gN6%p-}ii{ z#(4BqV(sAb6im-;IJ$yaZ+fm>vZsW23DcUJE7@@@6Wi9p&So-&==KN07@4Sbt$D(J zcT;LUL7PPJ==rGV-DHt@h`phA>tw#M#kv`8$e1b-(V=FVxzv|_gv-)p#Hr)hNP`qt TW*(24sn(q^ioBmL_2lmVgu7{C diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll deleted file mode 100644 index f2c89be..0000000 --- 
a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll +++ /dev/null @@ -1,65 +0,0 @@ -; ModuleID = 'air_project/square_kernel_0_core_0_3.peanohack.ll' -source_filename = "LLVMDialectModule" -target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" -target triple = "aie2p" - -@buf2 = external local_unnamed_addr global [256 x float] -@buf3 = external local_unnamed_addr global [256 x float] - -; Function Attrs: nounwind -declare void @llvm.aie2p.acquire(i32, i32) #0 - -; Function Attrs: nounwind -declare void @llvm.aie2p.release(i32, i32) #0 - -; Function Attrs: nounwind memory(inaccessiblemem: write) -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1 - -; Function Attrs: noreturn nounwind -define void @core_0_3() local_unnamed_addr #2 { - tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %13, %0 - tail call void @llvm.aie2p.acquire(i32 49, i32 -1) - tail call void @llvm.aie2p.acquire(i32 50, i32 -1) - br label %2 - -2: ; preds = %1, %2 - %3 = phi i32 [ 0, %1 ], [ %11, %2 ] - %4 = trunc nuw i32 %3 to i20 - %5 = getelementptr float, ptr @buf3, i20 %4 - %6 = load <16 x float>, ptr %5, align 64 - %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) - %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> - %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60) - %10 = getelementptr float, ptr @buf2, i20 %4 - store <16 x float> %9, ptr %10, align 64 - %11 = add nuw nsw i32 %3, 16 - %12 = icmp ult i32 %3, 240 - br i1 %12, label %2, label %13, !llvm.loop !1 - -13: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: nofree nounwind memory(inaccessiblemem: read) -declare <16 x bfloat> 
@llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 - -; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 - -attributes #0 = { nounwind } -attributes #1 = { nounwind memory(inaccessiblemem: write) } -attributes #2 = { noreturn nounwind } -attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } -attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll deleted file mode 100644 index ed78c15..0000000 --- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll +++ /dev/null @@ -1,84 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [1024 x float] -@buf9 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic 
-declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_3() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %13, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %12, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %13 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf3, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> - %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) - %11 = getelementptr float, ptr @buf2, i32 %3 - store <16 x float> %10, ptr %11 - %12 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -13: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.elf deleted file mode 100755 index b06bbc2edc2ab1b34e0e4e5e00fa2e23d127e11e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1640 zcma)6O>7%Q6#mvRu_IOBf{2U?(oiH6P;l0#(~{ zQ!Wi63LFp@F6MyLOC^pLhaPf@)Lub7=7Q8jATB+$w;mATd;4QsocN@f?|tvRnRz=i zJ9lohKF~A`8FDC&RGrSX7l0{p9SS+jp@2Ljd!H@CkPc}~MTza>KO-lLP4e`lNbS6a 
zyFe3A5AGdoyVz6xp=JMrltUe`y~t?X*6(pezdNg%v=V(E*!1nWW`_w z#ru2rwBu4W9*Y?B`oV?ie+H}HUmUFccJYtv1?$`D;MF9S_2pTk2z;nLer<5)i#M9a zDZi$v&R8(l=Zu};f~=p^LwyjuIH4b1FSoxt#G7e6nN;G#oQxl9jz3BitqkL}#q;0% za`E$7U~q^Tpj^c40eA=#V7>vKNxVNY{|oh!|@@Y{NsZ{8FO-=#6t~ueh#!ef7=bNsaC70s*Q?X(E+<-`I~Cj z4MMNyr(QL;6-7JYwS|REFWTz2D|WBDu{-qTc|1FRg=%f&(Qt|O-3k2`?b2wFhVlN( z?mat~yKYRxm#LLB`d)^H723!r+f%}HvX(K{pGcfgY?A+(2>vYs$=@X#oJ)>J;$M*! zn?Up-S&1M@)IX4w2txEdvJydv9+H&^Li90Nu?a-~N>(BW(I;Z$c~1zb=XWuTu|5-> zPySGe{}so_8kNZ1iy+6?b$uG)?QUeX$y;F*B-V&6d)o}XFWr#$rS<7*)3Z9R9j*49 ze#b?nwY;pG)sz>>X6uIU=&*yRTCr_9Hu}D8w%tw7ufpyHuBn@KwEG+HOIL?!hLIIS zFvFe1GIy;`-!+@7BJFLMUCZ-L&-WsT1=EgtL70sazi9=5bvyGUVb^yM4@!fvb|yzA zLj0_wCf%ZROVVA5kJpfG`#;c_BrVzD program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -. = 0x40400; -buf3 = .; -. += 0x400; -. = 0x44000; -buf2 = .; -. += 0x400; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -. = 0x60400; -buf7 = .; -. += 0x400; -. = 0x64000; -buf6 = .; -. += 0x400; -. = 0x70400; -buf5 = .; -. += 0x400; -. = 0x74000; -buf4 = .; -. 
+= 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_4); diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll deleted file mode 100644 index cfa104c..0000000 --- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll +++ /dev/null @@ -1,84 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [1024 x float] -@buf9 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_4() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %13, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %12, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %13 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf5, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = call <16 x bfloat> 
@llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> - %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) - %11 = getelementptr float, ptr @buf4, i32 %3 - store <16 x float> %10, ptr %11, align 4 - %12 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -13: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.o deleted file mode 100644 index 6afdc2e5495148f08a97bed4dbec3859aee95823..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 932 zcmZ`$O-mb56g|(cj1WpJ7*nv2c2z}(MC?KkiC?YIiF8vHx)>*MTEx_};|S_Tp+Q`T z>-huSyK<3Le?jTeWrY3!{R7%_-^@5ezzg@?d)_(kzWeSwUbfyTr4XBfkwU!IOCshs5A6+gJ@^{atR$a!F532&x#E%EBCV| z`yp<5&rgPD z#q{Cs!6ugRh1QF8CMOY+$$w>jo5HJ6UgQLxlb?CypH7iuUBy4wX8P`d4|BKayXO1VPm_DTQ{RDQtyPz{orYWYtIe(bh6{7g_k5?u zc=%0XZU4(#S*BrnZo|=K%zD#v?UKzsYhj^ePerlJXj{v>Tgeon+n)?$WTMu!W|aMY zPpSC?Z4$+!=cA%`lSSqswnFdL$$Vprb&K4PF;ya>L(MF6sW1Hqm!->yQ^&EH1}U!0 TJRUVutvg{9c|Tq1$=&|}aJy+@ diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll deleted file mode 100644 index a653490..0000000 --- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll +++ /dev/null @@ -1,65 +0,0 @@ -; ModuleID = 'air_project/square_kernel_0_core_0_4.peanohack.ll' 
-source_filename = "LLVMDialectModule" -target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" -target triple = "aie2p" - -@buf4 = external local_unnamed_addr global [256 x float] -@buf5 = external local_unnamed_addr global [256 x float] - -; Function Attrs: nounwind -declare void @llvm.aie2p.acquire(i32, i32) #0 - -; Function Attrs: nounwind -declare void @llvm.aie2p.release(i32, i32) #0 - -; Function Attrs: nounwind memory(inaccessiblemem: write) -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1 - -; Function Attrs: noreturn nounwind -define void @core_0_4() local_unnamed_addr #2 { - tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %13, %0 - tail call void @llvm.aie2p.acquire(i32 49, i32 -1) - tail call void @llvm.aie2p.acquire(i32 50, i32 -1) - br label %2 - -2: ; preds = %1, %2 - %3 = phi i32 [ 0, %1 ], [ %11, %2 ] - %4 = trunc nuw i32 %3 to i20 - %5 = getelementptr float, ptr @buf5, i20 %4 - %6 = load <16 x float>, ptr %5, align 64 - %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) - %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> - %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60) - %10 = getelementptr float, ptr @buf4, i20 %4 - store <16 x float> %9, ptr %10, align 64 - %11 = add nuw nsw i32 %3, 16 - %12 = icmp ult i32 %3, 240 - br i1 %12, label %2, label %13, !llvm.loop !1 - -13: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: nofree nounwind memory(inaccessiblemem: read) -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 - -; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) -declare <16 x float> 
@llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 - -attributes #0 = { nounwind } -attributes #1 = { nounwind memory(inaccessiblemem: write) } -attributes #2 = { noreturn nounwind } -attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } -attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll deleted file mode 100644 index 520a891..0000000 --- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll +++ /dev/null @@ -1,84 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [1024 x float] -@buf9 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_4() { - call void @llvm.aie2p.set.ctrl.reg(i32 
9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %13, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %12, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %13 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf5, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> - %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) - %11 = getelementptr float, ptr @buf4, i32 %3 - store <16 x float> %10, ptr %11 - %12 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -13: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.elf deleted file mode 100755 index 9b231c6aa9ecc51f66997deeb1a18cef3b26ae8b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1600 zcma)6L2MgE6n*QM*pVu5sfdgKX<8%{P;l0@X>c!WBNrj+NaX;EgwS@qyGd5D*I~U5 zDVGKj1rCS{7jr=Br4mPrLk~GcYOkOkb3tk%xVN_+5aIpZ9oy=OC(XY9{{J)ccmD3& zx!(HFFbrtqP#mi|oeM7jGvqoHa#%nCc}VsihsKBw8B9fqNM3%gKWZ-i{0Sfm^7(Fs*;NwZ zHcfTPg1J9e>_q2e|Fja%aSF_jfayap#1ZLLKzFvP~ss^2( zkT!MwkVgM@H+I_O&3+uE)|ef4%kBqX`+a^7teGs~K+usPnD%@V=+ot^<+Jp6Fd0c^Q_hToDVfVLF%ieK11K)10 
zigdSNcby=#gD{987Hl`}MSVR>{I(NC&TZ{U!+z)?nUn<+?M=^2h4@)VQc*o=nk9$z z|6njphQ90eXv^KGQOXSYGQ@3W+ROo8WwwxBNAejO<5!AX49ZZ9 k8CceWtV_5=Tkdm>514nV1zG34li7P&>>@XmJ#}33e}uC6SpWb4 diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script deleted file mode 100644 index 818260c..0000000 --- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script +++ /dev/null @@ -1,66 +0,0 @@ - -MEMORY -{ - program (RX) : ORIGIN = 0, LENGTH = 0x0020000 - data (!RX) : ORIGIN = 0x74400, LENGTH = 0xBC00 -} -ENTRY(__start) -SECTIONS -{ - . = 0x0; - .text : { - /* the __start symbol has to come at address zero. */ - *crt0.o(.text*) - _ctors_start = .; - _init_array_start = .; - KEEP(SORT(*.init_array)) - _ctors_end = .; - _init_array_end = .; - _dtors_start = .; - _dtors_end = .; - *(.text*) - } > program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -. = 0x40400; -buf5 = .; -. += 0x400; -. = 0x44000; -buf4 = .; -. += 0x400; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -/* No tile with memory exists to the north. */ -. = 0x60000; -. += 0x10000; -. = 0x70400; -buf7 = .; -. += 0x400; -. = 0x74000; -buf6 = .; -. 
+= 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_5); diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll deleted file mode 100644 index 3e15d3e..0000000 --- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll +++ /dev/null @@ -1,84 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [1024 x float] -@buf9 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_5() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %13, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %12, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %13 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf7, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = call <16 x bfloat> 
@llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> - %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) - %11 = getelementptr float, ptr @buf6, i32 %3 - store <16 x float> %10, ptr %11, align 4 - %12 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -13: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.o deleted file mode 100644 index cb309eb89ecceee56448345389d9d7a76543e35e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 932 zcmZ`$O-mb56g|(cj1Wo`OjBqfx>C_0DQ!h4C0cC-C(=z@!NoX9rlmABI*wqwQD~4Z z#P$3E_pV%I)n8Ctx{S~tVE;h%+&44M5b(l%_nvpoyYIgH4wkJqN-4yqV5AUm+z0fN ze5M7H$RN?m))kOLiOGg40!5tXa7M)Z>ZY@|nSpmDx`s($CnMO@EN zCN@)QXNUd=)svCn-Jg-a3mNBEIT#DMcrD~B2C$;KkAj1pCrjwE39du>epYOHS-GDz z*$;8sdyv8x3x(S85ym4vEGqn2pXd|J^PLdgbq6GO3)QPH7@K(@IKnWM2J-Fj{JrbL z6f=N-H#V`1FSLGqWO5iGnfzzw*D1UfGc1Cm#X4>cIl1I43dTXEQp5&oC_? 
zOZ!2e9?&UrtgHCv+DzZw_hD|eeAj%t_I`HPcj{kZSs$uPU!0~}_p7bOUekrS>wCUa zWBmS2Vr}o!oGdS3dT!IvWz2fZbM2D-9HzCnShAl+vCL>&%Ug|P3eoNNhAA>p>ss@e z{cfk!e1bNK;?eU_(Ywha^AKC1_v&Q6vBkRA+>kL1m> VT$y=1YNlFu$SCqby3~`qZvl4{X=eZc diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll deleted file mode 100644 index bccc4ff..0000000 --- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll +++ /dev/null @@ -1,65 +0,0 @@ -; ModuleID = 'air_project/square_kernel_0_core_0_5.peanohack.ll' -source_filename = "LLVMDialectModule" -target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" -target triple = "aie2p" - -@buf6 = external local_unnamed_addr global [256 x float] -@buf7 = external local_unnamed_addr global [256 x float] - -; Function Attrs: nounwind -declare void @llvm.aie2p.acquire(i32, i32) #0 - -; Function Attrs: nounwind -declare void @llvm.aie2p.release(i32, i32) #0 - -; Function Attrs: nounwind memory(inaccessiblemem: write) -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1 - -; Function Attrs: noreturn nounwind -define void @core_0_5() local_unnamed_addr #2 { - tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %13, %0 - tail call void @llvm.aie2p.acquire(i32 49, i32 -1) - tail call void @llvm.aie2p.acquire(i32 50, i32 -1) - br label %2 - -2: ; preds = %1, %2 - %3 = phi i32 [ 0, %1 ], [ %11, %2 ] - %4 = trunc nuw i32 %3 to i20 - %5 = getelementptr float, ptr @buf7, i20 %4 - %6 = load <16 x float>, ptr %5, align 64 - %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6) - %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> - %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60) - %10 = getelementptr float, ptr @buf6, i20 
%4 - store <16 x float> %9, ptr %10, align 64 - %11 = add nuw nsw i32 %3, 16 - %12 = icmp ult i32 %3, 240 - br i1 %12, label %2, label %13, !llvm.loop !1 - -13: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: nofree nounwind memory(inaccessiblemem: read) -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3 - -; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4 - -attributes #0 = { nounwind } -attributes #1 = { nounwind memory(inaccessiblemem: write) } -attributes #2 = { noreturn nounwind } -attributes #3 = { nofree nounwind memory(inaccessiblemem: read) } -attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll deleted file mode 100644 index d8f77fa..0000000 --- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll +++ /dev/null @@ -1,84 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [1024 x float] -@buf9 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void 
@llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_5() { - call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1) - call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0) - br label %1 - -1: ; preds = %13, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %12, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %13 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf7, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7) - %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> - %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60) - %11 = getelementptr float, ptr @buf6, i32 %3 - store <16 x float> %10, ptr %11 - %12 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -13: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) - -; Unknown intrinsic -declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_design.bif 
b/examples/elementwise_arith/air_project/square_kernel_0_design.bif deleted file mode 100644 index 6e94022..0000000 --- a/examples/elementwise_arith/air_project/square_kernel_0_design.bif +++ /dev/null @@ -1,10 +0,0 @@ -all: -{ - id_code = 0x14ca8093 - extended_id_code = 0x01 - image - { - name=aie_image, id=0x1c000000 - { type=cdo file=air_project/square_kernel_0_aie_cdo_elfs.bin file=air_project/square_kernel_0_aie_cdo_init.bin file=air_project/square_kernel_0_aie_cdo_enable.bin } - } -} diff --git a/examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin deleted file mode 100644 index 97e175b81722f66b4d8fb9099cc8f7fbe11452c6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2288 zcmcJP!41MN3`Lz%A#QNszyXOB7@?zu%{)TIB#gjM>*D?kas#o1VyE%<&jTu{-yf35 zAR@1W2+}#mB=?e?toB8bc2;%|B-1(DSe73B-{SoA=NBCORAcIXoI82@=$VP#V&<7< z??-IhM6Sy|)_!>=l8dK|&wPETrL^|wYz7?iP^8=03@%Wz5F74Mv_~Hx|ia@+u5*2@O@J?m)Y3dW^8hm$JK8JDQm>oa8L9*VK%1G UjE(+*4Qs^Ma8Df@)A$FQHy}r3Z2$lO diff --git a/examples/elementwise_arith/air_project/sub_kernel_0.pdi b/examples/elementwise_arith/air_project/sub_kernel_0.pdi deleted file mode 100644 index cad10284470b3236f231f480e9d2396b90ba8f55..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 7792 zcmeHM&ubiY6n`^2Nw&IeH#L^ANpac)2_=V3H$n=5&UT}$5^-xP1UxhqYAq;)UfN@n zfJHz{nynrmyngkaH-Gy6+Z#jydEmzI;`b@Mczw3~k)-B1waJ z-+x)xPW*6_&y)KvT)uez^7|JqU33md9L%CG#!nq>3=qAH7r)E3ZPq!E{Y1VzL)!fx z-1&3+uK*6}v#;X>q8ehjdaJcPh{JrZKopIpxv^Z{JjQ1F1v$*gl#`1>9`tku?7Ppq zV!}R7fk)qoeL}5xv(-tRoeo!zq~oAXDs!Xl_{q$3OPGpC!fsMV=-U4|8`zN({C%=d9bpb z2P;!l&&^jYH(yzqHnVb3$l3Wi^JKmri}}+1w(}C2ep}hjgO%+(Sec^v+0(S)75;!qf=U2^KGki~w&2LiV=WvubToE`HxGC_W zz)J!z3%n}uy1*@g+X8n4-V!)5Sm#&I<)>xWznaL;;V5yqB5*8lQ{Y8`mjqrGcvawa zfm;H%1?~vEC2(S}&Tl@KpO#&IXGMMvM~TA~fn$N20xt@@B=EAps{*eJ+!DAga7W-R 
zffIqtG~)AjC3UR<+q}T@-L~ajjx-Voo}sd7V&*-AV{t811Rn{$EO-)p6SL*~WUm3s zM`9Ms!kz@*L=UZfEP7iJd?fg?;7RaJ)XMqE_SEu`tf#Oi!w>T3m|6DwVxL#kOb0$a zGJ>_jH^S3Fzuaf-BV`{uLxX-h9~{`tKGyzKpX8=Lz6uU=&xi1t&5LKfY_QEMfXrB3 z2RU5{GH1m){#@t9wmb8y?J+;LkJ(*j_0CWEe@K4NW9j@f_8@ca0q%zw>-@61lD#)p z`8nOr{3iF9AKS+`HM@G}r~E%8Kj^V^ej2;{^tGh(%j!z@yQRv{>2~IKY>)Y|eXL@V ze|qPq{68c==&^Ku8hem+|MX{$&M&Jg*=IB=Kc{o~;bDVXM*{UhM^-kpd5a&^Rt9lk zA3vPHiU{_QiH`7}9jmq-e<1CtVC=n*)vo-8ynSr_`zP|IO3vFy)_>t#0NM72gl?Dm`+Kavwtw{cYx_s9zqWt$`s0I~EB3GTSO0{~{3~p0 zjQabc{uv$Wubxr!KGZ*>L;ZbG|BMdx_n9!)R7N+G6sV7Andf4ti|5e>7s0Nrm?kGYt@q-Zwf_LXSeh9qt@AyhG>WjXoGdvC6 z`58_58StX-DL(~X^gZR9;GZ`2ndmw{3*hksjjCHi@DoW@#8XZEjDdG`c0A@m#N+rT zcz3SjBk<|`cO5^G)PLlLgPU!aEGtY;YTd>QC( z#Koe8`bAN-58uo2A`t(u53av<7vf=Kr535GwXmOu>Q5EFH0M=+eIME% zHXpNnry@TdPbXV0|C632+|1+^}`q6#!Vh)fSzv*3v zz^XbWH(*{#`4K>~H2LiYg&ox+{&oJuX$=15THkKsPq$vlaig4hADuDzQvKg%$yXQ2 zP#}MvdlIWAdLe+DVd9N-L>(P>gfr_1XVwwU5U$&?Jk$%i<&lHR#iU1?AtTRcEb(t` z>fglpr#;{fNi%cKtQ(wJH#kFB&G4_1;a{4en`S;^iGQ0@|0c#iJ#+4mG&ASSy1|)s qgENHhGW>g+;a{4en`S;^iGRCO|0c#iJ#+4mG&ASSy1|)sL$P~=DwMGR diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_enable.bin deleted file mode 100644 index 7cc1818bce4d6ce1226fc5fda519967a4842b99e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 104 zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>)) B1+xGE diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_init.bin deleted file mode 100644 index d4549ba181167c6e6d7475f23335e34fea2c3376..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6032 zcmb7{&rak<5XQ@%hINw3u1Uy>EJ*QcI7AT#M?)kINHH2BaX`vMPg)mN>ymUudP@3&^d;#l($}PKNY6;mN#BybCv6+u_35mqPjyW9=@;sg z)`iu&Bwdl7lD;5)N&1TPHR&7DGtzU?x1{e$Thh7gr0z}C-Y@95Kj1=K_e*vRasT2A 
z`KGw%NtC~$_drR!Af6K!;#1rYonOBXO5z3Gubj9LpW;_N?bzRq9a$@;UpUZ4Mwdf~w*)RN!T zTGw0O+2D~L&k!+PZVspqp^>qxyLCZUYYv{=)g)M&yk)HtOi1`aSl`pz$Z~{VHC> z{wm~8zhkS{>@Pz8!5=I0ho0@PV=wrGo|TPu3xmfX=5+8#k7tP3R^-WgJe!Ecz0Z3* zJBWQ9<=Kinq2IpGzOohbc{t*JyYq{${(e{F$2@C)+CR@F|Df@&=r{W@PpQw>ig|vl z-i6*{KG=BId(6k{cs_@S#l1i1@f;%-&#>z8oFW$Y{-no)v!Lz|&O*DNXg+CuAD%p) z+W++V)c&W>r}jU6KIt>&-QrcKx@>wrfeNOYm3=@d_NZ zw|+x>4;)3bJR^PpjtW{nB))G4Db?$G()v+vO7%KEZ|nOoNU7cwf84d}i)35nL%m9) zUZqj5n{~S$^(u~f-3(ifdKE{#Db=fZZU-ih)>n&xd&hVAz^Pux=TW^Tu6I|q)q2#U zHTbJ2{~>V${xaeR#53^EBfd|32(IcOZtO_bqxD;h5z6W6t-#g!Bz^*}&xc-r46f!) z>of4z(euuUpMtA-6E}8D`NwMi``~*2njeCH5^6!G{ z`D%{yOZ(S+3a%8O|3dRUJE8m&wZAQJonLdDk5s*%7Mkzakr_t)DHkL4{@Vhl z{wcl*uJh~l1-L$+%*XWnG36iY_v1oy)KAZ!`1S4I2o|Ro1qz~3EajN7i78>jEaZQL C^4EL- diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.elf deleted file mode 100755 index fbf768003c7ae9be77deae967dc2a77b06391f68..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1656 zcma)6PiPcp6o0eX?xv;HHDX!N!iR!HiJO`1+GM?S5;hX*2GWBP>R~$nk__3MU1ny} zSPw2mYC-9(ussyK2;wOe6v-)&crBs#LK0fMct~#^O4Hw)Kf9xN@CV<#_j~Vqe`enI z&7Il$RY{U4L57C=tODaV9}o>eR!C5WMk!BO;@w@W1Y0oh!d5C^ef4)>CBC%GrAe%3 zC3;L$qrDIFeed}*L^CJrw0HEZKpVRfL6xn zqJ!bh^+%FgP!JiRI6U)dq>UA_pXC1Q&Hr?wxA6Okzozs0k4oW7gkLqtp<+vXD%}E~-6{jBJz3vi50u-kxeoO3E2{Q`67cy_Q;m&Z4-mlxoGws9Q zhBofK?k)eDAIoIbq4@m}6X!2RBjlu1tnLhkLd9RU^FLS z{b7oqgI?$dNj!%>IG)e`Ldrh_F0f?SMsQ%RL8ol=c?y(^YN-Q$o&DVb|C%`5Y!9)#jO+tE4q`$JHFy35Ye)%%J7@Nfg^P-ShNYi6wH| zu0>%`> Lf?PAnlgR%8Kh^|F diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script deleted file mode 100644 index fc4f0cf..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script +++ /dev/null @@ -1,72 +0,0 @@ - -MEMORY -{ - program (RX) : ORIGIN = 0, LENGTH = 0x0020000 - data (!RX) : ORIGIN = 
0x78400, LENGTH = 0x7C00 -} -ENTRY(__start) -SECTIONS -{ - . = 0x0; - .text : { - /* the __start symbol has to come at address zero. */ - *crt0.o(.text*) - _ctors_start = .; - _init_array_start = .; - KEEP(SORT(*.init_array)) - _ctors_end = .; - _init_array_end = .; - _dtors_start = .; - _dtors_end = .; - *(.text*) - } > program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -/* No tile with memory exists to the south. */ -. = 0x40000; -. += 0x10000; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -. = 0x60400; -buf5 = .; -. += 0x400; -. = 0x64000; -buf4 = .; -. += 0x400; -. = 0x68000; -buf3 = .; -. += 0x400; -. = 0x70400; -buf2 = .; -. += 0x400; -. = 0x74000; -buf1 = .; -. += 0x400; -. = 0x78000; -buf0 = .; -. += 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_2); diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll deleted file mode 100644 index 906e39c..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 
x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_2() { - br label %1 - -1: ; preds = %22, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %21, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %22 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf2, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = getelementptr float, ptr @buf1, i32 %3 - %9 = load <16 x float>, ptr %8, align 4 - %10 = bitcast <16 x float> %7 to <8 x i64> - %11 = bitcast <16 x float> %9 to <8 x i64> - %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> - %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> - %14 = bitcast <32 x i64> %12 to <64 x float> - %15 = bitcast <32 x i64> %13 to <64 x float> - %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) - %17 = bitcast <64 x float> %16 to <32 x i64> - %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> - %19 = bitcast <8 x i64> %18 to <16 x float> - %20 = getelementptr float, ptr @buf0, i32 %3 - store <16 x float> %19, ptr %20, align 4 - %21 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -22: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, 
i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.o deleted file mode 100644 index c01f0bbb18a2b8aef10a542b3c15d504b366c5fe..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 984 zcmaJnfSQGt;;NrR#&%IRn#NWmOhg2w)J2#i(<+@dWjcZCMv*RD zh`14E>8AUxTx8W>ApV33{($xm7|(t4l1#Jk!kv50J9qBA@6NnmcOEIF&^Lu#f&r%~ zV2t9jPMAR!!(2J~4dhWGc%`O+v)+$9PRNK=s!K((KW4id>UFgYjOV(C$EuvS&lh+u zR?Jmy#AqR(zMABo69>I(0* z@}2iOGR2E4Ft;=ZgK%?zfHW?8b#hr}Y-4y%@WC+o57G0ozZ=lJC0X9eY z_6Qd`%RP!o{$f@`++vu|#W-Y`lb?Y0jy{*x5L4`__+#bUiwB|G z+=1h~tgXFqeXkkT+O3Z7!8!BAoXP)(I)7!_Wlfn*iVv#W)ri-z$EW0nWT@?rJl_A4}x%RasU7T diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll deleted file mode 100644 index 1f9925e..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll +++ /dev/null @@ -1,64 +0,0 @@ -; ModuleID = 'air_project/sub_kernel_0_core_0_2.peanohack.ll' -source_filename = "LLVMDialectModule" -target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" -target triple = "aie2p" - -@buf0 = external local_unnamed_addr global [256 x float] -@buf1 = external local_unnamed_addr global [256 x float] -@buf2 = external local_unnamed_addr global [256 x float] - -; Function Attrs: nounwind -declare void @llvm.aie2p.acquire(i32, i32) #0 - -; Function Attrs: nounwind -declare void @llvm.aie2p.release(i32, i32) #0 - -; Function Attrs: noreturn nounwind -define void @core_0_2() local_unnamed_addr #1 { - br label %1 - 
-1: ; preds = %19, %0 - tail call void @llvm.aie2p.acquire(i32 49, i32 -1) - tail call void @llvm.aie2p.acquire(i32 50, i32 -1) - tail call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %1, %2 - %3 = phi i32 [ 0, %1 ], [ %17, %2 ] - %4 = trunc nuw i32 %3 to i20 - %5 = getelementptr float, ptr @buf2, i20 %4 - %6 = load <8 x i64>, ptr %5, align 64 - %7 = getelementptr float, ptr @buf1, i20 %4 - %8 = load <8 x i64>, ptr %7, align 64 - %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> - %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> - %11 = bitcast <32 x i64> %9 to <64 x float> - %12 = bitcast <32 x i64> %10 to <64 x float> - %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60) - %14 = bitcast <64 x float> %13 to <32 x i64> - %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> - %16 = getelementptr float, ptr @buf0, i20 %4 - store <8 x i64> %15, ptr %16, align 64 - %17 = add nuw nsw i32 %3, 16 - %18 = icmp ult i32 %3, 240 - br i1 %18, label %2, label %19, !llvm.loop !1 - -19: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 53, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) -declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2 - -attributes #0 = { nounwind } -attributes #1 = { noreturn nounwind } -attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll deleted file mode 
100644 index d91a003..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_2() { - br label %1 - -1: ; preds = %22, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %21, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %22 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf2, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = getelementptr float, ptr @buf1, i32 %3 - %9 = load <16 x float>, ptr %8 - %10 = bitcast <16 x float> %7 to <8 x i64> - %11 = 
bitcast <16 x float> %9 to <8 x i64> - %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> - %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> - %14 = bitcast <32 x i64> %12 to <64 x float> - %15 = bitcast <32 x i64> %13 to <64 x float> - %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) - %17 = bitcast <64 x float> %16 to <32 x i64> - %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> - %19 = bitcast <8 x i64> %18 to <16 x float> - %20 = getelementptr float, ptr @buf0, i32 %3 - store <16 x float> %19, ptr %20 - %21 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -22: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.elf deleted file mode 100755 index e4e226b260243a140d2d00b735618366285dcca0..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1720 zcma)7O^6$17=9-klU4jlS1bV)J}Ss6Yj$Rnb~n8^T~aBUh4rA6_ApFlW;eqolQ5aA zTRj-0sGxWgq9E=?5KoJs$etEjuTs1V?V@<;p}j~!aebcoNunno_`dgj-k^KcevyVR==t#k=rNVm*n7viTf_5hLC*|lr$T5G0FQ|WUtcO11 zURj%g|Me&JkF;u95B(OjM?}hN`*^PFf+GRSYmz=C;!VLB#ruVo5BqTESY!OfjVE&b zd*99-eDGkr{a0}*mpA9q=R;1qemOZpE-K~v&FP^?`Li3vvMlm|ft#AhgAPQ3sS3EY)4UlaDI7t+mG7AR>kRcmoNJrKiFIDbauNh z*}kV*RmauqZnf&!ZdI?JtF5`7v+CB|s?n;pYK~F&+X*`Dsn9sYvIpbdkR~R~M3%D?^L6>~8JtZk5wah7hgZ|N!Um*t+ z2J>4ac$*03zXQp7JTCg*hg28@`A3k71So-zAr%Qg{tcue0m#3DR3rfTkC2K4ApaRs zkpSesP|4%o0PN?OiaF)KlaHqSE_ptAqZxmn+??`1$VXEymB`zRgvZ|X0w{xP-No6X!34Q=MfRo>N-df%S#^8Gq{~sV? 
nwBcCI2xQj68R-0vWcU(se`W+Si+eHEN2%{~6lBfJpUHm!Ajl1> diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script deleted file mode 100644 index 6120a88..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script +++ /dev/null @@ -1,78 +0,0 @@ - -MEMORY -{ - program (RX) : ORIGIN = 0, LENGTH = 0x0020000 - data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 -} -ENTRY(__start) -SECTIONS -{ - . = 0x0; - .text : { - /* the __start symbol has to come at address zero. */ - *crt0.o(.text*) - _ctors_start = .; - _init_array_start = .; - KEEP(SORT(*.init_array)) - _ctors_end = .; - _init_array_end = .; - _dtors_start = .; - _dtors_end = .; - *(.text*) - } > program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -. = 0x40400; -buf2 = .; -. += 0x400; -. = 0x44000; -buf1 = .; -. += 0x400; -. = 0x48000; -buf0 = .; -. += 0x400; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -. = 0x60400; -buf8 = .; -. += 0x400; -. = 0x64000; -buf7 = .; -. += 0x400; -. = 0x68000; -buf6 = .; -. += 0x400; -. = 0x70400; -buf5 = .; -. += 0x400; -. = 0x74000; -buf4 = .; -. += 0x400; -. = 0x78000; -buf3 = .; -. 
+= 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_3); diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll deleted file mode 100644 index ba863ab..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_3() { - br label %1 - -1: ; preds = %22, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %21, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %22 - -5: ; preds = 
%2 - %6 = getelementptr float, ptr @buf5, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = getelementptr float, ptr @buf4, i32 %3 - %9 = load <16 x float>, ptr %8, align 4 - %10 = bitcast <16 x float> %7 to <8 x i64> - %11 = bitcast <16 x float> %9 to <8 x i64> - %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> - %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> - %14 = bitcast <32 x i64> %12 to <64 x float> - %15 = bitcast <32 x i64> %13 to <64 x float> - %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) - %17 = bitcast <64 x float> %16 to <32 x i64> - %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> - %19 = bitcast <8 x i64> %18 to <16 x float> - %20 = getelementptr float, ptr @buf3, i32 %3 - store <16 x float> %19, ptr %20, align 4 - %21 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -22: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.o deleted file mode 100644 index d5d447390e239adbd0677385ea8f16d9e12c9d10..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 984 zcmaJ<&ubG=5T2K&NsuaOK%0X~@zkG7Vk@*4O=GJ$Y($WjfyJoQ3IK1_q{v10XI9R@fK{S{2(v$@JDX~|QW6=>bPi5cDO^%mIl7kBX!&lfP=n7d zmkYJW#ui~~_S$Z9uqHFYO=cQFa|NAN!QAcG2e^tBTB=!zl(ctna7UCXbU05xb{&m& uNmzk$T_va%^resWIS*rFP_Df<1EB21Ni0?bMv1;K?>iXNN1?388Lt3R{cdRh diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll 
b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll deleted file mode 100644 index ddb3226..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll +++ /dev/null @@ -1,64 +0,0 @@ -; ModuleID = 'air_project/sub_kernel_0_core_0_3.peanohack.ll' -source_filename = "LLVMDialectModule" -target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" -target triple = "aie2p" - -@buf3 = external local_unnamed_addr global [256 x float] -@buf4 = external local_unnamed_addr global [256 x float] -@buf5 = external local_unnamed_addr global [256 x float] - -; Function Attrs: nounwind -declare void @llvm.aie2p.acquire(i32, i32) #0 - -; Function Attrs: nounwind -declare void @llvm.aie2p.release(i32, i32) #0 - -; Function Attrs: noreturn nounwind -define void @core_0_3() local_unnamed_addr #1 { - br label %1 - -1: ; preds = %19, %0 - tail call void @llvm.aie2p.acquire(i32 49, i32 -1) - tail call void @llvm.aie2p.acquire(i32 50, i32 -1) - tail call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %1, %2 - %3 = phi i32 [ 0, %1 ], [ %17, %2 ] - %4 = trunc nuw i32 %3 to i20 - %5 = getelementptr float, ptr @buf5, i20 %4 - %6 = load <8 x i64>, ptr %5, align 64 - %7 = getelementptr float, ptr @buf4, i20 %4 - %8 = load <8 x i64>, ptr %7, align 64 - %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> - %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> - %11 = bitcast <32 x i64> %9 to <64 x float> - %12 = bitcast <32 x i64> %10 to <64 x float> - %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60) - %14 = bitcast <64 x float> %13 to <32 x i64> - %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> - %16 = getelementptr float, ptr @buf3, i20 %4 - store <8 x i64> %15, ptr %16, align 64 - %17 = add nuw nsw i32 %3, 16 - %18 = icmp ult i32 %3, 240 - br i1 %18, label %2, label %19, !llvm.loop !1 - 
-19: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 53, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) -declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2 - -attributes #0 = { nounwind } -attributes #1 = { noreturn nounwind } -attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll deleted file mode 100644 index 8b8d6a6..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void 
@llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_3() { - br label %1 - -1: ; preds = %22, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %21, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %22 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf5, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = getelementptr float, ptr @buf4, i32 %3 - %9 = load <16 x float>, ptr %8 - %10 = bitcast <16 x float> %7 to <8 x i64> - %11 = bitcast <16 x float> %9 to <8 x i64> - %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> - %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> - %14 = bitcast <32 x i64> %12 to <64 x float> - %15 = bitcast <32 x i64> %13 to <64 x float> - %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) - %17 = bitcast <64 x float> %16 to <32 x i64> - %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> - %19 = bitcast <8 x i64> %18 to <16 x float> - %20 = getelementptr float, ptr @buf3, i32 %3 - store <16 x float> %19, ptr %20 - %21 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -22: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git 
a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.elf deleted file mode 100755 index cfc7551935cf7c8534f551f4b481c91b57b4768e..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1724 zcma)7O=uKn7=C9n-Hp_GM9|I@J?14(VD$@jU;;4z9a1m>nEweWm;lWGffP&t<_{qS z6M*@DtjM@G0Qa-aiZ8!Y(N>*JJ$3yqtS8n>H<@1mGq|Jdb;bm zA+bif6}J2!&0@Q5_`Y!|wZv}6vnfhSoDsK&Gh@Pb^l_4>sJ9Ypsv6A-&z0ytXuOZ_ z2?=pL;$6q~L*HUUg5=eWZXKKPsz%TCjCMY;~h!_~*%ARBPOV diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script deleted file mode 100644 index ddda3c2..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script +++ /dev/null @@ -1,78 +0,0 @@ - -MEMORY -{ - program (RX) : ORIGIN = 0, LENGTH = 0x0020000 - data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00 -} -ENTRY(__start) -SECTIONS -{ - . = 0x0; - .text : { - /* the __start symbol has to come at address zero. */ - *crt0.o(.text*) - _ctors_start = .; - _init_array_start = .; - KEEP(SORT(*.init_array)) - _ctors_end = .; - _init_array_end = .; - _dtors_start = .; - _dtors_end = .; - *(.text*) - } > program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -. = 0x40400; -buf5 = .; -. += 0x400; -. = 0x44000; -buf4 = .; -. += 0x400; -. = 0x48000; -buf3 = .; -. += 0x400; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -. = 0x60400; -buf11 = .; -. += 0x400; -. = 0x64000; -buf10 = .; -. += 0x400; -. = 0x68000; -buf9 = .; -. += 0x400; -. = 0x70400; -buf8 = .; -. += 0x400; -. = 0x74000; -buf7 = .; -. += 0x400; -. = 0x78000; -buf6 = .; -. 
+= 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_4); diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll deleted file mode 100644 index 54f47e7..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_4() { - br label %1 - -1: ; preds = %22, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %21, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %22 - -5: ; preds = 
%2 - %6 = getelementptr float, ptr @buf8, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = getelementptr float, ptr @buf7, i32 %3 - %9 = load <16 x float>, ptr %8, align 4 - %10 = bitcast <16 x float> %7 to <8 x i64> - %11 = bitcast <16 x float> %9 to <8 x i64> - %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> - %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> - %14 = bitcast <32 x i64> %12 to <64 x float> - %15 = bitcast <32 x i64> %13 to <64 x float> - %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) - %17 = bitcast <64 x float> %16 to <32 x i64> - %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> - %19 = bitcast <8 x i64> %18 to <16 x float> - %20 = getelementptr float, ptr @buf6, i32 %3 - store <16 x float> %19, ptr %20, align 4 - %21 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -22: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.o deleted file mode 100644 index 01ddce691877b0d149c944b2a2bff263cbb20063..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 984 zcmaJnfSQF$an;Wuu|?yirm?MAOr%hhfOjOV_2Nv2u!!hQFgckkS5>#QOLv?aFPJV z=v>kP(@0|&D_4JjED8j#)D&>m`<2Bp8L>)q>5=G9>F$<#Q!NAIneM?)Rn8XA=Xo!Z z&s1(jXg-^~p86NozD|Z4r;~qI($42%coPV}_e)lvBy(YJ@9rwjn3iWs*Jno0pOE^@ z%6`a?{2MX;dnMoal2Z4LKPo5wn;ov1}qkNu|9HO^J~ z!MTo1@_Y$qN^>zVw+9GF;-c3gmvu%mhSvli4U<3N5ueBSropo49Q}_6KKU`g@(6!A 
z!iCOok0KJkkk$})Xy!5z4rpfO5l}u*XOkM@9&0MT4=q;U9t5y<+ktC6d;Vf!KX95m zuR4uG4camTF5#Iz=*52=d5PNYF^A(AiVjvRtjZDJFsgh56{10FHZgT(t diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll deleted file mode 100644 index de0f954..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll +++ /dev/null @@ -1,64 +0,0 @@ -; ModuleID = 'air_project/sub_kernel_0_core_0_4.peanohack.ll' -source_filename = "LLVMDialectModule" -target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" -target triple = "aie2p" - -@buf6 = external local_unnamed_addr global [256 x float] -@buf7 = external local_unnamed_addr global [256 x float] -@buf8 = external local_unnamed_addr global [256 x float] - -; Function Attrs: nounwind -declare void @llvm.aie2p.acquire(i32, i32) #0 - -; Function Attrs: nounwind -declare void @llvm.aie2p.release(i32, i32) #0 - -; Function Attrs: noreturn nounwind -define void @core_0_4() local_unnamed_addr #1 { - br label %1 - -1: ; preds = %19, %0 - tail call void @llvm.aie2p.acquire(i32 49, i32 -1) - tail call void @llvm.aie2p.acquire(i32 50, i32 -1) - tail call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %1, %2 - %3 = phi i32 [ 0, %1 ], [ %17, %2 ] - %4 = trunc nuw i32 %3 to i20 - %5 = getelementptr float, ptr @buf8, i20 %4 - %6 = load <8 x i64>, ptr %5, align 64 - %7 = getelementptr float, ptr @buf7, i20 %4 - %8 = load <8 x i64>, ptr %7, align 64 - %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> - %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> - %11 = bitcast <32 x i64> %9 to <64 x float> - %12 = bitcast <32 x i64> %10 to <64 x float> - %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60) - %14 = bitcast <64 x float> %13 to <32 x i64> - %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 
x i32> - %16 = getelementptr float, ptr @buf6, i20 %4 - store <8 x i64> %15, ptr %16, align 64 - %17 = add nuw nsw i32 %3, 16 - %18 = icmp ult i32 %3, 240 - br i1 %18, label %2, label %19, !llvm.loop !1 - -19: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 53, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) -declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2 - -attributes #0 = { nounwind } -attributes #1 = { noreturn nounwind } -attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll deleted file mode 100644 index 56c3882..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare 
void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_4() { - br label %1 - -1: ; preds = %22, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %21, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %22 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf8, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = getelementptr float, ptr @buf7, i32 %3 - %9 = load <16 x float>, ptr %8 - %10 = bitcast <16 x float> %7 to <8 x i64> - %11 = bitcast <16 x float> %9 to <8 x i64> - %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> - %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> - %14 = bitcast <32 x i64> %12 to <64 x float> - %15 = bitcast <32 x i64> %13 to <64 x float> - %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) - %17 = bitcast <64 x float> %16 to <32 x i64> - %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> - %19 = bitcast <8 x i64> %18 to <16 x float> - %20 = getelementptr float, ptr @buf6, i32 %3 - store <16 x float> %19, ptr %20 - %21 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -22: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <64 x float> 
@llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.elf deleted file mode 100755 index 4588246a4ddfdd6db12c4f239462db7892609dba..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 1660 zcma)7O=uHQ5S~qAQpKNGQ35JF6r@T`cTH>)FRmquXbSbB1(jv9yKR>yo3Oj7tsn*w z^q}5^C5WPDqkO zF%mS`Wd#_`bwmS@1rn5?VMN>`}$?7pG106qQ^vK`mjFT z_3qn6^y$=qpzkB}ky9Q*_Vbb;`mpvaL(6X^f-WUiG9-UQ-D~NUX=(1HiVg->79UA! zM#0Gt*};h&p*E69Zc6>`Oy1ernR>JJ^RcvkQ|W9c>f8-3grJj0lS7@g=I*k}SaZ1ExPb)7I0AX}v9mWWpgGYWn$WL{~)o!}as2QRlIn%|!eEZ_Lqnf;X> zNhH;Q=zcg8<z3L5xviDx5voy>D23|=-Ki7yVwlipO@5t!rtnJUWsyU-kADy;qwzD`|t1Z+I>b51SdBc=TW1jGF~BuT=BZf}xZwvs9QV?GJnC-_cGNN&2z24Q#4wk!UCMhkbl6^i21V z#B=C^&-1fC)#sl87g#cELwI0LL#Jf;^JFMRXPB!!evWy&$1gAk1QwqUUyz>v3hZ2u z$Ho4ekOGUr{5GUu0%XANLkcDU^Cysk3Bde0q+kLte+?;^0Ly`t>TdMnd6>`q=-B9aF-I&)r`?BTXd8wS8EZcg`GW^Mg z*{WHTt4vJDni9!FkXq%qW6ET>zLGNx7-m|Ip;fI}+fm49xRxeshpF0{QIuepc`uQs zdA{!YqbmV{wqyIm8fk{#aJ~2x+cn*F^`+Pnx-G|~;85fY zxY>U)By2|?Mc&4F8w&=i@(=fakPyZz{!X@`ud*RR^y=Xo9LC{Q3!o;5E=4+vI1k6e z)x`^tHk|yO<{Ugtj=?)0eB&Vk6yX?;31r;D9_ajE5Tk*(Z!v+4;|@f6JMvvbL9QA5 GWBCW<3 program - .data : { - *(.data*) - *(.rodata*) - } > data - .comment : { - *(.comment*) - } - .symtab : { - *(.symtab) - } - .shstrtab : { - *(.shstrtab) - } - .strtab : { - *(.strtab) - } - .stack_sizes : { - *(.stack_sizes) - } - -. = 0x70000; -_sp_start_value_DM_stack = .; -. += 0x400; /* stack */ -. = 0x40400; -buf8 = .; -. += 0x400; -. = 0x44000; -buf7 = .; -. += 0x400; -. = 0x48000; -buf6 = .; -. += 0x400; -/* No tile with memory exists to the west. */ -. = 0x50000; -. += 0x10000; -/* No tile with memory exists to the north. */ -. = 0x60000; -. += 0x10000; -. = 0x70400; -buf11 = .; -. += 0x400; -. 
= 0x74000; -buf10 = .; -. += 0x400; -. = 0x78000; -buf9 = .; -. += 0x400; - .bss : { *(.bss*) } > data -} -PROVIDE(main = core_0_5); diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll deleted file mode 100644 index 8972a4d..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_5() { - br label %1 - -1: ; preds = %22, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %21, %5 ], [ 0, %1 ] - %4 = icmp 
slt i32 %3, 256 - br i1 %4, label %5, label %22 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf11, i32 %3 - %7 = load <16 x float>, ptr %6, align 4 - %8 = getelementptr float, ptr @buf10, i32 %3 - %9 = load <16 x float>, ptr %8, align 4 - %10 = bitcast <16 x float> %7 to <8 x i64> - %11 = bitcast <16 x float> %9 to <8 x i64> - %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> - %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> - %14 = bitcast <32 x i64> %12 to <64 x float> - %15 = bitcast <32 x i64> %13 to <64 x float> - %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) - %17 = bitcast <64 x float> %16 to <32 x i64> - %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> - %19 = bitcast <8 x i64> %18 to <16 x float> - %20 = getelementptr float, ptr @buf9, i32 %3 - store <16 x float> %19, ptr %20, align 4 - %21 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -22: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Unknown intrinsic -declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.o deleted file mode 100644 index ca78f75bdee0299a5c24ffa00da8f22d235605ba..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 984 zcmZ`%O-~b16g{t$wlPRb6SWI9#1$Y+I({Uu(NaN5GBw5oVsOz+%NUH6;xI&XqcJR8 z5aPyUmTcIwa?usPfbbI_;RnDUu$=qmwH=yxlRNjEcTeuU@6P)^@4Qq>p<@b}7z2(H zzyOm^v|tQr^t5tx24qnnc&A2z)Arvi4#|jBqQ#7;f2Lb+)mpg-3}#x}2dbF0FD7{| zmd})KM`$veyqWqJR`y21*T*C0vuXF29o_=MKb@B4=gC~y-nci1Q`7Zz}pUBtv3VDT6ncIwHdhe 
zk8qsj%G?LH;njmmb9JlX!P*S`z^yTU{Uour^?nAH?=@Uq!KycXuUe=+h2@k=h3X>{ zZ4ufwS=E_eUoB|bTa+2%|6!ada|JCbbM7xOIj^EkqPSTJiu$zoksXP>p!e(aW7pYe wmja*BWnD$OHufzONPU?{azk`kduIkhY{W?{XA`qT*F^W5Oyq-fsV6gD0S*Oj=>Px# diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll deleted file mode 100644 index d08aa8f..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll +++ /dev/null @@ -1,64 +0,0 @@ -; ModuleID = 'air_project/sub_kernel_0_core_0_5.peanohack.ll' -source_filename = "LLVMDialectModule" -target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32" -target triple = "aie2p" - -@buf9 = external local_unnamed_addr global [256 x float] -@buf10 = external local_unnamed_addr global [256 x float] -@buf11 = external local_unnamed_addr global [256 x float] - -; Function Attrs: nounwind -declare void @llvm.aie2p.acquire(i32, i32) #0 - -; Function Attrs: nounwind -declare void @llvm.aie2p.release(i32, i32) #0 - -; Function Attrs: noreturn nounwind -define void @core_0_5() local_unnamed_addr #1 { - br label %1 - -1: ; preds = %19, %0 - tail call void @llvm.aie2p.acquire(i32 49, i32 -1) - tail call void @llvm.aie2p.acquire(i32 50, i32 -1) - tail call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %1, %2 - %3 = phi i32 [ 0, %1 ], [ %17, %2 ] - %4 = trunc nuw i32 %3 to i20 - %5 = getelementptr float, ptr @buf11, i20 %4 - %6 = load <8 x i64>, ptr %5, align 64 - %7 = getelementptr float, ptr @buf10, i20 %4 - %8 = load <8 x i64>, ptr %7, align 64 - %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> - %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> - %11 = bitcast <32 x i64> %9 to <64 x float> - %12 = bitcast <32 x i64> %10 to <64 x float> - %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60) - %14 = bitcast <64 x 
float> %13 to <32 x i64> - %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> - %16 = getelementptr float, ptr @buf9, i20 %4 - store <8 x i64> %15, ptr %16, align 64 - %17 = add nuw nsw i32 %3, 16 - %18 = icmp ult i32 %3, 240 - br i1 %18, label %2, label %19, !llvm.loop !1 - -19: ; preds = %2 - tail call void @llvm.aie2p.release(i32 51, i32 1) - tail call void @llvm.aie2p.release(i32 53, i32 1) - tail call void @llvm.aie2p.release(i32 48, i32 1) - br label %1 -} - -; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) -declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2 - -attributes #0 = { nounwind } -attributes #1 = { noreturn nounwind } -attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) } - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll deleted file mode 100644 index 69f695d..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll +++ /dev/null @@ -1,95 +0,0 @@ -; ModuleID = 'LLVMDialectModule' -source_filename = "LLVMDialectModule" -target triple = "aie2p" - -@buf0 = external global [256 x float] -@buf1 = external global [256 x float] -@buf2 = external global [256 x float] -@buf3 = external global [256 x float] -@buf4 = external global [256 x float] -@buf5 = external global [256 x float] -@buf6 = external global [256 x float] -@buf7 = external global [256 x float] -@buf8 = external global [256 x float] -@buf9 = external global [256 x float] -@buf10 = external global [256 x float] -@buf11 = external global [256 x float] -@buf12 = external global [1024 x float] -@buf13 = external global [1024 x float] -@buf14 = external 
global [1024 x float] - -declare void @debug_i32(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.event(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.put.ms(i32, i32) - -; Unknown intrinsic -declare { i32, i32 } @llvm.aie2p.get.ss() - -; Unknown intrinsic -declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32) - -; Unknown intrinsic -declare <16 x i32> @llvm.aie2p.scd.read.vec(i32) - -; Unknown intrinsic -declare void @llvm.aie2p.acquire(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.release(i32, i32) - -; Unknown intrinsic -declare void @llvm.aie2p.set.ctrl.reg(i32, i32) - -define void @core_0_5() { - br label %1 - -1: ; preds = %22, %0 - call void @llvm.aie2p.acquire(i32 49, i32 -1) - call void @llvm.aie2p.acquire(i32 50, i32 -1) - call void @llvm.aie2p.acquire(i32 52, i32 -1) - br label %2 - -2: ; preds = %5, %1 - %3 = phi i32 [ %21, %5 ], [ 0, %1 ] - %4 = icmp slt i32 %3, 256 - br i1 %4, label %5, label %22 - -5: ; preds = %2 - %6 = getelementptr float, ptr @buf11, i32 %3 - %7 = load <16 x float>, ptr %6 - %8 = getelementptr float, ptr @buf10, i32 %3 - %9 = load <16 x float>, ptr %8 - %10 = bitcast <16 x float> %7 to <8 x i64> - %11 = bitcast <16 x float> %9 to <8 x i64> - %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> - %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> - %14 = bitcast <32 x i64> %12 to <64 x float> - %15 = bitcast <32 x i64> %13 to <64 x float> - %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60) - %17 = bitcast <64 x float> %16 to <32 x i64> - %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> - %19 = bitcast <8 x i64> %18 to <16 x float> - %20 = getelementptr float, ptr @buf9, i32 %3 - store <16 x float> %19, ptr %20 - %21 = add i32 %3, 16 - br label %2, !llvm.loop !1 - -22: ; preds = %2 - call void @llvm.aie2p.release(i32 51, i32 1) - call void @llvm.aie2p.release(i32 53, i32 1) - call void @llvm.aie2p.release(i32 48, i32 1) 
- br label %1 -} - -; Unknown intrinsic -declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) - -!llvm.module.flags = !{!0} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = distinct !{!1, !2} -!2 = !{!"llvm.loop.mustprogress"} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_design.bif b/examples/elementwise_arith/air_project/sub_kernel_0_design.bif deleted file mode 100644 index bbeec41..0000000 --- a/examples/elementwise_arith/air_project/sub_kernel_0_design.bif +++ /dev/null @@ -1,10 +0,0 @@ -all: -{ - id_code = 0x14ca8093 - extended_id_code = 0x01 - image - { - name=aie_image, id=0x1c000000 - { type=cdo file=air_project/sub_kernel_0_aie_cdo_elfs.bin file=air_project/sub_kernel_0_aie_cdo_init.bin file=air_project/sub_kernel_0_aie_cdo_enable.bin } - } -} diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_sub_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/sub_kernel_0_sub_kernel_0_sequence.bin deleted file mode 100644 index f2eb383b7eb903e8ad8809b7d44fb0a4d660bfb6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 3248 zcmcJQ-Aw~A5QNuneg4w5;eLtEZyo^(@j}krmd|WcZ&8OQJf_lyu+9e|Yukmso`>#0IXQ!}T>04{- zdWRz5+iJIiZ@khs7OO7qzIV<49O}M3Vd35Et*zr4+n!k}H6?s5ZHHP2b8iNd9f!uq z2%ELeWFG=0l9+H#Mw;SV&vYiNQGA=m?@TyH^<^fmv5ZNwd1Or5@6ws1SwdvOIT;Fy zFEe2cjS1(dzRbimmN6++jgb*5y%`xL$~=|+C!AAeiZ3%^4UGxssJ_g^HI^|MPc*|& zYb`xk4J-t?`*yLPCTa8!jZ|yI(P^-T#)NY+)Rg;`nXrb&gmYA1X5t#lnB*HUVGWH5 W=cv75Caj?`;T+YMnYhL+nEU|x-i@&U diff --git a/examples/elementwise_arith/air_project/tt.mlir b/examples/elementwise_arith/air_project/tt.mlir deleted file mode 100644 index cfdc62d..0000000 --- a/examples/elementwise_arith/air_project/tt.mlir +++ /dev/null @@ -1,35 +0,0 @@ -#loc = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1) -#loc10 = loc("X"(#loc)) -#loc11 = loc("OUT"(#loc)) -module { - tt.func public @square_kernel(%X: !tt.ptr 
{tt.divisibility = 16 : i32} loc("X"(#loc)), %OUT: !tt.ptr {tt.divisibility = 16 : i32} loc("OUT"(#loc))) attributes {noinline = false} { - %c1024_i32 = arith.constant 1024 : i32 loc(#loc1) - %pid = tt.get_program_id x : i32 loc(#loc12) - %offsets = arith.muli %pid, %c1024_i32 : i32 loc(#loc13) - %offsets_0 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc14) - %offsets_1 = tt.splat %offsets : i32 -> tensor<1024xi32> loc(#loc13) - %offsets_2 = arith.addi %offsets_1, %offsets_0 : tensor<1024xi32> loc(#loc13) - %x = tt.splat %X : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc15) - %x_3 = tt.addptr %x, %offsets_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc15) - %x_4 = tt.load %x_3 : tensor<1024x!tt.ptr> loc(#loc16) - %0 = tt.splat %OUT : !tt.ptr -> tensor<1024x!tt.ptr> loc(#loc7) - %1 = tt.addptr %0, %offsets_2 : tensor<1024x!tt.ptr>, tensor<1024xi32> loc(#loc7) - %2 = arith.muli %x_4, %x_4 : tensor<1024xi16> loc(#loc8) - tt.store %1, %2 : tensor<1024x!tt.ptr> loc(#loc9) - tt.return loc(#loc) - } loc(#loc) -} loc(#loc) -#loc1 = loc(unknown) -#loc2 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":87:11) -#loc3 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:15) -#loc4 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:34) -#loc5 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:17) -#loc6 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:9) -#loc7 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:14) -#loc8 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:32) -#loc9 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:5) -#loc12 = loc("pid"(#loc2)) -#loc13 = loc("offsets"(#loc3)) -#loc14 = loc("offsets"(#loc4)) -#loc15 = loc("x"(#loc5)) 
-#loc16 = loc("x"(#loc6)) diff --git a/examples/elementwise_arith/tt.shared.mlir b/examples/elementwise_arith/tt.shared.mlir deleted file mode 100644 index dc6929b..0000000 --- a/examples/elementwise_arith/tt.shared.mlir +++ /dev/null @@ -1 +0,0 @@ -b'#loc = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)\n#loc5 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:9)\n#map = affine_map<(d0) -> (d0)>\n#loc8 = loc("X"(#loc))\n#loc9 = loc("OUT"(#loc))\n#loc12 = loc("x"(#loc5))\nmodule {\n func.func @square_kernel(%arg0: memref<*xf32> {tt.divisibility = 16 : i32} loc("X"(#loc)), %arg1: memref<*xf32> {tt.divisibility = 16 : i32} loc("OUT"(#loc)), %arg2: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg3: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg4: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg5: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg6: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg7: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)) {\n %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)\n %0 = arith.muli %arg5, %c1024_i32 : i32 loc(#loc10)\n %1 = arith.index_cast %0 : i32 to index loc(#loc3)\n %reinterpret_cast = memref.reinterpret_cast %arg0 to offset: [%1], sizes: [1024], strides: [1] : memref<*xf32> to memref<1024xf32, strided<[1], offset: ?>> loc(#loc11)\n %alloc = memref.alloc() : memref<1024xf32> loc(#loc12)\n memref.copy %reinterpret_cast, %alloc : memref<1024xf32, strided<[1], offset: ?>> to memref<1024xf32> loc(#loc12)\n %2 = bufferization.to_tensor %alloc restrict writable : memref<1024xf32> to tensor<1024xf32> loc(#loc12)\n %reinterpret_cast_0 = memref.reinterpret_cast 
%arg1 to offset: [%1], sizes: [1024], strides: [1] : memref<*xf32> to memref<1024xf32, strided<[1], offset: ?>> loc(#loc3)\n %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%2, %2 : tensor<1024xf32>, tensor<1024xf32>) outs(%2 : tensor<1024xf32>) {\n ^bb0(%in: f32 loc("x"(#loc5)), %in_1: f32 loc("x"(#loc5)), %out: f32 loc("x"(#loc5))):\n %4 = arith.mulf %in, %in_1 : f32 loc(#loc6)\n linalg.yield %4 : f32 loc(#loc6)\n } -> tensor<1024xf32> loc(#loc6)\n bufferization.materialize_in_destination %3 in writable %reinterpret_cast_0 : (tensor<1024xf32>, memref<1024xf32, strided<[1], offset: ?>>) -> () loc(#loc7)\n return loc(#loc)\n } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:15)\n#loc3 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:14)\n#loc4 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:17)\n#loc6 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:32)\n#loc7 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:5)\n#loc10 = loc("offsets"(#loc2))\n#loc11 = loc("x"(#loc4))\n\n' \ No newline at end of file From 1998bd8a8a73703746c3627caf534adbf198bfd1 Mon Sep 17 00:00:00 2001 From: erwei-xilinx Date: Thu, 9 Apr 2026 22:42:05 -0700 Subject: [PATCH 9/9] Add NPU1 (AIE2) support to elementwise_arith example MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create transform_binary_aie2.mlir and transform_unary_aie2.mlir for NPU1 targets — content is identical to the AIE2P variants since @DTYPE@ and @VECTOR_SIZE@ placeholders handle the differences. Update elementwise_arith.py to auto-detect the NPU version via detect_npu_version() and select the correct transform script suffix (aie2 vs aie2p) instead of hardcoding aie2p. 
Update generate_readme.py get_device_support() to use glob patterns so it detects both transform_aie2.mlir and transform_*_aie2.mlir naming conventions used by multi-op examples. Tested on NPU1 (Phoenix/AIE2): all 7 test cases pass (sub bf16/f32, mul bf16/f32, div f32, square bf16/f32). Co-Authored-By: Claude Opus 4.6 (1M context) --- .../elementwise_arith/elementwise_arith.py | 8 +++- .../transform_binary_aie2.mlir | 40 +++++++++++++++++++ .../transform_unary_aie2.mlir | 40 +++++++++++++++++++ examples/generate_readme.py | 7 +++- 4 files changed, 91 insertions(+), 4 deletions(-) create mode 100644 examples/elementwise_arith/transform_binary_aie2.mlir create mode 100644 examples/elementwise_arith/transform_unary_aie2.mlir diff --git a/examples/elementwise_arith/elementwise_arith.py b/examples/elementwise_arith/elementwise_arith.py index 04d4844..cd4c678 100644 --- a/examples/elementwise_arith/elementwise_arith.py +++ b/examples/elementwise_arith/elementwise_arith.py @@ -174,14 +174,18 @@ def bench_op(op, N, provider, cfg): if cfg["bf16_emulation"]: os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1" - # Select the right transform script based on op arity. + # Select the right transform script based on op arity and NPU version. # If AIR_TRANSFORM_TILING_SCRIPT is already set, respect it. 
if not os.environ.get("AIR_TRANSFORM_TILING_SCRIPT"): + from triton.backends.amd_triton_npu.driver import detect_npu_version + is_unary = args.op == "square" script_dir = os.path.dirname(os.path.abspath(__file__)) arity = "unary" if is_unary else "binary" + npu = detect_npu_version() + suffix = "aie2" if npu == "npu1" else "aie2p" os.environ["AIR_TRANSFORM_TILING_SCRIPT"] = os.path.join( - script_dir, f"transform_{arity}_aie2p.mlir" + script_dir, f"transform_{arity}_{suffix}.mlir" ) benchmark.select_npu_backend() diff --git a/examples/elementwise_arith/transform_binary_aie2.mlir b/examples/elementwise_arith/transform_binary_aie2.mlir new file mode 100644 index 0000000..ccec81d --- /dev/null +++ b/examples/elementwise_arith/transform_binary_aie2.mlir @@ -0,0 +1,40 @@ +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. +// SPDX-License-Identifier: MIT + +//////////////////////////////////////////////////////////////////////////////// +// Transform Script for Binary Elementwise Ops (AIE2): sub, mul, div +// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders. +// Uses shared library sequences from transform_library.mlir (auto-injected). 
+//////////////////////////////////////////////////////////////////////////////// + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg1: !transform.any_op {transform.readonly}) { + + transform.include @canonicalize_with_fold_dims failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @fuse_elementwise_and_canonicalize failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @flatten_tile_forall failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @canonicalize_with_cse failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @canonicalize_with_cse failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @one_shot_bufferize failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @post_bufferize_cleanup failures(propagate) + (%arg1) : (!transform.any_op) -> () + + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) + (%arg1) : (!transform.any_op) -> () + %vh = transform.include @air_herd_mapping_and_vectorize + failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op + transform.include @cast_bf16_only_ops failures(propagate) + (%vh) : (!transform.any_op) -> () + + transform.yield + } +} diff --git a/examples/elementwise_arith/transform_unary_aie2.mlir b/examples/elementwise_arith/transform_unary_aie2.mlir new file mode 100644 index 0000000..2e09a8b --- /dev/null +++ b/examples/elementwise_arith/transform_unary_aie2.mlir @@ -0,0 +1,40 @@ +// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved. 
+// SPDX-License-Identifier: MIT + +//////////////////////////////////////////////////////////////////////////////// +// Transform Script for Unary Elementwise Ops (AIE2): square +// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders. +// Uses shared library sequences from transform_library.mlir (auto-injected). +//////////////////////////////////////////////////////////////////////////////// + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main( + %arg1: !transform.any_op {transform.readonly}) { + + transform.include @canonicalize_with_fold_dims failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @fuse_elementwise_and_canonicalize failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @flatten_tile_forall failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @canonicalize_with_cse failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @canonicalize_with_cse failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @one_shot_bufferize failures(propagate) + (%arg1) : (!transform.any_op) -> () + transform.include @post_bufferize_cleanup failures(propagate) + (%arg1) : (!transform.any_op) -> () + + transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate) + (%arg1) : (!transform.any_op) -> () + %vh = transform.include @air_herd_mapping_and_vectorize + failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op + transform.include @cast_bf16_only_ops failures(propagate) + (%vh) : (!transform.any_op) -> () + + transform.yield + } +} diff --git a/examples/generate_readme.py b/examples/generate_readme.py index bc75808..b64b9f5 100644 --- a/examples/generate_readme.py +++ b/examples/generate_readme.py @@ -161,10 +161,13 @@ def get_device_support(example_dir): """Check 
which device targets have transform files. + Checks for both exact names (transform_aie2.mlir) and prefixed + variants (transform_*_aie2.mlir) used by multi-op examples. + Returns (has_aie2, has_aie2p) as booleans. """ - has_aie2 = (example_dir / "transform_aie2.mlir").exists() - has_aie2p = (example_dir / "transform_aie2p.mlir").exists() + has_aie2 = bool(list(example_dir.glob("transform*_aie2.mlir"))) + has_aie2p = bool(list(example_dir.glob("transform*_aie2p.mlir"))) return has_aie2, has_aie2p