From 2045089ac1623572aba2320d2660b5e031b0aba0 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 9 Apr 2026 20:27:47 -0700
Subject: [PATCH 1/9] Add multi-dtype support to vec-add example (bf16, f32,
 i8, i16)

Extends the vec-add example to support bf16, f32 (via bf16-emulation),
i8, and i16 data types, inspired by mlir-air's triton_vec_add test.

Driver changes:
- Add dtype detection from Linalg IR (_detect_element_type)
- Add placeholder substitution (@DTYPE@, @PAD_VAL@, @VECTOR_SIZE@) in
  transform scripts, resolved before library injection based on the
  IR element type and NPU version. Backward-compatible: no-op when
  no placeholders are present.

Transform library:
- Add pad_and_promote_binary_{f32,i8,i16} sequences alongside the
  existing bf16 variant.

Vec-add example:
- Add --dtype and --bf16-emulation CLI arguments
- Transform scripts now use @DTYPE@ and @VECTOR_SIZE@ placeholders,
  making them dtype-generic across both AIE2 and AIE2P.

Tested on NPU2 (Strix/AIE2P): all 4 dtypes pass correctness checks
for power-of-two vector lengths from 1024 to 32768.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 amd_triton_npu/backend/driver.py              |  68 +++++++++++-
 .../transform_library/elementwise.mlir        |  94 ++++++++++++++++
 examples/vec-add/transform_aie2.mlir          |  10 +-
 examples/vec-add/transform_aie2p.mlir         |  10 +-
 examples/vec-add/vec-add.py                   | 102 ++++++++++++++----
 5 files changed, 256 insertions(+), 28 deletions(-)

diff --git a/amd_triton_npu/backend/driver.py b/amd_triton_npu/backend/driver.py
index 7887cff..8c3634b 100644
--- a/amd_triton_npu/backend/driver.py
+++ b/amd_triton_npu/backend/driver.py
@@ -410,7 +410,57 @@ def _replace_include(m):
     return result
 
 
-def _get_transform_ir_string():
+def _detect_element_type(ir_str):
+    """Detect the primary element type from the Linalg IR function signature.
+
+    Scans memref types in the first func.func line for the element type.
+    Returns the MLIR type string (e.g., "bf16", "f32", "i8", "i16").
+    Falls back to "bf16" if detection fails.
+    """
+    import re
+
+    # Match memref<...xTYPE> in the function signature
+    match = re.search(r"memref<[^>]*x(\w+)>", ir_str)
+    if match:
+        return match.group(1)
+    return "bf16"
+
+
+# Dtype-aware placeholder info: padding value and default vector size per NPU.
+_DTYPE_PLACEHOLDER_INFO = {
+    "bf16": {"pad_val": "0.0 : bf16", "vector_size": {"npu1": 16, "npu2": 32}},
+    "f32": {"pad_val": "0.0 : f32", "vector_size": {"npu1": 16, "npu2": 16}},
+    "i8": {"pad_val": "0 : i8", "vector_size": {"npu1": 32, "npu2": 32}},
+    "i16": {"pad_val": "0 : i16", "vector_size": {"npu1": 32, "npu2": 32}},
+    "i32": {"pad_val": "0 : i32", "vector_size": {"npu1": 16, "npu2": 16}},
+}
+
+
+def _substitute_dtype_placeholders(script, dtype, npu_version):
+    """Substitute dtype-aware placeholders in a transform script.
+
+    Replaces @DTYPE@, @PAD_VAL@, and @VECTOR_SIZE@ with values derived
+    from the detected element type and target NPU version.
+    No-op if the script contains no placeholders (backward compatible).
+    """
+    if (
+        "@DTYPE@" not in script
+        and "@PAD_VAL@" not in script
+        and "@VECTOR_SIZE@" not in script
+    ):
+        return script
+    info = _DTYPE_PLACEHOLDER_INFO.get(dtype)
+    if info is None:
+        return script
+    script = script.replace("@DTYPE@", dtype)
+    script = script.replace("@PAD_VAL@", info["pad_val"])
+    script = script.replace(
+        "@VECTOR_SIZE@", str(info["vector_size"].get(npu_version, 16))
+    )
+    return script
+
+
+def _get_transform_ir_string(ir_str=None):
     """
     Get the transform IR string for tiling operations.
 
@@ -421,6 +471,12 @@ def _get_transform_ir_string():
     If the script uses `transform.include`, the shared transform library
     (transform_library.mlir) is automatically injected.
 
+    If ir_str is provided, dtype-aware placeholders (@DTYPE@, @PAD_VAL@,
+    @VECTOR_SIZE@) are substituted before library injection.
+
+    Args:
+        ir_str: Optional Linalg IR string for dtype detection.
+
     Returns:
         str: The transform IR string to use for tiling
     """
@@ -436,6 +492,14 @@ def _get_transform_ir_string():
         with open(custom_script_path, "r") as f:
             print(f"Using custom tiling script from: {custom_script_path}")
             user_script = f.read()
+        if ir_str is not None:
+            dtype = _detect_element_type(
+                ir_str if isinstance(ir_str, str) else str(ir_str)
+            )
+            npu_version = detect_npu_version()
+            user_script = _substitute_dtype_placeholders(
+                user_script, dtype, npu_version
+            )
         return _inject_transform_library(user_script)
 
     # Default hardcoded transform IR string
@@ -493,7 +557,7 @@ def _ttshared_to_air(mod, gridX, gridY, gridZ, actual_sizes=None):
         pm = air.passmanager.PassManager.parse(pipeline, context=air_context)
         pm.run(air_module.operation)
         # MLIR-AIR compilation step 2: tiling the launch body
-        transform_ir_string = _get_transform_ir_string()
+        transform_ir_string = _get_transform_ir_string(ir_str=mod)
         transform_ir = Module.parse(transform_ir_string, context=air_context)
         run_transform(transform_ir, air_module)
         # MLIR-AIR compilation step 3: converting to AIR
diff --git a/amd_triton_npu/backend/transform_library/elementwise.mlir b/amd_triton_npu/backend/transform_library/elementwise.mlir
index 26fda74..cd847cb 100644
--- a/amd_triton_npu/backend/transform_library/elementwise.mlir
+++ b/amd_triton_npu/backend/transform_library/elementwise.mlir
@@ -96,3 +96,97 @@ transform.named_sequence @pad_and_promote_binary_bf16(
       {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
   transform.yield
 }
+
+// Binary variant for f32: 2 inputs + 1 output = 3 operands.
+// Used with bf16-emulation (f32 data, bf16 compute on AIE cores).
+transform.named_sequence @pad_and_promote_binary_f32(
+    %module: !transform.any_op {transform.readonly}) {
+  %op = transform.structured.match ops{["linalg.generic"]} in %module
+      : (!transform.any_op) -> !transform.any_op
+  %padded_op, %pad_op, %__ = transform.structured.pad %op {
+      padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32],
+      padding_dimensions=[0, 1, 2],
+      nofold_flags=[1, 1, 1],
+      copy_back_op="linalg.copy"
+  } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+  %pad_dps = transform.structured.rewrite_in_destination_passing_style %pad_op
+      : (!transform.any_op) -> !transform.any_op
+  %padded_lhs = transform.get_producer_of_operand %padded_op[0]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_lhs_buffer, %padded_lhs_new =
+      transform.structured.bufferize_to_allocation %padded_lhs
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  %padded_rhs = transform.get_producer_of_operand %padded_op[1]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_rhs_buffer, %padded_rhs_new =
+      transform.structured.bufferize_to_allocation %padded_rhs
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  %padded_result = transform.get_producer_of_operand %padded_op[2]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_result_buffer, %padded_result_new =
+      transform.structured.bufferize_to_allocation %padded_result
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  transform.yield
+}
+
+// Binary variant for i8: 2 inputs + 1 output = 3 operands.
+transform.named_sequence @pad_and_promote_binary_i8(
+    %module: !transform.any_op {transform.readonly}) {
+  %op = transform.structured.match ops{["linalg.generic"]} in %module
+      : (!transform.any_op) -> !transform.any_op
+  %padded_op, %pad_op, %__ = transform.structured.pad %op {
+      padding_values=[0 : i8, 0 : i8, 0 : i8],
+      padding_dimensions=[0, 1, 2],
+      nofold_flags=[1, 1, 1],
+      copy_back_op="linalg.copy"
+  } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+  %pad_dps = transform.structured.rewrite_in_destination_passing_style %pad_op
+      : (!transform.any_op) -> !transform.any_op
+  %padded_lhs = transform.get_producer_of_operand %padded_op[0]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_lhs_buffer, %padded_lhs_new =
+      transform.structured.bufferize_to_allocation %padded_lhs
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  %padded_rhs = transform.get_producer_of_operand %padded_op[1]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_rhs_buffer, %padded_rhs_new =
+      transform.structured.bufferize_to_allocation %padded_rhs
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  %padded_result = transform.get_producer_of_operand %padded_op[2]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_result_buffer, %padded_result_new =
+      transform.structured.bufferize_to_allocation %padded_result
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  transform.yield
+}
+
+// Binary variant for i16: 2 inputs + 1 output = 3 operands.
+transform.named_sequence @pad_and_promote_binary_i16(
+    %module: !transform.any_op {transform.readonly}) {
+  %op = transform.structured.match ops{["linalg.generic"]} in %module
+      : (!transform.any_op) -> !transform.any_op
+  %padded_op, %pad_op, %__ = transform.structured.pad %op {
+      padding_values=[0 : i16, 0 : i16, 0 : i16],
+      padding_dimensions=[0, 1, 2],
+      nofold_flags=[1, 1, 1],
+      copy_back_op="linalg.copy"
+  } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+  %pad_dps = transform.structured.rewrite_in_destination_passing_style %pad_op
+      : (!transform.any_op) -> !transform.any_op
+  %padded_lhs = transform.get_producer_of_operand %padded_op[0]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_lhs_buffer, %padded_lhs_new =
+      transform.structured.bufferize_to_allocation %padded_lhs
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  %padded_rhs = transform.get_producer_of_operand %padded_op[1]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_rhs_buffer, %padded_rhs_new =
+      transform.structured.bufferize_to_allocation %padded_rhs
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  %padded_result = transform.get_producer_of_operand %padded_op[2]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_result_buffer, %padded_result_new =
+      transform.structured.bufferize_to_allocation %padded_result
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  transform.yield
+}
diff --git a/examples/vec-add/transform_aie2.mlir b/examples/vec-add/transform_aie2.mlir
index b192305..5fdcf4f 100644
--- a/examples/vec-add/transform_aie2.mlir
+++ b/examples/vec-add/transform_aie2.mlir
@@ -4,8 +4,10 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Transform Script for Vector Addition (AIE2)
 // Simple elementwise add: out = a + b
-// Binary op (2 inputs + 1 output). No fusion needed. Vec tile = 16 (AIE2).
-// No type casts needed (bf16 add is native).
+// Binary op (2 inputs + 1 output). No fusion needed.
+// No type casts needed (bf16/i8/i16 add is native; f32 uses bf16-emulation).
+// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders substituted
+// by the driver based on the IR element type and NPU version.
 // Uses shared library sequences from transform_library.mlir (auto-injected).
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -18,7 +20,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @pad_and_promote_binary_bf16 failures(propagate)
+    transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
@@ -27,7 +29,7 @@ module attributes {transform.with_named_sequence} {
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    transform.include @vectorize_generics_at_16 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     %vh = transform.include @air_herd_mapping_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
diff --git a/examples/vec-add/transform_aie2p.mlir b/examples/vec-add/transform_aie2p.mlir
index c9bae4f..9dad749 100644
--- a/examples/vec-add/transform_aie2p.mlir
+++ b/examples/vec-add/transform_aie2p.mlir
@@ -4,8 +4,10 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Transform Script for Vector Addition (AIE2P)
 // Simple elementwise add: out = a + b
-// Binary op (2 inputs + 1 output). No fusion needed. Vec tile = 32 (AIE2P).
-// No type casts needed (bf16 add is native).
+// Binary op (2 inputs + 1 output). No fusion needed.
+// No type casts needed (bf16/i8/i16 add is native; f32 uses bf16-emulation).
+// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders substituted
+// by the driver based on the IR element type and NPU version.
 // Uses shared library sequences from transform_library.mlir (auto-injected).
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -18,7 +20,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @pad_and_promote_binary_bf16 failures(propagate)
+    transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
@@ -27,7 +29,7 @@ module attributes {transform.with_named_sequence} {
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    transform.include @vectorize_generics_at_32 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     %vh = transform.include @air_herd_mapping_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
diff --git a/examples/vec-add/vec-add.py b/examples/vec-add/vec-add.py
index c5452dd..fafb087 100644
--- a/examples/vec-add/vec-add.py
+++ b/examples/vec-add/vec-add.py
@@ -1,17 +1,51 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
-# this is a benchmark for adding vectors with maximum block size
-# to check the performance of tl.dot operation
+# Vector addition benchmark supporting multiple data types.
+# Supports bf16 (default), f32 (via bf16-emulation), i8, and i16.
 
+import argparse
 import torch
 import triton
 import triton.language as tl
-import sys, os
+import sys
+import os
 
 sys.path.append(os.path.abspath(".."))
 import benchmark
 
+# Dtype configuration: torch type, whether it's a float, tolerances.
+DTYPE_CONFIG = {
+    "bf16": {
+        "torch_dtype": torch.bfloat16,
+        "is_float": True,
+        "atol": 1e-2,
+        "rtol": 1e-2,
+        "bf16_emulation": False,
+    },
+    "f32": {
+        "torch_dtype": torch.float32,
+        "is_float": True,
+        "atol": 1e-1,
+        "rtol": 5e-2,
+        "bf16_emulation": True,  # f32 addf not native on AIE; requires bf16-emulation
+    },
+    "i8": {
+        "torch_dtype": torch.int8,
+        "is_float": False,
+        "atol": 0,
+        "rtol": 0,
+        "bf16_emulation": False,
+    },
+    "i16": {
+        "torch_dtype": torch.int16,
+        "is_float": False,
+        "atol": 0,
+        "rtol": 0,
+        "bf16_emulation": False,
+    },
+}
+
 
 @triton.jit
 def vecadd(
@@ -25,8 +59,6 @@ def vecadd(
     block_start = pid * BLOCK_SIZE_N
     offsets = block_start + tl.arange(0, BLOCK_SIZE_N)
 
-    # mask = offsets < n_elements    #AMK - in triton example, do we need?
-
     a_block = tl.load(A + offsets[:])
     b_block = tl.load(B + offsets[:])
 
@@ -35,35 +67,69 @@ def vecadd(
     tl.store(C + offsets[:], c_block)
 
 
-# @benchmark.measure()
-def bench_vecadd(N, provider):
+def bench_vecadd(N, provider, cfg):
     device = "cpu"
-    dtype_in = torch.bfloat16
-    dtype_out = (
-        torch.bfloat16
-    )  # torch.float32 won't work due to unsupported `%33 = fpext <8 x bfloat> %32 to <8 x float>`
-    a = torch.randn(N, device=device, dtype=dtype_in)
-    b = torch.randn(N, device=device, dtype=dtype_in)
-    c = torch.empty(N, device=device, dtype=dtype_out)
+    torch_dtype = cfg["torch_dtype"]
+
+    if cfg["is_float"]:
+        a = torch.randn(N, device=device, dtype=torch_dtype)
+        b = torch.randn(N, device=device, dtype=torch_dtype)
+    else:
+        # Clamp to half-max to avoid overflow on addition
+        iinfo = torch.iinfo(torch_dtype)
+        half_max = iinfo.max // 2
+        a = torch.randint(0, half_max, (N,), device=device, dtype=torch_dtype)
+        b = torch.randint(0, half_max, (N,), device=device, dtype=torch_dtype)
+
+    c = torch.empty(N, device=device, dtype=torch_dtype)
+
     if provider == "torch" or provider == "test":
         c_ref = torch.add(a, b)
     if provider == "triton" or provider == "test":
-        # 2D launch kernel where each block gets its own program.
         grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE_N"]),)
         compiled_kernel = vecadd[grid](
             a,
             b,
             c,
             N,
-            BLOCK_SIZE_N=1024,  # TODO: small tile sizes currently face errors due to lock race condition at memtiles
+            BLOCK_SIZE_N=1024,
         )
         with open("tt.shared.mlir", "w") as f:
             f.write(str(compiled_kernel.asm["ttsharedir"]))
         if provider == "test":
-            torch.testing.assert_close(c, c_ref, atol=1e-2, rtol=1e-2)
+            torch.testing.assert_close(c, c_ref, atol=cfg["atol"], rtol=cfg["rtol"])
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Vector addition benchmark for AMD NPU"
+    )
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=list(DTYPE_CONFIG.keys()),
+        default="bf16",
+        help="Element data type (default: bf16)",
+    )
+    parser.add_argument(
+        "--bf16-emulation",
+        dest="bf16_emulation",
+        default=False,
+        action="store_true",
+        help="Use f32 data type with bf16 emulation on AIE cores",
+    )
+    args = parser.parse_args()
+
+    # --bf16-emulation is shorthand for --dtype f32
+    if args.bf16_emulation:
+        args.dtype = "f32"
+
+    cfg = DTYPE_CONFIG[args.dtype]
+
+    # Enable bf16 emulation env var when needed
+    if cfg["bf16_emulation"]:
+        os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1"
+
     benchmark.select_npu_backend()
     for N in [2**i for i in range(10, 16, 1)]:
-        bench_vecadd(N, "test")
+        bench_vecadd(N, "test", cfg)

From a7c3f5ce4295d937bf8805a2d400fdd1488234f4 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 9 Apr 2026 20:35:29 -0700
Subject: [PATCH 2/9] Address Copilot review: guard placeholder substitution

- Only call detect_npu_version() when @VECTOR_SIZE@ placeholder is
  actually present, avoiding failures in environments without xrt-smi
- Raise ValueError with supported types when an unsupported element
  type is detected but placeholders are present
- Fix _detect_element_type docstring to match actual behavior

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 amd_triton_npu/backend/driver.py | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/amd_triton_npu/backend/driver.py b/amd_triton_npu/backend/driver.py
index 8c3634b..efe63c7 100644
--- a/amd_triton_npu/backend/driver.py
+++ b/amd_triton_npu/backend/driver.py
@@ -411,15 +411,16 @@ def _replace_include(m):
 
 
 def _detect_element_type(ir_str):
-    """Detect the primary element type from the Linalg IR function signature.
+    """Detect the primary element type from the provided Linalg IR string.
 
-    Scans memref types in the first func.func line for the element type.
-    Returns the MLIR type string (e.g., "bf16", "f32", "i8", "i16").
+    Searches the IR text for the first ``memref<...xTYPE>`` occurrence and
+    returns the captured MLIR element type string (for example, ``"bf16"``,
+    ``"f32"``, ``"i8"``, or ``"i16"``).
     Falls back to "bf16" if detection fails.
     """
     import re
 
-    # Match memref<...xTYPE> in the function signature
+    # Match the first memref<...xTYPE> occurrence in the provided IR text.
     match = re.search(r"memref<[^>]*x(\w+)>", ir_str)
     if match:
         return match.group(1)
@@ -451,7 +452,12 @@ def _substitute_dtype_placeholders(script, dtype, npu_version):
         return script
     info = _DTYPE_PLACEHOLDER_INFO.get(dtype)
     if info is None:
-        return script
+        raise ValueError(
+            f"Unsupported element type '{dtype}' for transform script placeholder "
+            f"substitution. Supported types: {list(_DTYPE_PLACEHOLDER_INFO.keys())}. "
+            f"The script contains @DTYPE@/@PAD_VAL@/@VECTOR_SIZE@ placeholders that "
+            f"require a supported element type."
+        )
     script = script.replace("@DTYPE@", dtype)
     script = script.replace("@PAD_VAL@", info["pad_val"])
     script = script.replace(
@@ -492,11 +498,14 @@ def _get_transform_ir_string(ir_str=None):
         with open(custom_script_path, "r") as f:
             print(f"Using custom tiling script from: {custom_script_path}")
             user_script = f.read()
-        if ir_str is not None:
+        _PLACEHOLDERS = ("@DTYPE@", "@PAD_VAL@", "@VECTOR_SIZE@")
+        if ir_str is not None and any(p in user_script for p in _PLACEHOLDERS):
             dtype = _detect_element_type(
                 ir_str if isinstance(ir_str, str) else str(ir_str)
             )
-            npu_version = detect_npu_version()
+            npu_version = (
+                detect_npu_version() if "@VECTOR_SIZE@" in user_script else None
+            )
             user_script = _substitute_dtype_placeholders(
                 user_script, dtype, npu_version
             )

From eaa7210b8e892e02904e1d18e6b1a3906471e11f Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 9 Apr 2026 20:36:59 -0700
Subject: [PATCH 3/9] Update vec-add datatypes in examples dashboard

Update generate_readme.py registry to reflect multi-dtype support
(bf16, f32, i8, i16) and regenerate examples/README.md.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 examples/README.md          | 59 +++++++++++++++++++++++++++++++++++++
 examples/generate_readme.py |  2 +-
 2 files changed, 60 insertions(+), 1 deletion(-)
 create mode 100644 examples/README.md

diff --git a/examples/README.md b/examples/README.md
new file mode 100644
index 0000000..351bfae
--- /dev/null
+++ b/examples/README.md
@@ -0,0 +1,59 @@
+<!-- This file is auto-generated by generate_readme.py. Do not edit manually. -->
+
+# Triton-XDNA Examples
+
+These examples demonstrate how to write [Triton](https://github.com/triton-lang/triton) kernels that compile and run on AMD XDNA™ NPUs via the [MLIR-AIR](https://github.com/Xilinx/mlir-air) compilation flow.
+
+## Operator Dashboard
+
+| Category | Operation | Datatype(s) | AIE2 | AIE2P | Example |
+|:---------|:----------|:------------|:----:|:-----:|:--------|
+| Matrix | [Matrix Multiplication (BF16)](matmul_bf16_m64_n64_k64/) | bf16 | ✅ | ✅ | [matmul_bf16_m64_n64_k64/](matmul_bf16_m64_n64_k64/) |
+| Matrix | [Padded Matrix Multiplication (F32, A Transposed)](matmul_f32_m64_n32_k16_padded_atransposed/) | f32 (bf16 emulation) | — | ✅ | [matmul_f32_m64_n32_k16_padded_atransposed/](matmul_f32_m64_n32_k16_padded_atransposed/) |
+| Matrix | [Matrix Multiplication (INT8)](matmul_i8_m64_n64_k64/) | i8 | — | ✅ | [matmul_i8_m64_n64_k64/](matmul_i8_m64_n64_k64/) |
+| Matrix | [Matrix Multiplication (INT8, Large Tile)](matmul_i8_m128_n64_k64/) | i8 | — | ✅ | [matmul_i8_m128_n64_k64/](matmul_i8_m128_n64_k64/) |
+| Matrix | [Matrix Multiplication (Autotune)](autotune-matmul/) | bf16 | ✅ | — | [autotune-matmul/](autotune-matmul/) |
+| Element-wise | [ReLU](relu/) | bf16 | ✅ | ✅ | [relu/](relu/) |
+| Element-wise | [Sigmoid](sigmoid/) | bf16 | ✅ | ✅ | [sigmoid/](sigmoid/) |
+| Element-wise | [SiLU](silu/) | bf16 | ✅ | ✅ | [silu/](silu/) |
+| Element-wise | [GELU](gelu/) | bf16 | — | ✅ | [gelu/](gelu/) |
+| Element-wise | [Leaky ReLU](leaky_relu/) | bf16 | ✅ | ✅ | [leaky_relu/](leaky_relu/) |
+| Element-wise | [SwiGLU](swiglu/) | bf16 | ✅ | ✅ | [swiglu/](swiglu/) |
+| Element-wise | [AXPY](axpy/) | bf16 | ✅ | ✅ | [axpy/](axpy/) |
+| Element-wise | [Vector Add](vec-add/) | bf16, f32, i8, i16 | ✅ | ✅ | [vec-add/](vec-add/) |
+| Normalization | [RMS Normalization](rms_norm/) | bf16 | — | ✅ | [rms_norm/](rms_norm/) |
+| Normalization | [Weighted RMS Normalization](weighted_rms_norm/) | bf16 | ✅ | ✅ | [weighted_rms_norm/](weighted_rms_norm/) |
+| Normalization | [Softmax](test_softmax/) | bf16 | ✅ | ✅ | [test_softmax/](test_softmax/) |
+| Normalization | [Layer Normalization](test_layernorm/) | f32 | ✅ | ✅ | [test_layernorm/](test_layernorm/) |
+| Pooling | [Average Pool](average_pool/) | bf16 | ✅ | ✅ | [average_pool/](average_pool/) |
+| Special | [2D Block Load](load_2d_block/) | f32 | — | — | [load_2d_block/](load_2d_block/) |
+| Special | [Multi-Driver](multi_drivers/) | bf16 | ✅ | ✅ | [multi_drivers/](multi_drivers/) |
+
+### Legend
+
+- ✅ Transform file available (device target supported)
+- — Not yet available
+
+**AIE2** = AMD Ryzen™ AI (Phoenix, NPU1) &nbsp;&nbsp; **AIE2P** = AMD Ryzen™ AI (Strix, NPU2)
+
+## Running Examples
+
+Make sure XRT is sourced and a virtual environment with `triton-xdna` is active (see top-level [README](../README.md)):
+
+```bash
+source /opt/xilinx/xrt/setup.sh
+
+# Run an example on AIE2 (NPU1):
+cd matmul_bf16_m64_n64_k64
+AIR_TRANSFORM_TILING_SCRIPT=transform_aie2.mlir python matmul_bf16_m64_n64_k64.py
+
+# Run on AIE2P (NPU2):
+AIR_TRANSFORM_TILING_SCRIPT=transform_aie2p.mlir python matmul_bf16_m64_n64_k64.py
+```
+
+## Running All Tests
+
+```bash
+python scripts/run_tests.py --device aie2 --verbose
+python scripts/run_tests.py --device aie2p --verbose
+```
diff --git a/examples/generate_readme.py b/examples/generate_readme.py
index a8a06ae..b479cf1 100644
--- a/examples/generate_readme.py
+++ b/examples/generate_readme.py
@@ -102,7 +102,7 @@
         "category": "Element-wise",
         "name": "Vector Add",
         "path": "vec-add",
-        "datatypes": "bf16",
+        "datatypes": "bf16, f32, i8, i16",
     },
     {
         "category": "Normalization",

From a18807179018df2f325ea8c3a2e5c50c84afa950 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 9 Apr 2026 20:39:15 -0700
Subject: [PATCH 4/9] Remove auto-generated README.md (generated by CI)

examples/README.md is auto-generated by generate_readme.py in CI
and should not be committed.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 examples/README.md | 59 ----------------------------------------------
 1 file changed, 59 deletions(-)
 delete mode 100644 examples/README.md

diff --git a/examples/README.md b/examples/README.md
deleted file mode 100644
index 351bfae..0000000
--- a/examples/README.md
+++ /dev/null
@@ -1,59 +0,0 @@
-<!-- This file is auto-generated by generate_readme.py. Do not edit manually. -->
-
-# Triton-XDNA Examples
-
-These examples demonstrate how to write [Triton](https://github.com/triton-lang/triton) kernels that compile and run on AMD XDNA™ NPUs via the [MLIR-AIR](https://github.com/Xilinx/mlir-air) compilation flow.
-
-## Operator Dashboard
-
-| Category | Operation | Datatype(s) | AIE2 | AIE2P | Example |
-|:---------|:----------|:------------|:----:|:-----:|:--------|
-| Matrix | [Matrix Multiplication (BF16)](matmul_bf16_m64_n64_k64/) | bf16 | ✅ | ✅ | [matmul_bf16_m64_n64_k64/](matmul_bf16_m64_n64_k64/) |
-| Matrix | [Padded Matrix Multiplication (F32, A Transposed)](matmul_f32_m64_n32_k16_padded_atransposed/) | f32 (bf16 emulation) | — | ✅ | [matmul_f32_m64_n32_k16_padded_atransposed/](matmul_f32_m64_n32_k16_padded_atransposed/) |
-| Matrix | [Matrix Multiplication (INT8)](matmul_i8_m64_n64_k64/) | i8 | — | ✅ | [matmul_i8_m64_n64_k64/](matmul_i8_m64_n64_k64/) |
-| Matrix | [Matrix Multiplication (INT8, Large Tile)](matmul_i8_m128_n64_k64/) | i8 | — | ✅ | [matmul_i8_m128_n64_k64/](matmul_i8_m128_n64_k64/) |
-| Matrix | [Matrix Multiplication (Autotune)](autotune-matmul/) | bf16 | ✅ | — | [autotune-matmul/](autotune-matmul/) |
-| Element-wise | [ReLU](relu/) | bf16 | ✅ | ✅ | [relu/](relu/) |
-| Element-wise | [Sigmoid](sigmoid/) | bf16 | ✅ | ✅ | [sigmoid/](sigmoid/) |
-| Element-wise | [SiLU](silu/) | bf16 | ✅ | ✅ | [silu/](silu/) |
-| Element-wise | [GELU](gelu/) | bf16 | — | ✅ | [gelu/](gelu/) |
-| Element-wise | [Leaky ReLU](leaky_relu/) | bf16 | ✅ | ✅ | [leaky_relu/](leaky_relu/) |
-| Element-wise | [SwiGLU](swiglu/) | bf16 | ✅ | ✅ | [swiglu/](swiglu/) |
-| Element-wise | [AXPY](axpy/) | bf16 | ✅ | ✅ | [axpy/](axpy/) |
-| Element-wise | [Vector Add](vec-add/) | bf16, f32, i8, i16 | ✅ | ✅ | [vec-add/](vec-add/) |
-| Normalization | [RMS Normalization](rms_norm/) | bf16 | — | ✅ | [rms_norm/](rms_norm/) |
-| Normalization | [Weighted RMS Normalization](weighted_rms_norm/) | bf16 | ✅ | ✅ | [weighted_rms_norm/](weighted_rms_norm/) |
-| Normalization | [Softmax](test_softmax/) | bf16 | ✅ | ✅ | [test_softmax/](test_softmax/) |
-| Normalization | [Layer Normalization](test_layernorm/) | f32 | ✅ | ✅ | [test_layernorm/](test_layernorm/) |
-| Pooling | [Average Pool](average_pool/) | bf16 | ✅ | ✅ | [average_pool/](average_pool/) |
-| Special | [2D Block Load](load_2d_block/) | f32 | — | — | [load_2d_block/](load_2d_block/) |
-| Special | [Multi-Driver](multi_drivers/) | bf16 | ✅ | ✅ | [multi_drivers/](multi_drivers/) |
-
-### Legend
-
-- ✅ Transform file available (device target supported)
-- — Not yet available
-
-**AIE2** = AMD Ryzen™ AI (Phoenix, NPU1) &nbsp;&nbsp; **AIE2P** = AMD Ryzen™ AI (Strix, NPU2)
-
-## Running Examples
-
-Make sure XRT is sourced and a virtual environment with `triton-xdna` is active (see top-level [README](../README.md)):
-
-```bash
-source /opt/xilinx/xrt/setup.sh
-
-# Run an example on AIE2 (NPU1):
-cd matmul_bf16_m64_n64_k64
-AIR_TRANSFORM_TILING_SCRIPT=transform_aie2.mlir python matmul_bf16_m64_n64_k64.py
-
-# Run on AIE2P (NPU2):
-AIR_TRANSFORM_TILING_SCRIPT=transform_aie2p.mlir python matmul_bf16_m64_n64_k64.py
-```
-
-## Running All Tests
-
-```bash
-python scripts/run_tests.py --device aie2 --verbose
-python scripts/run_tests.py --device aie2p --verbose
-```

From 68dd5fee18838e69e65bd91eb754d0f44dc7da24 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 9 Apr 2026 21:01:48 -0700
Subject: [PATCH 5/9] Add multi-dtype support to axpy and relu examples

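Extend axpy and relu to accept bf16, f32 (bf16-emulation), i8, and
i16 using the same @DTYPE@/@VECTOR_SIZE@ placeholder mechanism as
vec-add. Note that i8 is wired up but does not yet build end to end
for these two examples (see the test results below).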

Transform library: add pad_and_promote_unary_{f32,i8,i16} sequences.

Tested on NPU2 (Strix/AIE2P):
- bf16, f32, i16: pass for both axpy and relu
- i8: compiles through triton-shared-opt and AIR transforms but fails
  at aircc (arith.muli/maxsi not supported for i8 vectors on AIE2P).
  vec-add i8 works because it only uses arith.addi.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../transform_library/elementwise.mlir        | 79 ++++++++++++++++
 examples/axpy/axpy.py                         | 93 +++++++++++++++++--
 examples/axpy/transform_aie2.mlir             |  8 +-
 examples/axpy/transform_aie2p.mlir            |  8 +-
 examples/generate_readme.py                   |  4 +-
 examples/relu/relu.py                         | 85 +++++++++++++++--
 examples/relu/transform_aie2.mlir             |  6 +-
 examples/relu/transform_aie2p.mlir            |  7 +-
 8 files changed, 256 insertions(+), 34 deletions(-)

diff --git a/amd_triton_npu/backend/transform_library/elementwise.mlir b/amd_triton_npu/backend/transform_library/elementwise.mlir
index cd847cb..30c14f1 100644
--- a/amd_triton_npu/backend/transform_library/elementwise.mlir
+++ b/amd_triton_npu/backend/transform_library/elementwise.mlir
@@ -66,6 +66,85 @@ transform.named_sequence @pad_and_promote_unary_bf16(
   transform.yield
 }
 
+// Unary variant for f32: 1 input + 1 output = 2 operands.
+// Used with bf16-emulation (f32 data, bf16 compute on AIE cores).
+transform.named_sequence @pad_and_promote_unary_f32(
+    %module: !transform.any_op {transform.readonly}) {
+  %op = transform.structured.match ops{["linalg.generic"]} in %module
+      : (!transform.any_op) -> !transform.any_op
+  %padded_op, %pad_op, %__ = transform.structured.pad %op {
+      padding_values=[0.0 : f32, 0.0 : f32],
+      padding_dimensions=[0, 1],
+      nofold_flags=[1, 1],
+      copy_back_op="linalg.copy"
+  } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+  %pad_dps = transform.structured.rewrite_in_destination_passing_style %pad_op
+      : (!transform.any_op) -> !transform.any_op
+  %padded_input = transform.get_producer_of_operand %padded_op[0]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_input_buffer, %padded_input_new =
+      transform.structured.bufferize_to_allocation %padded_input
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  %padded_result = transform.get_producer_of_operand %padded_op[1]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_result_buffer, %padded_result_new =
+      transform.structured.bufferize_to_allocation %padded_result
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  transform.yield
+}
+
+// Unary variant for i8: 1 input + 1 output = 2 operands.
+transform.named_sequence @pad_and_promote_unary_i8(
+    %module: !transform.any_op {transform.readonly}) {
+  %op = transform.structured.match ops{["linalg.generic"]} in %module
+      : (!transform.any_op) -> !transform.any_op
+  %padded_op, %pad_op, %__ = transform.structured.pad %op {
+      padding_values=[0 : i8, 0 : i8],
+      padding_dimensions=[0, 1],
+      nofold_flags=[1, 1],
+      copy_back_op="linalg.copy"
+  } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+  %pad_dps = transform.structured.rewrite_in_destination_passing_style %pad_op
+      : (!transform.any_op) -> !transform.any_op
+  %padded_input = transform.get_producer_of_operand %padded_op[0]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_input_buffer, %padded_input_new =
+      transform.structured.bufferize_to_allocation %padded_input
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  %padded_result = transform.get_producer_of_operand %padded_op[1]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_result_buffer, %padded_result_new =
+      transform.structured.bufferize_to_allocation %padded_result
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  transform.yield
+}
+
+// Unary variant for i16: 1 input + 1 output = 2 operands.
+transform.named_sequence @pad_and_promote_unary_i16(
+    %module: !transform.any_op {transform.readonly}) {
+  %op = transform.structured.match ops{["linalg.generic"]} in %module
+      : (!transform.any_op) -> !transform.any_op
+  %padded_op, %pad_op, %__ = transform.structured.pad %op {
+      padding_values=[0 : i16, 0 : i16],
+      padding_dimensions=[0, 1],
+      nofold_flags=[1, 1],
+      copy_back_op="linalg.copy"
+  } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+  %pad_dps = transform.structured.rewrite_in_destination_passing_style %pad_op
+      : (!transform.any_op) -> !transform.any_op
+  %padded_input = transform.get_producer_of_operand %padded_op[0]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_input_buffer, %padded_input_new =
+      transform.structured.bufferize_to_allocation %padded_input
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  %padded_result = transform.get_producer_of_operand %padded_op[1]
+      : (!transform.any_op) -> (!transform.any_op)
+  %padded_result_buffer, %padded_result_new =
+      transform.structured.bufferize_to_allocation %padded_result
+      {memory_space = 2, bufferize_destination_only, emit_dealloc} : !transform.any_op
+  transform.yield
+}
+
 // Binary variant: 2 inputs + 1 output = 3 operands (vec-add, axpy, swiglu).
 transform.named_sequence @pad_and_promote_binary_bf16(
     %module: !transform.any_op {transform.readonly}) {
diff --git a/examples/axpy/axpy.py b/examples/axpy/axpy.py
index 9eb2738..90bc69d 100644
--- a/examples/axpy/axpy.py
+++ b/examples/axpy/axpy.py
@@ -1,14 +1,54 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
+# AXPY benchmark: out = alpha * x + y
+# Supports bf16 (default), f32 (via bf16-emulation), i8 (fails at aircc on AIE2P), i16.
+
+import argparse
 import torch
 import triton
 import triton.language as tl
-import sys, os
+import sys
+import os
 
 sys.path.append(os.path.abspath(".."))
 import benchmark
 
+DTYPE_CONFIG = {
+    "bf16": {
+        "torch_dtype": torch.bfloat16,
+        "is_float": True,
+        "alpha": 2.0,
+        "atol": 1e-2,
+        "rtol": 1e-2,
+        "bf16_emulation": False,
+    },
+    "f32": {
+        "torch_dtype": torch.float32,
+        "is_float": True,
+        "alpha": 2.0,
+        "atol": 1e-1,
+        "rtol": 5e-2,
+        "bf16_emulation": True,
+    },
+    "i8": {
+        "torch_dtype": torch.int8,
+        "is_float": False,
+        "alpha": 2,
+        "atol": 0,
+        "rtol": 0,
+        "bf16_emulation": False,
+    },
+    "i16": {
+        "torch_dtype": torch.int16,
+        "is_float": False,
+        "alpha": 2,
+        "atol": 0,
+        "rtol": 0,
+        "bf16_emulation": False,
+    },
+}
+
 
 @triton.jit
 def axpy_kernel(
@@ -29,13 +69,23 @@ def axpy_kernel(
     tl.store(OUT + offsets[:], out)
 
 
-def bench_axpy(N, provider):
+def bench_axpy(N, provider, cfg):
     device = "cpu"
-    dtype = torch.bfloat16
-    alpha = 2.0
-    x = torch.randn(N, device=device, dtype=dtype)
-    y = torch.randn(N, device=device, dtype=dtype)
-    out = torch.empty(N, device=device, dtype=dtype)
+    torch_dtype = cfg["torch_dtype"]
+    alpha = cfg["alpha"]
+
+    if cfg["is_float"]:
+        x = torch.randn(N, device=device, dtype=torch_dtype)
+        y = torch.randn(N, device=device, dtype=torch_dtype)
+    else:
+        iinfo = torch.iinfo(torch_dtype)
+        # Keep values small enough that alpha*x+y doesn't overflow
+        quarter_max = iinfo.max // 4
+        x = torch.randint(0, quarter_max, (N,), device=device, dtype=torch_dtype)
+        y = torch.randint(0, quarter_max, (N,), device=device, dtype=torch_dtype)
+
+    out = torch.empty(N, device=device, dtype=torch_dtype)
+
     if provider == "torch" or provider == "test":
         out_ref = alpha * x + y
     if provider == "triton" or provider == "test":
@@ -51,10 +101,35 @@ def bench_axpy(N, provider):
         with open("tt.shared.mlir", "w") as f:
             f.write(str(compiled_kernel.asm["ttsharedir"]))
         if provider == "test":
-            torch.testing.assert_close(out, out_ref, atol=1e-2, rtol=1e-2)
+            torch.testing.assert_close(out, out_ref, atol=cfg["atol"], rtol=cfg["rtol"])
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="AXPY benchmark for AMD NPU")
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=list(DTYPE_CONFIG.keys()),
+        default="bf16",
+        help="Element data type (default: bf16)",
+    )
+    parser.add_argument(
+        "--bf16-emulation",
+        dest="bf16_emulation",
+        default=False,
+        action="store_true",
+        help="Use f32 data type with bf16 emulation on AIE cores",
+    )
+    args = parser.parse_args()
+
+    if args.bf16_emulation:
+        args.dtype = "f32"
+
+    cfg = DTYPE_CONFIG[args.dtype]
+
+    if cfg["bf16_emulation"]:
+        os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1"
+
     benchmark.select_npu_backend()
     for N in [2**i for i in range(10, 16, 1)]:
-        bench_axpy(N, "test")
+        bench_axpy(N, "test", cfg)
diff --git a/examples/axpy/transform_aie2.mlir b/examples/axpy/transform_aie2.mlir
index 31e907d..2bea4be 100644
--- a/examples/axpy/transform_aie2.mlir
+++ b/examples/axpy/transform_aie2.mlir
@@ -3,8 +3,8 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 // Transform Script for AXPY (AIE2): out = alpha * x + y
-// Binary op (2 inputs: x, y). Cast mulf and addf to bf16.
-// No extern_func.o needed (native mulf/addf).
+// Binary op (2 inputs: x, y). Cast mulf and addf to bf16 when float.
+// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders.
 // Uses shared library sequences from transform_library.mlir (auto-injected).
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @pad_and_promote_binary_bf16 failures(propagate)
+    transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
@@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} {
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    transform.include @vectorize_generics_at_16 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     %vh = transform.include @air_herd_mapping_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
diff --git a/examples/axpy/transform_aie2p.mlir b/examples/axpy/transform_aie2p.mlir
index 3244ef5..df56af7 100644
--- a/examples/axpy/transform_aie2p.mlir
+++ b/examples/axpy/transform_aie2p.mlir
@@ -3,8 +3,8 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 // Transform Script for AXPY (AIE2P): out = alpha * x + y
-// Binary op (2 inputs: x, y). Cast mulf and addf to bf16.
-// No extern_func.o needed (native mulf/addf).
+// Binary op (2 inputs: x, y). Cast mulf and addf to bf16 when float.
+// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders.
 // Uses shared library sequences from transform_library.mlir (auto-injected).
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @pad_and_promote_binary_bf16 failures(propagate)
+    transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
@@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} {
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    transform.include @vectorize_generics_at_16 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     %vh = transform.include @air_herd_mapping_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
diff --git a/examples/generate_readme.py b/examples/generate_readme.py
index b479cf1..1887247 100644
--- a/examples/generate_readme.py
+++ b/examples/generate_readme.py
@@ -60,7 +60,7 @@
         "category": "Element-wise",
         "name": "ReLU",
         "path": "relu",
-        "datatypes": "bf16",
+        "datatypes": "bf16, f32, i8, i16",
     },
     {
         "category": "Element-wise",
@@ -96,7 +96,7 @@
         "category": "Element-wise",
         "name": "AXPY",
         "path": "axpy",
-        "datatypes": "bf16",
+        "datatypes": "bf16, f32, i8, i16",
     },
     {
         "category": "Element-wise",
diff --git a/examples/relu/relu.py b/examples/relu/relu.py
index b873aab..66ab642 100644
--- a/examples/relu/relu.py
+++ b/examples/relu/relu.py
@@ -1,14 +1,50 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
+# ReLU benchmark: y = max(x, 0)
+# Supports bf16 (default), f32 (via bf16-emulation), i8 (fails at aircc on AIE2P), i16.
+
+import argparse
 import torch
 import triton
 import triton.language as tl
-import sys, os
+import sys
+import os
 
 sys.path.append(os.path.abspath(".."))
 import benchmark
 
+DTYPE_CONFIG = {
+    "bf16": {
+        "torch_dtype": torch.bfloat16,
+        "is_float": True,
+        "atol": 1e-2,
+        "rtol": 1e-2,
+        "bf16_emulation": False,
+    },
+    "f32": {
+        "torch_dtype": torch.float32,
+        "is_float": True,
+        "atol": 1e-1,
+        "rtol": 5e-2,
+        "bf16_emulation": True,
+    },
+    "i8": {
+        "torch_dtype": torch.int8,
+        "is_float": False,
+        "atol": 0,
+        "rtol": 0,
+        "bf16_emulation": False,
+    },
+    "i16": {
+        "torch_dtype": torch.int16,
+        "is_float": False,
+        "atol": 0,
+        "rtol": 0,
+        "bf16_emulation": False,
+    },
+}
+
 
 @triton.jit
 def relu_kernel(
@@ -22,15 +58,23 @@ def relu_kernel(
     offsets = block_start + tl.arange(0, BLOCK_SIZE)
 
     x = tl.load(X + offsets[:])
-    y = tl.maximum(x, 0.0)
+    # x * 0 is a zero of x's own dtype (float or int); assumes finite x (inf * 0 = NaN).
+    y = tl.maximum(x, x * 0)
     tl.store(Y + offsets[:], y)
 
 
-def bench_relu(N, provider):
+def bench_relu(N, provider, cfg):
     device = "cpu"
-    dtype = torch.bfloat16
-    x = torch.randn(N, device=device, dtype=dtype)
-    y = torch.empty(N, device=device, dtype=dtype)
+    torch_dtype = cfg["torch_dtype"]
+
+    if cfg["is_float"]:
+        x = torch.randn(N, device=device, dtype=torch_dtype)
+    else:
+        iinfo = torch.iinfo(torch_dtype)
+        x = torch.randint(iinfo.min, iinfo.max, (N,), device=device, dtype=torch_dtype)
+
+    y = torch.empty(N, device=device, dtype=torch_dtype)
+
     if provider == "torch" or provider == "test":
         y_ref = torch.relu(x)
     if provider == "triton" or provider == "test":
@@ -44,10 +88,35 @@ def bench_relu(N, provider):
         with open("tt.shared.mlir", "w") as f:
             f.write(str(compiled_kernel.asm["ttsharedir"]))
         if provider == "test":
-            torch.testing.assert_close(y, y_ref, atol=1e-2, rtol=1e-2)
+            torch.testing.assert_close(y, y_ref, atol=cfg["atol"], rtol=cfg["rtol"])
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="ReLU benchmark for AMD NPU")
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=list(DTYPE_CONFIG.keys()),
+        default="bf16",
+        help="Element data type (default: bf16)",
+    )
+    parser.add_argument(
+        "--bf16-emulation",
+        dest="bf16_emulation",
+        default=False,
+        action="store_true",
+        help="Use f32 data type with bf16 emulation on AIE cores",
+    )
+    args = parser.parse_args()
+
+    if args.bf16_emulation:
+        args.dtype = "f32"
+
+    cfg = DTYPE_CONFIG[args.dtype]
+
+    if cfg["bf16_emulation"]:
+        os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1"
+
     benchmark.select_npu_backend()
     for N in [2**i for i in range(10, 16, 1)]:
-        bench_relu(N, "test")
+        bench_relu(N, "test", cfg)
diff --git a/examples/relu/transform_aie2.mlir b/examples/relu/transform_aie2.mlir
index ce5dc86..fbcf1df 100644
--- a/examples/relu/transform_aie2.mlir
+++ b/examples/relu/transform_aie2.mlir
@@ -4,7 +4,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Transform Script for ReLU (AIE2)
 // relu(x) = max(x, 0)
-// No extern_func.o needed (native maxnumf).
+// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders.
 // Uses shared library sequences from transform_library.mlir (auto-injected).
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @pad_and_promote_unary_bf16 failures(propagate)
+    transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
@@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} {
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    transform.include @vectorize_generics_at_16 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     %vh = transform.include @air_herd_mapping_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
diff --git a/examples/relu/transform_aie2p.mlir b/examples/relu/transform_aie2p.mlir
index eba1f17..7e4ba8e 100644
--- a/examples/relu/transform_aie2p.mlir
+++ b/examples/relu/transform_aie2p.mlir
@@ -4,8 +4,7 @@
 ////////////////////////////////////////////////////////////////////////////////
 // Transform Script for ReLU (AIE2P)
 // relu(x) = max(x, 0)
-// Strategy: fuse_elementwise_linalg -> unary pad+promote -> vectorize at 16
-// -> cast maxnumf to bf16.
+// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders.
 // Uses shared library sequences from transform_library.mlir (auto-injected).
 ////////////////////////////////////////////////////////////////////////////////
 
@@ -21,7 +20,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @pad_and_promote_unary_bf16 failures(propagate)
+    transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
@@ -30,7 +29,7 @@ module attributes {transform.with_named_sequence} {
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    transform.include @vectorize_generics_at_16 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     %vh = transform.include @air_herd_mapping_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op

From 00c502dd59440ea8f921f9dd15fa9f600c73e839 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 9 Apr 2026 21:12:19 -0700
Subject: [PATCH 6/9] Add f32 bf16-emulation support to all elementwise
 activation examples

Extend sigmoid, silu, gelu, swiglu, and leaky_relu examples to support
f32 input via bf16-emulation, in addition to the existing bf16.

All transform scripts updated with @DTYPE@/@VECTOR_SIZE@ placeholders.
The @cast_bf16_only_ops and @cast_cmpf_and_select_ops phases work
correctly for both bf16 and f32 inputs -- for f32, the cast converts
f32 vector ops to bf16 at the MLIR level (equivalent to what
bf16-emulation does at the LLVM level).

Tested on NPU2 (Strix/AIE2P): all 5 examples pass correctness checks
for both bf16 and f32 across vector sizes 1024-32768.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 examples/gelu/gelu.py                    | 61 ++++++++++++++++++++----
 examples/gelu/transform_aie2p.mlir       |  4 +-
 examples/generate_readme.py              | 10 ++--
 examples/leaky_relu/leaky_relu.py        | 59 ++++++++++++++++++++---
 examples/leaky_relu/transform_aie2.mlir  |  4 +-
 examples/leaky_relu/transform_aie2p.mlir |  4 +-
 examples/sigmoid/sigmoid.py              | 59 ++++++++++++++++++++---
 examples/sigmoid/transform_aie2p.mlir    | 25 ++--------
 examples/silu/silu.py                    | 59 ++++++++++++++++++++---
 examples/silu/transform_aie2.mlir        |  4 +-
 examples/silu/transform_aie2p.mlir       |  4 +-
 examples/swiglu/swiglu.py                | 61 ++++++++++++++++++++----
 examples/swiglu/transform_aie2.mlir      |  4 +-
 examples/swiglu/transform_aie2p.mlir     |  4 +-
 14 files changed, 285 insertions(+), 77 deletions(-)

diff --git a/examples/gelu/gelu.py b/examples/gelu/gelu.py
index 304afbb..ceadacb 100644
--- a/examples/gelu/gelu.py
+++ b/examples/gelu/gelu.py
@@ -1,14 +1,34 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
+# GELU benchmark (sigmoid approximation): y = x * sigmoid(1.702 * x)
+# Supports bf16 (default) and f32 (via bf16-emulation).
+
+import argparse
 import torch
 import triton
 import triton.language as tl
-import sys, os
+import sys
+import os
 
 sys.path.append(os.path.abspath(".."))
 import benchmark
 
+DTYPE_CONFIG = {
+    "bf16": {
+        "torch_dtype": torch.bfloat16,
+        "atol": 1e-1,
+        "rtol": 1e-1,
+        "bf16_emulation": False,
+    },
+    "f32": {
+        "torch_dtype": torch.float32,
+        "atol": 2e-1,
+        "rtol": 1e-1,
+        "bf16_emulation": True,
+    },
+}
+
 
 @triton.jit
 def gelu_kernel(
@@ -30,14 +50,14 @@ def gelu_kernel(
     tl.store(Y + offsets[:], y)
 
 
-def bench_gelu(N, provider):
+def bench_gelu(N, provider, cfg):
     device = "cpu"
-    dtype = torch.bfloat16
-    x = torch.randn(N, device=device, dtype=dtype)
-    y = torch.empty(N, device=device, dtype=dtype)
+    torch_dtype = cfg["torch_dtype"]
+    x = torch.randn(N, device=device, dtype=torch_dtype)
+    y = torch.empty(N, device=device, dtype=torch_dtype)
     if provider == "torch" or provider == "test":
         # Reference uses sigmoid approximation: x * sigmoid(1.702 * x)
-        y_ref = x * torch.sigmoid(1.702 * x.float()).to(dtype)
+        y_ref = x * torch.sigmoid(1.702 * x.float()).to(torch_dtype)
     if provider == "triton" or provider == "test":
         grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),)
         compiled_kernel = gelu_kernel[grid](
@@ -49,10 +69,35 @@ def bench_gelu(N, provider):
         with open("tt.shared.mlir", "w") as f:
             f.write(str(compiled_kernel.asm["ttsharedir"]))
         if provider == "test":
-            torch.testing.assert_close(y, y_ref, atol=1e-1, rtol=1e-1)
+            torch.testing.assert_close(y, y_ref, atol=cfg["atol"], rtol=cfg["rtol"])
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="GELU benchmark for AMD NPU")
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=list(DTYPE_CONFIG.keys()),
+        default="bf16",
+        help="Element data type (default: bf16)",
+    )
+    parser.add_argument(
+        "--bf16-emulation",
+        dest="bf16_emulation",
+        default=False,
+        action="store_true",
+        help="Use f32 data type with bf16 emulation on AIE cores",
+    )
+    args = parser.parse_args()
+
+    if args.bf16_emulation:
+        args.dtype = "f32"
+
+    cfg = DTYPE_CONFIG[args.dtype]
+
+    if cfg["bf16_emulation"]:
+        os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1"
+
     benchmark.select_npu_backend()
     for N in [2**i for i in range(10, 16, 1)]:
-        bench_gelu(N, "test")
+        bench_gelu(N, "test", cfg)
diff --git a/examples/gelu/transform_aie2p.mlir b/examples/gelu/transform_aie2p.mlir
index 2fa1afa..71de302 100644
--- a/examples/gelu/transform_aie2p.mlir
+++ b/examples/gelu/transform_aie2p.mlir
@@ -22,7 +22,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @pad_and_promote_unary_bf16 failures(propagate)
+    transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
@@ -31,7 +31,7 @@ module attributes {transform.with_named_sequence} {
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    transform.include @vectorize_generics_at_16 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     %vh = transform.include @air_herd_mapping_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
diff --git a/examples/generate_readme.py b/examples/generate_readme.py
index 1887247..75dd998 100644
--- a/examples/generate_readme.py
+++ b/examples/generate_readme.py
@@ -66,31 +66,31 @@
         "category": "Element-wise",
         "name": "Sigmoid",
         "path": "sigmoid",
-        "datatypes": "bf16",
+        "datatypes": "bf16, f32",
     },
     {
         "category": "Element-wise",
         "name": "SiLU",
         "path": "silu",
-        "datatypes": "bf16",
+        "datatypes": "bf16, f32",
     },
     {
         "category": "Element-wise",
         "name": "GELU",
         "path": "gelu",
-        "datatypes": "bf16",
+        "datatypes": "bf16, f32",
     },
     {
         "category": "Element-wise",
         "name": "Leaky ReLU",
         "path": "leaky_relu",
-        "datatypes": "bf16",
+        "datatypes": "bf16, f32",
     },
     {
         "category": "Element-wise",
         "name": "SwiGLU",
         "path": "swiglu",
-        "datatypes": "bf16",
+        "datatypes": "bf16, f32",
     },
     {
         "category": "Element-wise",
diff --git a/examples/leaky_relu/leaky_relu.py b/examples/leaky_relu/leaky_relu.py
index 088b6b1..5b9927c 100644
--- a/examples/leaky_relu/leaky_relu.py
+++ b/examples/leaky_relu/leaky_relu.py
@@ -1,16 +1,36 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
+# Leaky ReLU benchmark: y = x if x >= 0, else alpha * x
+# Supports bf16 (default) and f32 (via bf16-emulation).
+
+import argparse
 import torch
 import triton
 import triton.language as tl
-import sys, os
+import sys
+import os
 
 sys.path.append(os.path.abspath(".."))
 import benchmark
 
 ALPHA = 0.01  # Standard leaky relu negative slope
 
+DTYPE_CONFIG = {
+    "bf16": {
+        "torch_dtype": torch.bfloat16,
+        "atol": 1e-2,
+        "rtol": 1e-2,
+        "bf16_emulation": False,
+    },
+    "f32": {
+        "torch_dtype": torch.float32,
+        "atol": 1e-1,
+        "rtol": 5e-2,
+        "bf16_emulation": True,
+    },
+}
+
 
 @triton.jit
 def leaky_relu_kernel(
@@ -31,11 +51,11 @@ def leaky_relu_kernel(
     tl.store(Y + offsets[:], y)
 
 
-def bench_leaky_relu(N, provider):
+def bench_leaky_relu(N, provider, cfg):
     device = "cpu"
-    dtype = torch.bfloat16
-    x = torch.randn(N, device=device, dtype=dtype)
-    y = torch.empty(N, device=device, dtype=dtype)
+    torch_dtype = cfg["torch_dtype"]
+    x = torch.randn(N, device=device, dtype=torch_dtype)
+    y = torch.empty(N, device=device, dtype=torch_dtype)
     if provider == "torch" or provider == "test":
         y_ref = torch.nn.functional.leaky_relu(x, negative_slope=ALPHA)
     if provider == "triton" or provider == "test":
@@ -49,10 +69,35 @@ def bench_leaky_relu(N, provider):
         with open("tt.shared.mlir", "w") as f:
             f.write(str(compiled_kernel.asm["ttsharedir"]))
         if provider == "test":
-            torch.testing.assert_close(y, y_ref, atol=1e-2, rtol=1e-2)
+            torch.testing.assert_close(y, y_ref, atol=cfg["atol"], rtol=cfg["rtol"])
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Leaky ReLU benchmark for AMD NPU")
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=list(DTYPE_CONFIG.keys()),
+        default="bf16",
+        help="Element data type (default: bf16)",
+    )
+    parser.add_argument(
+        "--bf16-emulation",
+        dest="bf16_emulation",
+        default=False,
+        action="store_true",
+        help="Use f32 data type with bf16 emulation on AIE cores",
+    )
+    args = parser.parse_args()
+
+    if args.bf16_emulation:
+        args.dtype = "f32"
+
+    cfg = DTYPE_CONFIG[args.dtype]
+
+    if cfg["bf16_emulation"]:
+        os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1"
+
     benchmark.select_npu_backend()
     for N in [2**i for i in range(10, 16, 1)]:
-        bench_leaky_relu(N, "test")
+        bench_leaky_relu(N, "test", cfg)
diff --git a/examples/leaky_relu/transform_aie2.mlir b/examples/leaky_relu/transform_aie2.mlir
index e0234a4..f804e9f 100644
--- a/examples/leaky_relu/transform_aie2.mlir
+++ b/examples/leaky_relu/transform_aie2.mlir
@@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @pad_and_promote_unary_bf16 failures(propagate)
+    transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
@@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} {
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    transform.include @vectorize_generics_at_16 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     %vh = transform.include @air_herd_mapping_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
diff --git a/examples/leaky_relu/transform_aie2p.mlir b/examples/leaky_relu/transform_aie2p.mlir
index 7ed2de4..bc2d3c9 100644
--- a/examples/leaky_relu/transform_aie2p.mlir
+++ b/examples/leaky_relu/transform_aie2p.mlir
@@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @pad_and_promote_unary_bf16 failures(propagate)
+    transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
@@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} {
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    transform.include @vectorize_generics_at_16 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     %vh = transform.include @air_herd_mapping_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
diff --git a/examples/sigmoid/sigmoid.py b/examples/sigmoid/sigmoid.py
index 12b602c..d5922dd 100644
--- a/examples/sigmoid/sigmoid.py
+++ b/examples/sigmoid/sigmoid.py
@@ -1,14 +1,34 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
+# Sigmoid benchmark: y = 1 / (1 + exp(-x))
+# Supports bf16 (default) and f32 (via bf16-emulation).
+
+import argparse
 import torch
 import triton
 import triton.language as tl
-import sys, os
+import sys
+import os
 
 sys.path.append(os.path.abspath(".."))
 import benchmark
 
+DTYPE_CONFIG = {
+    "bf16": {
+        "torch_dtype": torch.bfloat16,
+        "atol": 1e-1,
+        "rtol": 1e-1,
+        "bf16_emulation": False,
+    },
+    "f32": {
+        "torch_dtype": torch.float32,
+        "atol": 2e-1,
+        "rtol": 1e-1,
+        "bf16_emulation": True,
+    },
+}
+
 
 @triton.jit
 def sigmoid_kernel(
@@ -34,11 +54,11 @@ def sigmoid_kernel(
     tl.store(Y + offsets[:], y)
 
 
-def bench_sigmoid(N, provider):
+def bench_sigmoid(N, provider, cfg):
     device = "cpu"
-    dtype = torch.bfloat16
-    x = torch.randn(N, device=device, dtype=dtype)
-    y = torch.empty(N, device=device, dtype=dtype)
+    torch_dtype = cfg["torch_dtype"]
+    x = torch.randn(N, device=device, dtype=torch_dtype)
+    y = torch.empty(N, device=device, dtype=torch_dtype)
     if provider == "torch" or provider == "test":
         y_ref = torch.sigmoid(x)
     if provider == "triton" or provider == "test":
@@ -52,10 +72,35 @@ def bench_sigmoid(N, provider):
         with open("tt.shared.mlir", "w") as f:
             f.write(str(compiled_kernel.asm["ttsharedir"]))
         if provider == "test":
-            torch.testing.assert_close(y, y_ref, atol=1e-1, rtol=1e-1)
+            torch.testing.assert_close(y, y_ref, atol=cfg["atol"], rtol=cfg["rtol"])
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Sigmoid benchmark for AMD NPU")
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=list(DTYPE_CONFIG.keys()),
+        default="bf16",
+        help="Element data type (default: bf16)",
+    )
+    parser.add_argument(
+        "--bf16-emulation",
+        dest="bf16_emulation",
+        default=False,
+        action="store_true",
+        help="Use f32 data type with bf16 emulation on AIE cores",
+    )
+    args = parser.parse_args()
+
+    if args.bf16_emulation:
+        args.dtype = "f32"
+
+    cfg = DTYPE_CONFIG[args.dtype]
+
+    if cfg["bf16_emulation"]:
+        os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1"
+
     benchmark.select_npu_backend()
     for N in [2**i for i in range(10, 16, 1)]:
-        bench_sigmoid(N, "test")
+        bench_sigmoid(N, "test", cfg)
diff --git a/examples/sigmoid/transform_aie2p.mlir b/examples/sigmoid/transform_aie2p.mlir
index 2494c2b..8fe2d8e 100644
--- a/examples/sigmoid/transform_aie2p.mlir
+++ b/examples/sigmoid/transform_aie2p.mlir
@@ -6,8 +6,9 @@
 //
 // sigmoid(x) = 1 / (1 + exp(-x))
 //
-// Strategy: fuse_elementwise_linalg -> unary pad+promote -> vectorize at 16
+// Strategy: fuse_elementwise_linalg -> unary pad+promote -> vectorize
 // -> cast exp, subf, addf, mulf to bf16; divf stays f32.
+// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders.
 //
 // Uses shared library sequences from transform_library.mlir (auto-injected).
 ////////////////////////////////////////////////////////////////////////////////
@@ -16,43 +17,25 @@ module attributes {transform.with_named_sequence} {
   transform.named_sequence @__transform_main(
       %arg1: !transform.any_op {transform.readonly}) {
 
-    // Phase 1: Initial canonicalization
     transform.include @canonicalize_with_fold_dims failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-
-    // Phase 2: Fuse elementwise chain (extf + subf + exp + addf + divf + truncf)
     transform.include @fuse_elementwise_and_canonicalize failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-
-    // Phase 3: Flatten + tile forall [256]
     transform.include @flatten_tile_forall failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-
-    // Phase 4: Canonicalization
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-
-    // Phase 5: Pad and promote to L1 (unary: 1 input + 1 output)
-    transform.include @pad_and_promote_unary_bf16 failures(propagate)
+    transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-
-    // Phase 6: Canonicalization
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-
-    // Phase 7: Bufferization
     transform.include @one_shot_bufferize failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-
-    // Phase 8: Post-bufferization cleanup
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    // Phase 9: Vectorization tiling (16-lane for bf16)
-    transform.include @vectorize_generics_at_16 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-
-    // Phase 10: AIR herd mapping + vectorization
     %vh = transform.include @air_herd_mapping_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
     transform.include @cast_bf16_only_ops failures(propagate)
diff --git a/examples/silu/silu.py b/examples/silu/silu.py
index 59b0aa0..05d55df 100644
--- a/examples/silu/silu.py
+++ b/examples/silu/silu.py
@@ -1,14 +1,34 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
+# SiLU benchmark: y = x * sigmoid(x)
+# Supports bf16 (default) and f32 (via bf16-emulation).
+
+import argparse
 import torch
 import triton
 import triton.language as tl
-import sys, os
+import sys
+import os
 
 sys.path.append(os.path.abspath(".."))
 import benchmark
 
+DTYPE_CONFIG = {
+    "bf16": {
+        "torch_dtype": torch.bfloat16,
+        "atol": 1e-1,
+        "rtol": 1e-1,
+        "bf16_emulation": False,
+    },
+    "f32": {
+        "torch_dtype": torch.float32,
+        "atol": 2e-1,
+        "rtol": 1e-1,
+        "bf16_emulation": True,
+    },
+}
+
 
 @triton.jit
 def silu_kernel(
@@ -30,11 +50,11 @@ def silu_kernel(
     tl.store(Y + offsets[:], y)
 
 
-def bench_silu(N, provider):
+def bench_silu(N, provider, cfg):
     device = "cpu"
-    dtype = torch.bfloat16
-    x = torch.randn(N, device=device, dtype=dtype)
-    y = torch.empty(N, device=device, dtype=dtype)
+    torch_dtype = cfg["torch_dtype"]
+    x = torch.randn(N, device=device, dtype=torch_dtype)
+    y = torch.empty(N, device=device, dtype=torch_dtype)
     if provider == "torch" or provider == "test":
         y_ref = torch.nn.functional.silu(x)
     if provider == "triton" or provider == "test":
@@ -48,10 +68,35 @@ def bench_silu(N, provider):
         with open("tt.shared.mlir", "w") as f:
             f.write(str(compiled_kernel.asm["ttsharedir"]))
         if provider == "test":
-            torch.testing.assert_close(y, y_ref, atol=1e-1, rtol=1e-1)
+            torch.testing.assert_close(y, y_ref, atol=cfg["atol"], rtol=cfg["rtol"])
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="SiLU benchmark for AMD NPU")
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=list(DTYPE_CONFIG.keys()),
+        default="bf16",
+        help="Element data type (default: bf16)",
+    )
+    parser.add_argument(
+        "--bf16-emulation",
+        dest="bf16_emulation",
+        default=False,
+        action="store_true",
+        help="Use f32 data type with bf16 emulation on AIE cores",
+    )
+    args = parser.parse_args()
+
+    if args.bf16_emulation:
+        args.dtype = "f32"
+
+    cfg = DTYPE_CONFIG[args.dtype]
+
+    if cfg["bf16_emulation"]:
+        os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1"
+
     benchmark.select_npu_backend()
     for N in [2**i for i in range(10, 16, 1)]:
-        bench_silu(N, "test")
+        bench_silu(N, "test", cfg)
diff --git a/examples/silu/transform_aie2.mlir b/examples/silu/transform_aie2.mlir
index 3f16514..78784f9 100644
--- a/examples/silu/transform_aie2.mlir
+++ b/examples/silu/transform_aie2.mlir
@@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @pad_and_promote_unary_bf16 failures(propagate)
+    transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
@@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} {
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    transform.include @vectorize_generics_at_16 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     %vh = transform.include @air_herd_mapping_with_extern_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
diff --git a/examples/silu/transform_aie2p.mlir b/examples/silu/transform_aie2p.mlir
index 53de42f..acc0aea 100644
--- a/examples/silu/transform_aie2p.mlir
+++ b/examples/silu/transform_aie2p.mlir
@@ -21,7 +21,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @pad_and_promote_unary_bf16 failures(propagate)
+    transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
@@ -30,7 +30,7 @@ module attributes {transform.with_named_sequence} {
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    transform.include @vectorize_generics_at_16 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     %vh = transform.include @air_herd_mapping_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
diff --git a/examples/swiglu/swiglu.py b/examples/swiglu/swiglu.py
index 180e856..65157fe 100644
--- a/examples/swiglu/swiglu.py
+++ b/examples/swiglu/swiglu.py
@@ -1,14 +1,34 @@
 # Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
 # SPDX-License-Identifier: MIT
 
+# SwiGLU benchmark: out = SiLU(gate) * up = gate * sigmoid(gate) * up
+# Supports bf16 (default) and f32 (via bf16-emulation).
+
+import argparse
 import torch
 import triton
 import triton.language as tl
-import sys, os
+import sys
+import os
 
 sys.path.append(os.path.abspath(".."))
 import benchmark
 
+DTYPE_CONFIG = {
+    "bf16": {
+        "torch_dtype": torch.bfloat16,
+        "atol": 1e-1,
+        "rtol": 1e-1,
+        "bf16_emulation": False,
+    },
+    "f32": {
+        "torch_dtype": torch.float32,
+        "atol": 2e-1,
+        "rtol": 1e-1,
+        "bf16_emulation": True,
+    },
+}
+
 
 @triton.jit
 def swiglu_kernel(
@@ -33,12 +53,12 @@ def swiglu_kernel(
     tl.store(OUT + offsets[:], out)
 
 
-def bench_swiglu(N, provider):
+def bench_swiglu(N, provider, cfg):
     device = "cpu"
-    dtype = torch.bfloat16
-    gate = torch.randn(N, device=device, dtype=dtype)
-    up = torch.randn(N, device=device, dtype=dtype)
-    out = torch.empty(N, device=device, dtype=dtype)
+    torch_dtype = cfg["torch_dtype"]
+    gate = torch.randn(N, device=device, dtype=torch_dtype)
+    up = torch.randn(N, device=device, dtype=torch_dtype)
+    out = torch.empty(N, device=device, dtype=torch_dtype)
     if provider == "torch" or provider == "test":
         out_ref = torch.nn.functional.silu(gate) * up
     if provider == "triton" or provider == "test":
@@ -53,10 +73,35 @@ def bench_swiglu(N, provider):
         with open("tt.shared.mlir", "w") as f:
             f.write(str(compiled_kernel.asm["ttsharedir"]))
         if provider == "test":
-            torch.testing.assert_close(out, out_ref, atol=1e-1, rtol=1e-1)
+            torch.testing.assert_close(out, out_ref, atol=cfg["atol"], rtol=cfg["rtol"])
 
 
 if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="SwiGLU benchmark for AMD NPU")
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        choices=list(DTYPE_CONFIG.keys()),
+        default="bf16",
+        help="Element data type (default: bf16)",
+    )
+    parser.add_argument(
+        "--bf16-emulation",
+        dest="bf16_emulation",
+        default=False,
+        action="store_true",
+        help="Use f32 data type with bf16 emulation on AIE cores",
+    )
+    args = parser.parse_args()
+
+    if args.bf16_emulation:
+        args.dtype = "f32"
+
+    cfg = DTYPE_CONFIG[args.dtype]
+
+    if cfg["bf16_emulation"]:
+        os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1"
+
     benchmark.select_npu_backend()
     for N in [2**i for i in range(10, 16, 1)]:
-        bench_swiglu(N, "test")
+        bench_swiglu(N, "test", cfg)
diff --git a/examples/swiglu/transform_aie2.mlir b/examples/swiglu/transform_aie2.mlir
index 0de74b4..94c07ff 100644
--- a/examples/swiglu/transform_aie2.mlir
+++ b/examples/swiglu/transform_aie2.mlir
@@ -21,7 +21,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @pad_and_promote_binary_bf16 failures(propagate)
+    transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
@@ -30,7 +30,7 @@ module attributes {transform.with_named_sequence} {
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    transform.include @vectorize_generics_at_16 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     %vh = transform.include @air_herd_mapping_with_extern_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
diff --git a/examples/swiglu/transform_aie2p.mlir b/examples/swiglu/transform_aie2p.mlir
index ee1c6b2..7d799d3 100644
--- a/examples/swiglu/transform_aie2p.mlir
+++ b/examples/swiglu/transform_aie2p.mlir
@@ -20,7 +20,7 @@ module attributes {transform.with_named_sequence} {
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
-    transform.include @pad_and_promote_binary_bf16 failures(propagate)
+    transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     transform.include @canonicalize_with_cse failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
@@ -29,7 +29,7 @@ module attributes {transform.with_named_sequence} {
     transform.include @post_bufferize_cleanup failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
 
-    transform.include @vectorize_generics_at_16 failures(propagate)
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
         (%arg1) : (!transform.any_op) -> ()
     %vh = transform.include @air_herd_mapping_and_vectorize
         failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op

From baf67c69097b8e9105f457b6a15748de678abe91 Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 9 Apr 2026 22:01:00 -0700
Subject: [PATCH 7/9] Add elementwise_arith example (sub, mul, div, square)

New multi-op example supporting sub, mul, div, and square with
--op and --dtype CLI arguments. Auto-selects unary or binary
transform script based on op arity.

Supported dtypes: bf16 and f32 (via bf16-emulation). Integer types
(i16) fail at aircc for subi/muli -- only addi works for integer
vectors on AIE2P (tracked in Xilinx/mlir-aie#3027).

The div op is f32-only (arith.divf has no bf16 hardware support on
AIE2P).

Tested on NPU2 (Strix/AIE2P): sub, mul, div, square all pass for
their supported dtypes.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../air_project/aie.asm_air_output.mlir       | 386 +++++++++++
 .../elementwise_arith/air_project/aie.elf     | Bin 0 -> 29488 bytes
 .../aiecc_failure_1775797115_856352.mlir      | 411 ++++++++++++
 .../aiecc_failure_1775797139_858651.mlir      | 601 ++++++++++++++++++
 .../aiecc_failure_1775797174_862028.mlir      | 431 +++++++++++++
 .../aiecc_repeater_1775797115_856352.sh       |  12 +
 .../aiecc_repeater_1775797139_858651.sh       |  14 +
 .../aiecc_repeater_1775797174_862028.sh       |  14 +
 .../air_project/airinput.mlir                 |  41 ++
 .../air_project/asm_air_output.mlir           |  41 ++
 .../air_project/asm_src.mlir                  |  34 +
 .../air_project/div_kernel_0.pdi              | Bin 0 -> 15904 bytes
 .../air_project/div_kernel_0_aie_cdo_elfs.bin | Bin 0 -> 10704 bytes
 .../div_kernel_0_aie_cdo_enable.bin           | Bin 0 -> 104 bytes
 .../air_project/div_kernel_0_aie_cdo_init.bin | Bin 0 -> 6032 bytes
 .../air_project/div_kernel_0_core_0_2.elf     | Bin 0 -> 4132 bytes
 .../div_kernel_0_core_0_2.ld.script           |  72 +++
 .../air_project/div_kernel_0_core_0_2.ll      | 158 +++++
 .../air_project/div_kernel_0_core_0_2.o       | Bin 0 -> 2048 bytes
 .../air_project/div_kernel_0_core_0_2.opt.ll  | 129 ++++
 .../div_kernel_0_core_0_2.peanohack.ll        | 158 +++++
 .../air_project/div_kernel_0_core_0_3.elf     | Bin 0 -> 4192 bytes
 .../div_kernel_0_core_0_3.ld.script           |  78 +++
 .../air_project/div_kernel_0_core_0_3.ll      | 158 +++++
 .../air_project/div_kernel_0_core_0_3.o       | Bin 0 -> 2048 bytes
 .../air_project/div_kernel_0_core_0_3.opt.ll  | 129 ++++
 .../div_kernel_0_core_0_3.peanohack.ll        | 158 +++++
 .../air_project/div_kernel_0_core_0_4.elf     | Bin 0 -> 4196 bytes
 .../div_kernel_0_core_0_4.ld.script           |  78 +++
 .../air_project/div_kernel_0_core_0_4.ll      | 158 +++++
 .../air_project/div_kernel_0_core_0_4.o       | Bin 0 -> 2048 bytes
 .../air_project/div_kernel_0_core_0_4.opt.ll  | 129 ++++
 .../div_kernel_0_core_0_4.peanohack.ll        | 158 +++++
 .../air_project/div_kernel_0_core_0_5.elf     | Bin 0 -> 4132 bytes
 .../div_kernel_0_core_0_5.ld.script           |  72 +++
 .../air_project/div_kernel_0_core_0_5.ll      | 158 +++++
 .../air_project/div_kernel_0_core_0_5.o       | Bin 0 -> 2052 bytes
 .../air_project/div_kernel_0_core_0_5.opt.ll  | 129 ++++
 .../div_kernel_0_core_0_5.peanohack.ll        | 158 +++++
 .../air_project/div_kernel_0_design.bif       |  10 +
 .../div_kernel_0_div_kernel_0_sequence.bin    | Bin 0 -> 3248 bytes
 .../elementwise_arith/air_project/empty_0.pdi | Bin 0 -> 368 bytes
 .../air_project/empty_0_aie_cdo_elfs.bin      | Bin 0 -> 24 bytes
 .../air_project/empty_0_aie_cdo_enable.bin    | Bin 0 -> 24 bytes
 .../air_project/empty_0_aie_cdo_init.bin      | Bin 0 -> 24 bytes
 .../air_project/empty_0_design.bif            |  10 +
 .../air_project/full_elf_config.json          | 134 ++++
 .../air_project/input_with_addresses.mlir     | 328 ++++++++++
 .../elementwise_arith/air_project/main.pdi    | Bin 0 -> 368 bytes
 .../air_project/main_aie_cdo_elfs.bin         | Bin 0 -> 24 bytes
 .../air_project/main_aie_cdo_enable.bin       | Bin 0 -> 24 bytes
 .../air_project/main_aie_cdo_init.bin         | Bin 0 -> 24 bytes
 .../air_project/main_design.bif               |  10 +
 .../air_project/main_div_kernel.bin           | Bin 0 -> 22460 bytes
 .../air_project/main_mul_kernel.bin           | Bin 0 -> 14460 bytes
 .../air_project/main_square_kernel.bin        | Bin 0 -> 11048 bytes
 .../air_project/main_sub_kernel.bin           | Bin 0 -> 14396 bytes
 .../air_project/mul_kernel_0.pdi              | Bin 0 -> 7856 bytes
 .../air_project/mul_kernel_0_aie_cdo_elfs.bin | Bin 0 -> 2656 bytes
 .../mul_kernel_0_aie_cdo_enable.bin           | Bin 0 -> 104 bytes
 .../air_project/mul_kernel_0_aie_cdo_init.bin | Bin 0 -> 6032 bytes
 .../air_project/mul_kernel_0_core_0_2.elf     | Bin 0 -> 1672 bytes
 .../mul_kernel_0_core_0_2.ld.script           |  72 +++
 .../air_project/mul_kernel_0_core_0_2.ll      |  95 +++
 .../air_project/mul_kernel_0_core_0_2.o       | Bin 0 -> 1000 bytes
 .../air_project/mul_kernel_0_core_0_2.opt.ll  |  72 +++
 .../mul_kernel_0_core_0_2.peanohack.ll        |  95 +++
 .../air_project/mul_kernel_0_core_0_3.elf     | Bin 0 -> 1736 bytes
 .../mul_kernel_0_core_0_3.ld.script           |  78 +++
 .../air_project/mul_kernel_0_core_0_3.ll      |  95 +++
 .../air_project/mul_kernel_0_core_0_3.o       | Bin 0 -> 1000 bytes
 .../air_project/mul_kernel_0_core_0_3.opt.ll  |  72 +++
 .../mul_kernel_0_core_0_3.peanohack.ll        |  95 +++
 .../air_project/mul_kernel_0_core_0_4.elf     | Bin 0 -> 1740 bytes
 .../mul_kernel_0_core_0_4.ld.script           |  78 +++
 .../air_project/mul_kernel_0_core_0_4.ll      |  95 +++
 .../air_project/mul_kernel_0_core_0_4.o       | Bin 0 -> 1000 bytes
 .../air_project/mul_kernel_0_core_0_4.opt.ll  |  72 +++
 .../mul_kernel_0_core_0_4.peanohack.ll        |  95 +++
 .../air_project/mul_kernel_0_core_0_5.elf     | Bin 0 -> 1676 bytes
 .../mul_kernel_0_core_0_5.ld.script           |  72 +++
 .../air_project/mul_kernel_0_core_0_5.ll      |  95 +++
 .../air_project/mul_kernel_0_core_0_5.o       | Bin 0 -> 1000 bytes
 .../air_project/mul_kernel_0_core_0_5.opt.ll  |  72 +++
 .../mul_kernel_0_core_0_5.peanohack.ll        |  95 +++
 .../air_project/mul_kernel_0_design.bif       |  10 +
 .../mul_kernel_0_mul_kernel_0_sequence.bin    | Bin 0 -> 3248 bytes
 .../air_project/npu.asm_air_output.mlir       | 300 +++++++++
 .../air_project/placed.asm_air_output.mlir    |  86 +++
 .../air_project/square_kernel_0.pdi           | Bin 0 -> 6272 bytes
 .../square_kernel_0_aie_cdo_elfs.bin          | Bin 0 -> 2528 bytes
 .../square_kernel_0_aie_cdo_enable.bin        | Bin 0 -> 104 bytes
 .../square_kernel_0_aie_cdo_init.bin          | Bin 0 -> 4300 bytes
 .../air_project/square_kernel_0_core_0_2.elf  | Bin 0 -> 1600 bytes
 .../square_kernel_0_core_0_2.ld.script        |  66 ++
 .../air_project/square_kernel_0_core_0_2.ll   |  84 +++
 .../air_project/square_kernel_0_core_0_2.o    | Bin 0 -> 932 bytes
 .../square_kernel_0_core_0_2.opt.ll           |  65 ++
 .../square_kernel_0_core_0_2.peanohack.ll     |  84 +++
 .../air_project/square_kernel_0_core_0_3.elf  | Bin 0 -> 1640 bytes
 .../square_kernel_0_core_0_3.ld.script        |  69 ++
 .../air_project/square_kernel_0_core_0_3.ll   |  84 +++
 .../air_project/square_kernel_0_core_0_3.o    | Bin 0 -> 932 bytes
 .../square_kernel_0_core_0_3.opt.ll           |  65 ++
 .../square_kernel_0_core_0_3.peanohack.ll     |  84 +++
 .../air_project/square_kernel_0_core_0_4.elf  | Bin 0 -> 1640 bytes
 .../square_kernel_0_core_0_4.ld.script        |  69 ++
 .../air_project/square_kernel_0_core_0_4.ll   |  84 +++
 .../air_project/square_kernel_0_core_0_4.o    | Bin 0 -> 932 bytes
 .../square_kernel_0_core_0_4.opt.ll           |  65 ++
 .../square_kernel_0_core_0_4.peanohack.ll     |  84 +++
 .../air_project/square_kernel_0_core_0_5.elf  | Bin 0 -> 1600 bytes
 .../square_kernel_0_core_0_5.ld.script        |  66 ++
 .../air_project/square_kernel_0_core_0_5.ll   |  84 +++
 .../air_project/square_kernel_0_core_0_5.o    | Bin 0 -> 932 bytes
 .../square_kernel_0_core_0_5.opt.ll           |  65 ++
 .../square_kernel_0_core_0_5.peanohack.ll     |  84 +++
 .../air_project/square_kernel_0_design.bif    |  10 +
 ...uare_kernel_0_square_kernel_0_sequence.bin | Bin 0 -> 2288 bytes
 .../air_project/sub_kernel_0.pdi              | Bin 0 -> 7792 bytes
 .../air_project/sub_kernel_0_aie_cdo_elfs.bin | Bin 0 -> 2592 bytes
 .../sub_kernel_0_aie_cdo_enable.bin           | Bin 0 -> 104 bytes
 .../air_project/sub_kernel_0_aie_cdo_init.bin | Bin 0 -> 6032 bytes
 .../air_project/sub_kernel_0_core_0_2.elf     | Bin 0 -> 1656 bytes
 .../sub_kernel_0_core_0_2.ld.script           |  72 +++
 .../air_project/sub_kernel_0_core_0_2.ll      |  95 +++
 .../air_project/sub_kernel_0_core_0_2.o       | Bin 0 -> 984 bytes
 .../air_project/sub_kernel_0_core_0_2.opt.ll  |  64 ++
 .../sub_kernel_0_core_0_2.peanohack.ll        |  95 +++
 .../air_project/sub_kernel_0_core_0_3.elf     | Bin 0 -> 1720 bytes
 .../sub_kernel_0_core_0_3.ld.script           |  78 +++
 .../air_project/sub_kernel_0_core_0_3.ll      |  95 +++
 .../air_project/sub_kernel_0_core_0_3.o       | Bin 0 -> 984 bytes
 .../air_project/sub_kernel_0_core_0_3.opt.ll  |  64 ++
 .../sub_kernel_0_core_0_3.peanohack.ll        |  95 +++
 .../air_project/sub_kernel_0_core_0_4.elf     | Bin 0 -> 1724 bytes
 .../sub_kernel_0_core_0_4.ld.script           |  78 +++
 .../air_project/sub_kernel_0_core_0_4.ll      |  95 +++
 .../air_project/sub_kernel_0_core_0_4.o       | Bin 0 -> 984 bytes
 .../air_project/sub_kernel_0_core_0_4.opt.ll  |  64 ++
 .../sub_kernel_0_core_0_4.peanohack.ll        |  95 +++
 .../air_project/sub_kernel_0_core_0_5.elf     | Bin 0 -> 1660 bytes
 .../sub_kernel_0_core_0_5.ld.script           |  72 +++
 .../air_project/sub_kernel_0_core_0_5.ll      |  95 +++
 .../air_project/sub_kernel_0_core_0_5.o       | Bin 0 -> 984 bytes
 .../air_project/sub_kernel_0_core_0_5.opt.ll  |  64 ++
 .../sub_kernel_0_core_0_5.peanohack.ll        |  95 +++
 .../air_project/sub_kernel_0_design.bif       |  10 +
 .../sub_kernel_0_sub_kernel_0_sequence.bin    | Bin 0 -> 3248 bytes
 .../elementwise_arith/air_project/tt.mlir     |  35 +
 .../elementwise_arith/elementwise_arith.py    | 189 ++++++
 .../transform_binary_aie2p.mlir               |  40 ++
 .../transform_unary_aie2p.mlir                |  40 ++
 examples/elementwise_arith/tt.shared.mlir     |   1 +
 examples/generate_readme.py                   |   6 +
 155 files changed, 9150 insertions(+)
 create mode 100644 examples/elementwise_arith/air_project/aie.asm_air_output.mlir
 create mode 100644 examples/elementwise_arith/air_project/aie.elf
 create mode 100644 examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir
 create mode 100644 examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir
 create mode 100644 examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir
 create mode 100755 examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh
 create mode 100755 examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh
 create mode 100755 examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh
 create mode 100644 examples/elementwise_arith/air_project/airinput.mlir
 create mode 100644 examples/elementwise_arith/air_project/asm_air_output.mlir
 create mode 100644 examples/elementwise_arith/air_project/asm_src.mlir
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0.pdi
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_elfs.bin
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_enable.bin
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_init.bin
 create mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.elf
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.o
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll
 create mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.elf
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ld.script
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.o
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll
 create mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.elf
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.o
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll
 create mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.elf
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.o
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_design.bif
 create mode 100644 examples/elementwise_arith/air_project/div_kernel_0_div_kernel_0_sequence.bin
 create mode 100644 examples/elementwise_arith/air_project/empty_0.pdi
 create mode 100644 examples/elementwise_arith/air_project/empty_0_aie_cdo_elfs.bin
 create mode 100644 examples/elementwise_arith/air_project/empty_0_aie_cdo_enable.bin
 create mode 100644 examples/elementwise_arith/air_project/empty_0_aie_cdo_init.bin
 create mode 100644 examples/elementwise_arith/air_project/empty_0_design.bif
 create mode 100644 examples/elementwise_arith/air_project/full_elf_config.json
 create mode 100644 examples/elementwise_arith/air_project/input_with_addresses.mlir
 create mode 100644 examples/elementwise_arith/air_project/main.pdi
 create mode 100644 examples/elementwise_arith/air_project/main_aie_cdo_elfs.bin
 create mode 100644 examples/elementwise_arith/air_project/main_aie_cdo_enable.bin
 create mode 100644 examples/elementwise_arith/air_project/main_aie_cdo_init.bin
 create mode 100644 examples/elementwise_arith/air_project/main_design.bif
 create mode 100644 examples/elementwise_arith/air_project/main_div_kernel.bin
 create mode 100644 examples/elementwise_arith/air_project/main_mul_kernel.bin
 create mode 100644 examples/elementwise_arith/air_project/main_square_kernel.bin
 create mode 100644 examples/elementwise_arith/air_project/main_sub_kernel.bin
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0.pdi
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_elfs.bin
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_enable.bin
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_init.bin
 create mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.elf
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.o
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll
 create mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.elf
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ld.script
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.o
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll
 create mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.elf
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.o
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll
 create mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.elf
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.o
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_design.bif
 create mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_mul_kernel_0_sequence.bin
 create mode 100644 examples/elementwise_arith/air_project/npu.asm_air_output.mlir
 create mode 100644 examples/elementwise_arith/air_project/placed.asm_air_output.mlir
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0.pdi
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_elfs.bin
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_enable.bin
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_init.bin
 create mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.elf
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.o
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll
 create mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.elf
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ld.script
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.o
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll
 create mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.elf
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ld.script
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.o
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll
 create mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.elf
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.o
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_design.bif
 create mode 100644 examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0.pdi
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_elfs.bin
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_enable.bin
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_init.bin
 create mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.elf
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.o
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll
 create mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.elf
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.o
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll
 create mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.elf
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.o
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll
 create mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.elf
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ld.script
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.o
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_design.bif
 create mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_sub_kernel_0_sequence.bin
 create mode 100644 examples/elementwise_arith/air_project/tt.mlir
 create mode 100644 examples/elementwise_arith/elementwise_arith.py
 create mode 100644 examples/elementwise_arith/transform_binary_aie2p.mlir
 create mode 100644 examples/elementwise_arith/transform_unary_aie2p.mlir
 create mode 100644 examples/elementwise_arith/tt.shared.mlir

diff --git a/examples/elementwise_arith/air_project/aie.asm_air_output.mlir b/examples/elementwise_arith/air_project/aie.asm_air_output.mlir
new file mode 100644
index 0000000..e55b5a1
--- /dev/null
+++ b/examples/elementwise_arith/air_project/aie.asm_air_output.mlir
@@ -0,0 +1,386 @@
+#loop_annotation = #llvm.loop_annotation<mustProgress = true>
+module {
+  aie.device(npu2) @square_kernel_0 {
+    %shim_noc_tile_0_0 = aie.tile(0, 0)
+    %shim_noc_tile_1_0 = aie.tile(1, 0)
+    %mem_tile_0_1 = aie.tile(0, 1)
+    %mem_tile_1_1 = aie.tile(1, 1)
+    %tile_0_2 = aie.tile(0, 2)
+    %tile_0_3 = aie.tile(0, 3)
+    %tile_0_4 = aie.tile(0, 4)
+    %tile_0_5 = aie.tile(0, 5)
+    %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32}
+    %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32}
+    %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32}
+    %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32}
+    %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32}
+    %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32}
+    %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32}
+    %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32}
+    %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32}
+    %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32}
+    %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32}
+    %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32}
+    %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32}
+    %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32}
+    %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32}
+    %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32}
+    %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32}
+    %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32}
+    %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32}
+    %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32}
+    %buf9 = aie.buffer(%mem_tile_0_1) {sym_name = "buf9"} : memref<1024xi16, 1 : i32> 
+    %buf8 = aie.buffer(%mem_tile_1_1) {sym_name = "buf8"} : memref<1024xi16, 1> 
+    %buf7 = aie.buffer(%tile_0_5) {sym_name = "buf7"} : memref<256xi16, 2> 
+    %buf6 = aie.buffer(%tile_0_5) {sym_name = "buf6"} : memref<256xi16, 2> 
+    %buf5 = aie.buffer(%tile_0_4) {sym_name = "buf5"} : memref<256xi16, 2> 
+    %buf4 = aie.buffer(%tile_0_4) {sym_name = "buf4"} : memref<256xi16, 2> 
+    %buf3 = aie.buffer(%tile_0_3) {sym_name = "buf3"} : memref<256xi16, 2> 
+    %buf2 = aie.buffer(%tile_0_3) {sym_name = "buf2"} : memref<256xi16, 2> 
+    %buf1 = aie.buffer(%tile_0_2) {sym_name = "buf1"} : memref<256xi16, 2> 
+    %buf0 = aie.buffer(%tile_0_2) {sym_name = "buf0"} : memref<256xi16, 2> 
+    %mem_0_5 = aie.mem(%tile_0_5) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_12, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_11, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_5 = aie.core(%tile_0_5) {
+      %0 = ub.poison : i16
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb2
+      aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1)
+      cf.br ^bb2
+    ^bb2:  // pred: ^bb1
+      aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1)
+      scf.for %arg0 = %c0 to %c256 step %c32 {
+        %subview = memref.subview %buf7[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_14 = memref.subview %buf6[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %2 = arith.muli %1, %1 : vector<32xi16>
+        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+      } {loop_annotation = #loop_annotation}
+      aie.use_lock(%lock_0_5, Release, 1)
+      aie.use_lock(%lock_0_5_13, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_4 = aie.mem(%tile_0_4) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_9, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_8, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_4 = aie.core(%tile_0_4) {
+      %0 = ub.poison : i16
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb2
+      aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1)
+      cf.br ^bb2
+    ^bb2:  // pred: ^bb1
+      aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1)
+      scf.for %arg0 = %c0 to %c256 step %c32 {
+        %subview = memref.subview %buf5[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_14 = memref.subview %buf4[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %2 = arith.muli %1, %1 : vector<32xi16>
+        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+      } {loop_annotation = #loop_annotation}
+      aie.use_lock(%lock_0_4, Release, 1)
+      aie.use_lock(%lock_0_4_10, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_3 = aie.mem(%tile_0_3) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_6, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_5, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_3 = aie.core(%tile_0_3) {
+      %0 = ub.poison : i16
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb2
+      aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1)
+      cf.br ^bb2
+    ^bb2:  // pred: ^bb1
+      aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1)
+      scf.for %arg0 = %c0 to %c256 step %c32 {
+        %subview = memref.subview %buf3[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_14 = memref.subview %buf2[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %2 = arith.muli %1, %1 : vector<32xi16>
+        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+      } {loop_annotation = #loop_annotation}
+      aie.use_lock(%lock_0_3, Release, 1)
+      aie.use_lock(%lock_0_3_7, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_2 = aie.mem(%tile_0_2) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_3, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_2, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_2 = aie.core(%tile_0_2) {
+      %0 = ub.poison : i16
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb2
+      aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1)
+      cf.br ^bb2
+    ^bb2:  // pred: ^bb1
+      aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1)
+      scf.for %arg0 = %c0 to %c256 step %c32 {
+        %subview = memref.subview %buf1[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_14 = memref.subview %buf0[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %2 = arith.muli %1, %1 : vector<32xi16>
+        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+      } {loop_annotation = #loop_annotation}
+      aie.use_lock(%lock_0_2, Release, 1)
+      aie.use_lock(%lock_0_2_4, Release, 1)
+      cf.br ^bb1
+    }
+    air.channel @channel_0 []
+    air.channel @channel_2 [1, 1]
+    air.channel @channel_8 [1, 1]
+    air.channel @channel_9 [1, 1]
+    air.channel @channel_10 [1, 1]
+    air.channel @channel_4 [1, 1]
+    air.channel @channel_5 [1, 1]
+    air.channel @channel_6 [1, 1]
+    air.channel @channel_7 [1, 1]
+    air.channel @channel_3 []
+    aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0)
+    aie.flow(%mem_tile_1_1, DMA : 0, %shim_noc_tile_1_0, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0)
+    aie.flow(%tile_0_2, DMA : 0, %mem_tile_1_1, DMA : 0)
+    aie.flow(%tile_0_3, DMA : 0, %mem_tile_1_1, DMA : 1)
+    aie.flow(%tile_0_4, DMA : 0, %mem_tile_1_1, DMA : 2)
+    aie.flow(%tile_0_5, DMA : 0, %mem_tile_1_1, DMA : 3)
+    %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1, Release, 4)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb10
+    }
+    %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1_0, Release, 4)
+      aie.next_bd ^bb10
+    }
+    aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0)
+    aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0)
+  } {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>}
+  airrt.module_metadata{
+    airrt.segment_metadata attributes {dma_allocations = [{channel = 2 : i64, col = 0 : i64, id = 3 : i64, location = 0 : i64, row = -1 : i64}], sym_name = "square_kernel_0"}{
+      airrt.herd_metadata {dma_allocations = [], loc_x = 0 : i64, loc_y = 2 : i64, size_x = 1 : i64, size_y = 4 : i64, sym_name = "herd_0"}
+    }
+  }
+  air.channel @channel_0 []
+  air.channel @channel_1 [4, 1]
+  air.channel @channel_2 [4, 1]
+  air.channel @channel_3 []
+  func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
+    %c1 = arith.constant 1 : index
+    %0 = air.launch async (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> attributes {id = 1 : i32} {
+      %c1024 = arith.constant 1024 : index
+      %c1_0 = arith.constant 1 : index
+      %1 = arith.muli %arg8, %c1024 : index
+      %2 = air.channel.put async  @channel_0[] (%arg14[%1] [%c1024] [%c1_0]) {id = 1 : i32, metadataArray = [{base = "air_channel_0", index = 0 : i32}]} : (memref<*xi16>)
+      %3 = air.channel.get async  @channel_3[] (%arg15[%1] [%c1024] [%c1_0]) {id = 2 : i32, metadataArray = [{base = "air_channel_3", index = 0 : i32}]} : (memref<*xi16>)
+      %4 = air.segment @square_kernel_0 async  attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 8 : i64, y_loc = 2 : i64, y_size = 6 : i64} {
+        %c4 = arith.constant 4 : index
+        %c768 = arith.constant 768 : index
+        %c3 = arith.constant 3 : index
+        %c512 = arith.constant 512 : index
+        %c2 = arith.constant 2 : index
+        %c256 = arith.constant 256 : index
+        %c0 = arith.constant 0 : index
+        %c1_1 = arith.constant 1 : index
+        %async_token, %results = air.execute -> (memref<1024xi16, 1 : i32>) {
+          %alloc = memref.alloc() : memref<1024xi16, 1 : i32>
+          air.execute_terminator %alloc : memref<1024xi16, 1 : i32>
+        }
+        %5 = air.channel.get async [%async_token]  @channel_0[] (%results[] [] []) {id = 3 : i32} : (memref<1024xi16, 1 : i32>)
+        %async_token_2, %results_3 = air.execute -> (memref<1024xi16, 1>) {
+          %alloc = memref.alloc() : memref<1024xi16, 1>
+          air.execute_terminator %alloc : memref<1024xi16, 1>
+        }
+        %6 = air.channel.put async [%5]  @channel_1[%c0, %c0] (%results[%c0] [%c256] [%c1_1]) {id = 4 : i32} : (memref<1024xi16, 1 : i32>)
+        %7 = air.channel.put async [%5]  @channel_1[%c1_1, %c0] (%results[%c256] [%c256] [%c1_1]) {id = 5 : i32} : (memref<1024xi16, 1 : i32>)
+        %8 = air.channel.put async [%5]  @channel_1[%c2, %c0] (%results[%c512] [%c256] [%c1_1]) {id = 6 : i32} : (memref<1024xi16, 1 : i32>)
+        %9 = air.channel.put async [%5]  @channel_1[%c3, %c0] (%results[%c768] [%c256] [%c1_1]) {id = 7 : i32} : (memref<1024xi16, 1 : i32>)
+        %10 = air.channel.get async [%async_token_2]  @channel_2[%c0, %c0] (%results_3[%c0] [%c256] [%c1_1]) {id = 8 : i32} : (memref<1024xi16, 1>)
+        %11 = air.channel.get async [%async_token_2]  @channel_2[%c1_1, %c0] (%results_3[%c256] [%c256] [%c1_1]) {id = 9 : i32} : (memref<1024xi16, 1>)
+        %12 = air.channel.get async [%async_token_2]  @channel_2[%c2, %c0] (%results_3[%c512] [%c256] [%c1_1]) {id = 10 : i32} : (memref<1024xi16, 1>)
+        %13 = air.channel.get async [%async_token_2]  @channel_2[%c3, %c0] (%results_3[%c768] [%c256] [%c1_1]) {id = 11 : i32} : (memref<1024xi16, 1>)
+        %14 = air.herd @herd_0 async [%5, %async_token_2]  tile (%arg16, %arg17) in (%arg18=%c1_1, %arg19=%c4) attributes {id = 3 : i32, x_loc = 0 : i64, y_loc = 2 : i64} {
+          %c32 = arith.constant 32 : index
+          %c256_5 = arith.constant 256 : index
+          %c0_6 = arith.constant 0 : index
+          %16 = ub.poison : i16
+          %async_token_7, %results_8 = air.execute -> (memref<256xi16, 2>) {
+            %alloc = memref.alloc() : memref<256xi16, 2>
+            air.execute_terminator %alloc : memref<256xi16, 2>
+          }
+          %17 = air.channel.get async [%async_token_7]  @channel_1[%arg17, %c0_6] (%results_8[] [] []) {id = 12 : i32} : (memref<256xi16, 2>)
+          %async_token_9, %results_10 = air.execute -> (memref<256xi16, 2>) {
+            %alloc = memref.alloc() : memref<256xi16, 2>
+            air.execute_terminator %alloc : memref<256xi16, 2>
+          }
+          %18 = air.wait_all async [%17, %async_token_9] 
+          %19 = scf.for %arg20 = %c0_6 to %c256_5 step %c32 iter_args(%arg21 = %18) -> (!air.async.token) {
+            %subview = memref.subview %results_8[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+            %subview_13 = memref.subview %results_10[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+            %async_token_14, %results_15 = air.execute [%arg21] -> (vector<32xi16>) {
+              %23 = vector.transfer_read %subview[%c0_6], %16 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+              air.execute_terminator %23 : vector<32xi16>
+            }
+            %21 = arith.muli %results_15, %results_15 : vector<32xi16>
+            %async_token_16 = air.execute [%arg21] {
+              vector.transfer_write %21, %subview_13[%c0_6] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+            }
+            %22 = air.wait_all async [%async_token_14, %async_token_16] 
+            scf.yield %22 : !air.async.token
+          }
+          %20 = air.channel.put async [%async_token_9]  @channel_2[%arg17, %c0_6] (%results_10[] [] []) {id = 13 : i32} : (memref<256xi16, 2>)
+          %async_token_11 = air.execute [%17] {
+            memref.dealloc %results_8 : memref<256xi16, 2>
+          }
+          %async_token_12 = air.execute [%20] {
+            memref.dealloc %results_10 : memref<256xi16, 2>
+          }
+        }
+        %15 = air.channel.put async [%14]  @channel_3[] (%results_3[] [] []) {id = 14 : i32} : (memref<1024xi16, 1>)
+        %async_token_4 = air.execute [%15] {
+          memref.dealloc %results_3 : memref<1024xi16, 1>
+        }
+        air.wait_all [%6, %7, %8, %9, %10, %11, %12, %13, %async_token_4]  {air.segment_end}
+      }
+    }
+    return
+  }
+}
diff --git a/examples/elementwise_arith/air_project/aie.elf b/examples/elementwise_arith/air_project/aie.elf
new file mode 100644
index 0000000000000000000000000000000000000000..d54eb1201d9709da9aaf31632fd7188b680af270
GIT binary patch
literal 29488
zcmeHQU2GiH6~43HBuksZx^>Z#1Tpym0*$P798^G28DpmkZ7P-oP*jkzCYTU4aTdpE
zL#2wPjVNtVWwm`URF(YF7sSJ=KZvF;TfqaZP_^PIFO3|^BSIU*1C?mq^WA&T+%rEn
z4JC4HG{>6VbH8)u-ZST(o!y=J_WGGU`@S9o!JeuzQc?l^8QDco)fVfax<+kRLu6e`
z5Ns?V?SQRp0qo!8PDpQ&j%^L;7zd_a$zY_acFk<k-q@&A{{~p|_wR0^F~|qfub55l
zkMSGZjNLbW_hw`FjURo@zki4J?c8JR+rOd|HXh%g=J6l&G%>nm@cHjO@z7g;e)&aY
z3CEAn@XUGqW&3YK)Qsi6iAE!=hN>PTPQiZvkDR{!FE3#{x&P$RsY6GPPac_a9a3iV
z<=5Nqx{*??c#~dBRvsjf>Z2sbkba*=|M}9=zfoi36Zg{yN)3?y$3JzJ25B(fK5O}Y
zi;C*6wBcFGa^!o@`^kL2K{0&zdz0@{K-mws#<hfY_55Z`i`l4MOSM(1T@mfUz*G$8
zaZNC74|R(w2Wo*{L2B`ub@AcfLKXVXHwHr0-=Qwxl(MrMs7Hd30#dJ5Ri_t_qAo0*
zJs6z3W{XlkT6gxKI`_?pdak^B`nN&1I->VgT6WaWtY7<|`EUJvaK7>P!FR{jO+G&|
z|7Bz2$J+G~75&~|@rL<lp1N@&x<FA*gzr$H)A-cmv6=N4-^4@hGc(tUc5kfq*bC38
z&DL%@g}qb4{;o~?pEWJgSlrk^(*Ctd^LnM`pQDqrs#VnWXXsSKds_cUpP6kuXK_9+
zA(HvuynbD&)&Jc<=f@w-ANhFQj#4=qu*ah!olo+jQ!`H4T55N_;U+=VUVMFBO>MW<
zHz*??E#Zal>Hs^T&2_+St^;m!9dKKPqp7+YN!3-dt<ERg712)C)6Sxv)M~6JGrwFP
zH9NoD=K1F~&p)?SIG(Dfu~a=J+v<F>T@meMJ>6T>)2gf|GrwFPH9NoD=K1F~&p)?S
zcpz0z52WfT*;eP1?TTn8>uG;cPph(?%=~hF)a?9no9Ca~JpX88{`s?%{Op9!R%k;8
z0@C`MW4$W$Q0R4`M?!B1y(#pT(Az?v6Z(SCJ3{XYy(jcVq4%wBj&C&a8o_&f^7ux?
z@i{&0JH0OSNazirH-+93dRypoLSGPiN9bLl_k_ME^uE>2@r|dB&$QkAjfvxPdf0b*
zUFea}8$xdiy(RRv(C37{AoPyVyF%{?eNpIrtDECHkUBoocE|UCI6kL`eW%xj9tpi6
z^rp~TLT?LwPUs6l?+Cps^q$Zch29r>Rc+GOFZ64TbUrS^OZd)dab2>|>r~r;#Y0i^
zRe>vk*Qr*VzpLr0z?G=2I_2K^yBetqT#31=)3G>zm-DK?l|9};oKLK*3Gi*6q(}P%
zIt8X}9~bW_ug&uhwSScGMdP1`%GV-4Lv#*=pmwTO=lIIhhOX;M*aiu7@r-@Kc;Fvl
z$b`6f$WCJ7F?v94#9F5Jf9NJ2XG_LIbM}ey1OJG6O^S<$>?9^0qq}%at(bV6Eg8?|
zPZ$sUBR!^kaPg3x#KdFtfZ8TMrq)e7&Sv8IoZe5V4U|Zz+_tcNias7YBmG18V*s@w
z>1bPHOVP8^UkzCQ;HP@8^iFC2i2E<yo_5vkw13F`54_~<AGz4Imf|2mF6vgt4mwZN
zhOYCJvaP2!eV5j@O3F4wZFH)0pR#SDHZFt0RueY9Ua}_Od{Z8E<lLs6ghWeY>3kZ0
zPd?NBG+)NQls{h?{{ido$!FT1=F46{_RQDE(HWb6eN;C8TIc-lUb_L;D|D{zBpt%G
z32O7)L@C=oYIBbFrfg+u^W3PEZFheF*L>TEYv>|f53_oO@I6X(fh&U>{euB@M{v`1
z8dn?fO40jn4>rd8)(Pi+l)xjxDSqu=7WfXrX%vlD1b!#s6p_Z)3%uGNDA6SwjYk&;
zaA;;<!bQx$yE@Jf21*?BHnlPKr`ViM__em5I|W`QJRVEn6~ez@{dWm`J>l0`e7C^M
z{jIpBtmCX*+#2U6B0SEa!1ogF=EeC>5KixQx}Q+^?;||Ut-ucw9>*i_a(`6xGaARU
zi*Of@!*>%-m*@1j+b%kMAK`9XhYu0%#&vkPKQ4}IT#P%e=)#=tf73;W?<0TLzr%+J
zcl|rO+&>`tKM?mnM7Zn6;kEwO(&z2CJ1%aG>mejOo_B!{5&lK%zgOU!2#@PW;I;m!
z=zmo7KPvinc&$G!{KvJwJ{R{X`t@`;^!|i)xgV73)T9^_-@nKU$Ekhz8l-ERYe=qb
zt^uGe<+FDG0iO<elgGIn_a83jevb;uYb!Jf5e0|)fbo3LO#sZXyLyd=mM%S_HXx;E
z=|TJHrh6#U{+E)Wx1AmDIdQ9JYU@*st7D7%vq21SFmK~cg@j(f>Lz8ssc1x%*+#E1
z#MGt@HLjzh`zvvuS2{P59wdHPw%pipS9EULaay#pY`L-HuISu!uTpO8xGOq0?$xT3
zv}|=_$6e96iQ+Ta^5({lyP|W`zDl{V<F4r3bgxow?6_AVH~e|$emWFzjw&j7Oz{}u
z*7)QK4_wh?8+aNJe2~B<ZMatt$+O|Uz0Zbw_-R|95j<>BNI}8nSY7wt)Ahphn^!M+
zHm;XE8`n#o4d1<3P(yOOD&6~s<kcs50X`cz0zMn~0%@D8-JM14>g=r6+BI{_^DAqK
z>m|>|^^#}fddahaYr>MP-7#&-uU)?ld<00|e&8mgZGlGcaBoq&Iv1<8cFo-45HR=2
z+V$JGUh?+iddaha*TRAtlH>6`yu2C&-vN?m1NXsa0}mo?bG5s_s9l|%)mpn|Zh3xX
zEpff%*|=WvY+NsSHt=N_Prp{!#&*OKOMi2$;+h!??&GsaobUC+CUNiqSR@{K@P-F(
zdhnJ9Z+q}L55C~RJ0862!FwKj(S!F>IDfX}&w9D%4)HmV=ah5ITnV&gzQLzpk$C9A
z!OLKg{*ec7c<`nNZ+Y;x2cPrc3m&}V!Mh&3=fM{}ct3@^c{87L^5@Oikk{<Ijd8kD
z^Olc?mg9n*<otji!y@s>gEu^Q(}TA>c-w=|dGG}f-tpjF58m_Oiypk6!ri=?_xSvI
zbMODzd2{D;nQ!omSR@{LaPX8^q<`eW8y>vr!CM}@?ZM|f_<{%Tc<`<V?|JY=58n6S
z;HN=Av5|Q*pVG7`frGc^@z*$PvY)y=KL;s1Or5u}AEfH>S04YmJ&$Dj&p-cGJ^m_n
zJ~GkJJv>|Q+5Yp-6IGADO1*Ae|E6xU{pZ(n)y`wuU!~?3b}hXJ%nx@w95eO<?`}JG
z2qL|kd;ph=+swa1Th23oj!(x0P97wU^XD<fIS$nuyUcjmFVg+Xhh)5Cu_+m^!Fhak
z0%s4B#`(Js<9OE68@bGQ*)Os?x~F8kp1QTyl6ZXHyX?XFIs^N=csE~WyzKYC;@!-l
zr0T)oJU%;>?e*L=&e!x9$Fq?Qnp2@qP?z8UvOn?3jK2o*Ic4DJu|N2K`aV4X4`)sK
zgZ~%%UxEY3{e%A(`+GQR=%0_4d4OokdX=0&i4%`ZpLa2y#EDaui-Rx7eM^7I9h5jY
zgz(3M#H*hE!7GG6cyH2QatwiUeiZTZ;{5pLmpF#8Kk@%!f6sh@1IYeV56=G*9K+aO
z_V1Z5*+12T>0fhH{TxH&1VZy`6nd2Tq50J~;*dDFhY(;Ampnv?gO3PF_anK9(jOc}
zNP4~`Us3vlw+KnsgXAzue{dNg;LS<g@)}`E;NUnyz!{YOmhT8t5-08>1YE*|zu`M-
z-kiiOClb3P#<l#&3~srS863Px2<17^kL6Hi{K2P$P~H;$mRp(e2hS2hN0;!ooXd<q
z_?G~yDSykw%=lBDwSP{YHBLKpW!@C=HPJ~r&z8HH>EH4=vp8`v0fLt&`?vhgj6b-Z
z05ivBc$^u3a6kcq2P*rwe9(+PxS;^STa^BmCz|mGXA}b7sPwn|(TqR1q!92orT=!r
z`^@-*V+zT>UJOpAL_gr3LWrW4z%36IrUVX7DkM|4o_N4jg=F&M@dt+$l8wjUWJ<&X
zZYw04pL~CCULoMU%6wY>D@+L-Tv$l<c=P)QM;4Mj-h6*>XCc}7GB}wM@$9nx;ML0b
zEx#6BC2(+UA=&uz`v(UXl8ry#AKY9>HhzPXDG?7iyO3-=`TpSY!XJEIdAyd_3sV9I
z#}{Dc^=|mP8Gmqp0fPT4`?oycj6XQR05j(wPygTw!#^9p!O4^uH?;oQ`1Ae2Eza-X
z;N1TJoFL#FL-2WnxX1GPH~7ae(b)PkiL~*xYV3_IX&WWP;8~lgoBaLaT8R6EpO!1l
z1?7J2^YZc5%z$zJnF~uVu!1MM3UbpnG4kWG)r}o@Mb}N2vtpi?EjM=D6`dP>MsxXD
zw%pipS9ETo_$;pzv~0Pt<F4r3w69Wb?6@mBH{Gk08$0fm$c-94`Q*vjnc2z5NaN?H
zkLz~p@uR~dY8X#D&C{qFK0G^fY<BAO>~LKTKmPPI?PH#wKCXsmrjAVl)X3!VqleY-
zk(m>xTJXJeXEcoQBxU-<?9}k-nc3mPC#Ih`dgRp1<m}NC)Mt!8bl-$hhrT^>$H}Ko
zP0mam`p(qM^whCK^~T}kpGw&eot%2=)YSChDcX70@yVmpN%zL#<PPp+X*D_9TL`;Z
zc;J`cb{RyC!v1i=PP0{4Hk~6{73Nrt$B5`Rcl2w;c#dtGzSqPuNS}=Hb3H8){HypL
zV8pgVYOS%I&O_IqsFm7Hn8wNP572>=evqCu-ZpDbYMk_03Pj^wrQU-Mf70jZbGzw3
zP!$T!^dG2Kpkv%9P}}I@g6Tg{Z$by&4pc;68cqL!dKWrys_jSV%eLu1Q1}j1u}L_%
zg>Zdcl465ylQFy1;t|CL9XRQ8+brIuK5n)+>90UXKV4d1@)nDe{w8$YKgD*d#c5vO
zg$|sK<x#q+WcsIIZ?iZZ%W3Esm-J0^+0^tOs9!+`-o?Qw{Qgv56{bT}FBk~0f(jc;
z@%u>5#HWA0oZm-&*79Afhp85d^)Ss^oOlIY;Fr-OfnI;JSPzrlROnta-p2Gszjx>L
z5YZl_6S7-hwO9|6?$*U?yP}j<M?IPOMZ9`l7(3tG=K1F~&p)?|^)QR|dXww*7VBXa
z>px$~^`DFNFmbNM`bo@(ht3>$y(W!a|B~OMi}f%~zCz2tM825Q2|W^cL+DMRw}jpn
z`kc@ggx(Q)SLi*VFABYHbu)j(dYE)~66LGw9Q<K?*T-BjXBawo!i+OF*vxIQ9wz6d
zSPv8Hg&ICBUR(SdZ<n89J<RlamBo6P#d?@Hk1|gcuN%otg{@c*Q*$%*x`@Skm}Y&*
zA;SmMIVsk|v^gOz9(Z{AH6zx~EY`y`_iy<73f_;gnRc-rCZ!^lm9ict<_Ek&+}6WJ
zw<XoV{SWFjJke$wu!oy>fA!W!fA;+~KfLYkd#nF?pUjsn`m>qvKG#s{HPMD0LVQej
zub1>W`kC3|Fr?&5)Ye}nEdC++QI{G+UmQEm=xF1(AWdqtcra??_#j(oV?WsO{{|%i
zl>O-TX6%RmH;Df`nAcxj_%+F=OT<j9H_F2O?Iyt9*43to{Vpc<#q$Jh$PLRu@@rB9
zT_SEgqtAV`;n-PTBR||<C0?}__~BkT?#XjbVAxClh!6X7h<L_!h_*r6SeoPq95EyR
z*azc1NgL-6|3|S!etCI0A^h<F6c-=L&+~VMAO5dmb2`7j9|%AEe?|PCMXYni!g>A)
r0rEI{6bK&&;^*V|1?lqs^EvVx;?C$9#|U|uHtF{s`Gqi8VZZ+Zn&r4p

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir b/examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir
new file mode 100644
index 0000000..15c21c6
--- /dev/null
+++ b/examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir
@@ -0,0 +1,411 @@
+#loop_annotation = #llvm.loop_annotation<mustProgress = true>
+module {
+  aie.device(npu2) @sub_kernel_0 {
+    %shim_noc_tile_0_0 = aie.tile(0, 0)
+    %shim_noc_tile_1_0 = aie.tile(1, 0)
+    %shim_noc_tile_2_0 = aie.tile(2, 0)
+    %mem_tile_0_1 = aie.tile(0, 1)
+    %mem_tile_1_1 = aie.tile(1, 1)
+    %mem_tile_2_1 = aie.tile(2, 1)
+    %tile_0_2 = aie.tile(0, 2)
+    %tile_0_3 = aie.tile(0, 3)
+    %tile_0_4 = aie.tile(0, 4)
+    %tile_0_5 = aie.tile(0, 5)
+    %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32}
+    %lock_1_1_0 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32}
+    %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32}
+    %lock_0_1_1 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32}
+    %lock_2_1 = aie.lock(%mem_tile_2_1, 1) {init = 4 : i32}
+    %lock_2_1_2 = aie.lock(%mem_tile_2_1, 0) {init = 0 : i32}
+    %lock_0_2 = aie.lock(%tile_0_2, 5) {init = 1 : i32}
+    %lock_0_2_3 = aie.lock(%tile_0_2, 4) {init = 0 : i32}
+    %lock_0_2_4 = aie.lock(%tile_0_2, 3) {init = 1 : i32}
+    %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i32}
+    %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i32}
+    %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i32}
+    %lock_0_3 = aie.lock(%tile_0_3, 5) {init = 1 : i32}
+    %lock_0_3_8 = aie.lock(%tile_0_3, 4) {init = 0 : i32}
+    %lock_0_3_9 = aie.lock(%tile_0_3, 3) {init = 1 : i32}
+    %lock_0_3_10 = aie.lock(%tile_0_3, 2) {init = 0 : i32}
+    %lock_0_3_11 = aie.lock(%tile_0_3, 1) {init = 1 : i32}
+    %lock_0_3_12 = aie.lock(%tile_0_3, 0) {init = 0 : i32}
+    %lock_0_4 = aie.lock(%tile_0_4, 5) {init = 1 : i32}
+    %lock_0_4_13 = aie.lock(%tile_0_4, 4) {init = 0 : i32}
+    %lock_0_4_14 = aie.lock(%tile_0_4, 3) {init = 1 : i32}
+    %lock_0_4_15 = aie.lock(%tile_0_4, 2) {init = 0 : i32}
+    %lock_0_4_16 = aie.lock(%tile_0_4, 1) {init = 1 : i32}
+    %lock_0_4_17 = aie.lock(%tile_0_4, 0) {init = 0 : i32}
+    %lock_0_5 = aie.lock(%tile_0_5, 5) {init = 1 : i32}
+    %lock_0_5_18 = aie.lock(%tile_0_5, 4) {init = 0 : i32}
+    %lock_0_5_19 = aie.lock(%tile_0_5, 3) {init = 1 : i32}
+    %lock_0_5_20 = aie.lock(%tile_0_5, 2) {init = 0 : i32}
+    %lock_0_5_21 = aie.lock(%tile_0_5, 1) {init = 1 : i32}
+    %lock_0_5_22 = aie.lock(%tile_0_5, 0) {init = 0 : i32}
+    %buf14 = aie.buffer(%mem_tile_0_1) {sym_name = "buf14"} : memref<1024xi16, 1 : i32> 
+    %buf13 = aie.buffer(%mem_tile_1_1) {sym_name = "buf13"} : memref<1024xi16, 1 : i32> 
+    %buf12 = aie.buffer(%mem_tile_2_1) {sym_name = "buf12"} : memref<1024xi16, 1> 
+    %buf11 = aie.buffer(%tile_0_5) {sym_name = "buf11"} : memref<256xi16, 2> 
+    %buf10 = aie.buffer(%tile_0_5) {sym_name = "buf10"} : memref<256xi16, 2> 
+    %buf9 = aie.buffer(%tile_0_5) {sym_name = "buf9"} : memref<256xi16, 2> 
+    %buf8 = aie.buffer(%tile_0_4) {sym_name = "buf8"} : memref<256xi16, 2> 
+    %buf7 = aie.buffer(%tile_0_4) {sym_name = "buf7"} : memref<256xi16, 2> 
+    %buf6 = aie.buffer(%tile_0_4) {sym_name = "buf6"} : memref<256xi16, 2> 
+    %buf5 = aie.buffer(%tile_0_3) {sym_name = "buf5"} : memref<256xi16, 2> 
+    %buf4 = aie.buffer(%tile_0_3) {sym_name = "buf4"} : memref<256xi16, 2> 
+    %buf3 = aie.buffer(%tile_0_3) {sym_name = "buf3"} : memref<256xi16, 2> 
+    %buf2 = aie.buffer(%tile_0_2) {sym_name = "buf2"} : memref<256xi16, 2> 
+    %buf1 = aie.buffer(%tile_0_2) {sym_name = "buf1"} : memref<256xi16, 2> 
+    %buf0 = aie.buffer(%tile_0_2) {sym_name = "buf0"} : memref<256xi16, 2> 
+    %mem_0_5 = aie.mem(%tile_0_5) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_5_22, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_21, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb5
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_5_19, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf11 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_20, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf10 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_18, Release, 1)
+      aie.next_bd ^bb6
+    }
+    %core_0_5 = aie.core(%tile_0_5) {
+      %0 = ub.poison : i16
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_5_21, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_5_20, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_5_18, AcquireGreaterEqual, 1)
+      scf.for %arg0 = %c0 to %c256 step %c32 {
+        %subview = memref.subview %buf11[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_23 = memref.subview %buf10[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_24 = memref.subview %buf9[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %3 = arith.subi %1, %2 : vector<32xi16>
+        vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+      } {loop_annotation = #loop_annotation}
+      aie.use_lock(%lock_0_5_19, Release, 1)
+      aie.use_lock(%lock_0_5, Release, 1)
+      aie.use_lock(%lock_0_5_22, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_4 = aie.mem(%tile_0_4) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_4_17, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_16, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb5
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_4_14, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_15, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_13, Release, 1)
+      aie.next_bd ^bb6
+    }
+    %core_0_4 = aie.core(%tile_0_4) {
+      %0 = ub.poison : i16
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_4_16, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_4_15, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_4_13, AcquireGreaterEqual, 1)
+      scf.for %arg0 = %c0 to %c256 step %c32 {
+        %subview = memref.subview %buf8[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_23 = memref.subview %buf7[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_24 = memref.subview %buf6[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %3 = arith.subi %1, %2 : vector<32xi16>
+        vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+      } {loop_annotation = #loop_annotation}
+      aie.use_lock(%lock_0_4_14, Release, 1)
+      aie.use_lock(%lock_0_4, Release, 1)
+      aie.use_lock(%lock_0_4_17, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_3 = aie.mem(%tile_0_3) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_3_12, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_11, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb5
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_3_9, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_10, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_8, Release, 1)
+      aie.next_bd ^bb6
+    }
+    %core_0_3 = aie.core(%tile_0_3) {
+      %0 = ub.poison : i16
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_3_11, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_3_10, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_3_8, AcquireGreaterEqual, 1)
+      scf.for %arg0 = %c0 to %c256 step %c32 {
+        %subview = memref.subview %buf5[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_23 = memref.subview %buf4[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_24 = memref.subview %buf3[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %3 = arith.subi %1, %2 : vector<32xi16>
+        vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+      } {loop_annotation = #loop_annotation}
+      aie.use_lock(%lock_0_3_9, Release, 1)
+      aie.use_lock(%lock_0_3, Release, 1)
+      aie.use_lock(%lock_0_3_12, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_2 = aie.mem(%tile_0_2) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_2_7, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_6, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb5
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_5, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_3, Release, 1)
+      aie.next_bd ^bb6
+    }
+    %core_0_2 = aie.core(%tile_0_2) {
+      %0 = ub.poison : i16
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1)
+      scf.for %arg0 = %c0 to %c256 step %c32 {
+        %subview = memref.subview %buf2[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_23 = memref.subview %buf1[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_24 = memref.subview %buf0[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %3 = arith.subi %1, %2 : vector<32xi16>
+        vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+      } {loop_annotation = #loop_annotation}
+      aie.use_lock(%lock_0_2_4, Release, 1)
+      aie.use_lock(%lock_0_2, Release, 1)
+      aie.use_lock(%lock_0_2_7, Release, 1)
+      cf.br ^bb1
+    }
+    aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0)
+    aie.flow(%shim_noc_tile_1_0, DMA : 0, %mem_tile_1_1, DMA : 0)
+    aie.flow(%mem_tile_2_1, DMA : 0, %shim_noc_tile_2_0, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0)
+    aie.flow(%mem_tile_1_1, DMA : 0, %tile_0_2, DMA : 1)
+    aie.flow(%mem_tile_1_1, DMA : 1, %tile_0_3, DMA : 1)
+    aie.flow(%mem_tile_1_1, DMA : 2, %tile_0_4, DMA : 1)
+    aie.flow(%mem_tile_1_1, DMA : 3, %tile_0_5, DMA : 1)
+    aie.flow(%tile_0_2, DMA : 0, %mem_tile_2_1, DMA : 0)
+    aie.flow(%tile_0_3, DMA : 0, %mem_tile_2_1, DMA : 1)
+    aie.flow(%tile_0_4, DMA : 0, %mem_tile_2_1, DMA : 2)
+    aie.flow(%tile_0_5, DMA : 0, %mem_tile_2_1, DMA : 3)
+    %memtile_dma_2_1 = aie.memtile_dma(%mem_tile_2_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_2_1_2, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 1024) {task_id = 0 : i32}
+      aie.use_lock(%lock_2_1, Release, 4)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_2_1_2, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 256, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_2_1_2, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 512, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_2_1_2, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 768, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_2_1_2, Release, 1)
+      aie.next_bd ^bb10
+    }
+    %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1_1, Release, 4)
+      aie.next_bd ^bb10
+    }
+    %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_0, Release, 4)
+      aie.next_bd ^bb10
+    }
+    aie.shim_dma_allocation @air_channel_5(%shim_noc_tile_2_0, S2MM, 0)
+    aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0)
+    aie.shim_dma_allocation @air_channel_1(%shim_noc_tile_1_0, MM2S, 0)
+    aie.runtime_sequence @sub_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
+      %0 = aiex.dma_configure_task_for @air_channel_0 {
+        aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
+        aie.end
+      }
+      aiex.dma_start_task(%0)
+      %1 = aiex.dma_configure_task_for @air_channel_1 {
+        aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
+        aie.end
+      }
+      aiex.dma_start_task(%1)
+      %2 = aiex.dma_configure_task_for @air_channel_5 {
+        aie.dma_bd(%arg2 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
+        aie.end
+      } {issue_token = true}
+      aiex.dma_start_task(%2)
+      aiex.dma_free_task(%0)
+      aiex.dma_await_task(%2)
+      aiex.dma_free_task(%1)
+    }
+  } {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>}
+  aie.device(npu2) {
+    aie.runtime_sequence @sub_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
+      aiex.configure @sub_kernel_0 {
+        aiex.run @sub_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (memref<*xi16>, memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32)
+      }
+    }
+  }
+}
diff --git a/examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir b/examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir
new file mode 100644
index 0000000..fc9d492
--- /dev/null
+++ b/examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir
@@ -0,0 +1,601 @@
+#loop_annotation = #llvm.loop_annotation<mustProgress = true>
+module {
+  aie.device(npu2) @mul_kernel_0 {
+    %shim_noc_tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
+    %shim_noc_tile_1_0 = aie.tile(1, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
+    %shim_noc_tile_2_0 = aie.tile(2, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
+    %mem_tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
+    %mem_tile_1_1 = aie.tile(1, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
+    %mem_tile_2_1 = aie.tile(2, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
+    %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 27>}
+    %tile_0_3 = aie.tile(0, 3) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 29>}
+    %tile_0_4 = aie.tile(0, 4) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 30>}
+    %tile_0_5 = aie.tile(0, 5) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 31>}
+    %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32}
+    %lock_1_1_0 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32}
+    %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32}
+    %lock_0_1_1 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32}
+    %lock_2_1 = aie.lock(%mem_tile_2_1, 1) {init = 4 : i32}
+    %lock_2_1_2 = aie.lock(%mem_tile_2_1, 0) {init = 0 : i32}
+    %lock_0_2 = aie.lock(%tile_0_2, 5) {init = 1 : i32}
+    %lock_0_2_3 = aie.lock(%tile_0_2, 4) {init = 0 : i32}
+    %lock_0_2_4 = aie.lock(%tile_0_2, 3) {init = 1 : i32}
+    %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i32}
+    %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i32}
+    %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i32}
+    %lock_0_3 = aie.lock(%tile_0_3, 5) {init = 1 : i32}
+    %lock_0_3_8 = aie.lock(%tile_0_3, 4) {init = 0 : i32}
+    %lock_0_3_9 = aie.lock(%tile_0_3, 3) {init = 1 : i32}
+    %lock_0_3_10 = aie.lock(%tile_0_3, 2) {init = 0 : i32}
+    %lock_0_3_11 = aie.lock(%tile_0_3, 1) {init = 1 : i32}
+    %lock_0_3_12 = aie.lock(%tile_0_3, 0) {init = 0 : i32}
+    %lock_0_4 = aie.lock(%tile_0_4, 5) {init = 1 : i32}
+    %lock_0_4_13 = aie.lock(%tile_0_4, 4) {init = 0 : i32}
+    %lock_0_4_14 = aie.lock(%tile_0_4, 3) {init = 1 : i32}
+    %lock_0_4_15 = aie.lock(%tile_0_4, 2) {init = 0 : i32}
+    %lock_0_4_16 = aie.lock(%tile_0_4, 1) {init = 1 : i32}
+    %lock_0_4_17 = aie.lock(%tile_0_4, 0) {init = 0 : i32}
+    %lock_0_5 = aie.lock(%tile_0_5, 5) {init = 1 : i32}
+    %lock_0_5_18 = aie.lock(%tile_0_5, 4) {init = 0 : i32}
+    %lock_0_5_19 = aie.lock(%tile_0_5, 3) {init = 1 : i32}
+    %lock_0_5_20 = aie.lock(%tile_0_5, 2) {init = 0 : i32}
+    %lock_0_5_21 = aie.lock(%tile_0_5, 1) {init = 1 : i32}
+    %lock_0_5_22 = aie.lock(%tile_0_5, 0) {init = 0 : i32}
+    %buf14 = aie.buffer(%mem_tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf14"} : memref<1024xi16, 1 : i32> 
+    %buf13 = aie.buffer(%mem_tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf13"} : memref<1024xi16, 1 : i32> 
+    %buf12 = aie.buffer(%mem_tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf12"} : memref<1024xi16, 1> 
+    %buf11 = aie.buffer(%tile_0_5) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf11"} : memref<256xi16, 2> 
+    %buf10 = aie.buffer(%tile_0_5) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf10"} : memref<256xi16, 2> 
+    %buf9 = aie.buffer(%tile_0_5) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf9"} : memref<256xi16, 2> 
+    %buf8 = aie.buffer(%tile_0_4) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf8"} : memref<256xi16, 2> 
+    %buf7 = aie.buffer(%tile_0_4) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf7"} : memref<256xi16, 2> 
+    %buf6 = aie.buffer(%tile_0_4) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf6"} : memref<256xi16, 2> 
+    %buf5 = aie.buffer(%tile_0_3) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<256xi16, 2> 
+    %buf4 = aie.buffer(%tile_0_3) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf4"} : memref<256xi16, 2> 
+    %buf3 = aie.buffer(%tile_0_3) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf3"} : memref<256xi16, 2> 
+    %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<256xi16, 2> 
+    %buf1 = aie.buffer(%tile_0_2) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf1"} : memref<256xi16, 2> 
+    %buf0 = aie.buffer(%tile_0_2) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf0"} : memref<256xi16, 2> 
+    %mem_0_5 = aie.mem(%tile_0_5) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_5_22, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_21, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb5
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_5_19, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf11 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_20, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf10 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_18, Release, 1)
+      aie.next_bd ^bb6
+    }
+    %core_0_5 = aie.core(%tile_0_5) {
+      %c0_i32 = arith.constant 0 : i32
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb4
+      aie.use_lock(%lock_0_5_21, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_5_20, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_5_18, AcquireGreaterEqual, 1)
+      cf.br ^bb2(%c0 : index)
+    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
+      %1 = arith.cmpi slt, %0, %c256 : index
+      cf.cond_br %1, ^bb3, ^bb4
+    ^bb3:  // pred: ^bb2
+      %2 = vector.load %buf11[%0] : memref<256xi16, 2>, vector<32xi16>
+      %3 = vector.load %buf10[%0] : memref<256xi16, 2>, vector<32xi16>
+      %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32>
+      %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
+      vector.store %5, %buf9[%0] : memref<256xi16, 2>, vector<32xi16>
+      %6 = arith.addi %0, %c32 : index
+      cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation}
+    ^bb4:  // pred: ^bb2
+      aie.use_lock(%lock_0_5_19, Release, 1)
+      aie.use_lock(%lock_0_5, Release, 1)
+      aie.use_lock(%lock_0_5_22, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_4 = aie.mem(%tile_0_4) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_4_17, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_16, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb5
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_4_14, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_15, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_13, Release, 1)
+      aie.next_bd ^bb6
+    }
+    %core_0_4 = aie.core(%tile_0_4) {
+      %c0_i32 = arith.constant 0 : i32
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb4
+      aie.use_lock(%lock_0_4_16, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_4_15, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_4_13, AcquireGreaterEqual, 1)
+      cf.br ^bb2(%c0 : index)
+    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
+      %1 = arith.cmpi slt, %0, %c256 : index
+      cf.cond_br %1, ^bb3, ^bb4
+    ^bb3:  // pred: ^bb2
+      %2 = vector.load %buf8[%0] : memref<256xi16, 2>, vector<32xi16>
+      %3 = vector.load %buf7[%0] : memref<256xi16, 2>, vector<32xi16>
+      %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32>
+      %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
+      vector.store %5, %buf6[%0] : memref<256xi16, 2>, vector<32xi16>
+      %6 = arith.addi %0, %c32 : index
+      cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation}
+    ^bb4:  // pred: ^bb2
+      aie.use_lock(%lock_0_4_14, Release, 1)
+      aie.use_lock(%lock_0_4, Release, 1)
+      aie.use_lock(%lock_0_4_17, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_3 = aie.mem(%tile_0_3) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_3_12, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_11, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb5
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_3_9, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_10, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_8, Release, 1)
+      aie.next_bd ^bb6
+    }
+    %core_0_3 = aie.core(%tile_0_3) {
+      %c0_i32 = arith.constant 0 : i32
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb4
+      aie.use_lock(%lock_0_3_11, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_3_10, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_3_8, AcquireGreaterEqual, 1)
+      cf.br ^bb2(%c0 : index)
+    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
+      %1 = arith.cmpi slt, %0, %c256 : index
+      cf.cond_br %1, ^bb3, ^bb4
+    ^bb3:  // pred: ^bb2
+      %2 = vector.load %buf5[%0] : memref<256xi16, 2>, vector<32xi16>
+      %3 = vector.load %buf4[%0] : memref<256xi16, 2>, vector<32xi16>
+      %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32>
+      %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
+      vector.store %5, %buf3[%0] : memref<256xi16, 2>, vector<32xi16>
+      %6 = arith.addi %0, %c32 : index
+      cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation}
+    ^bb4:  // pred: ^bb2
+      aie.use_lock(%lock_0_3_9, Release, 1)
+      aie.use_lock(%lock_0_3, Release, 1)
+      aie.use_lock(%lock_0_3_12, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_2 = aie.mem(%tile_0_2) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_2_7, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_6, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb5
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_5, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_3, Release, 1)
+      aie.next_bd ^bb6
+    }
+    %core_0_2 = aie.core(%tile_0_2) {
+      %c0_i32 = arith.constant 0 : i32
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb4
+      aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1)
+      cf.br ^bb2(%c0 : index)
+    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
+      %1 = arith.cmpi slt, %0, %c256 : index
+      cf.cond_br %1, ^bb3, ^bb4
+    ^bb3:  // pred: ^bb2
+      %2 = vector.load %buf2[%0] : memref<256xi16, 2>, vector<32xi16>
+      %3 = vector.load %buf1[%0] : memref<256xi16, 2>, vector<32xi16>
+      %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32>
+      %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
+      vector.store %5, %buf0[%0] : memref<256xi16, 2>, vector<32xi16>
+      %6 = arith.addi %0, %c32 : index
+      cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation}
+    ^bb4:  // pred: ^bb2
+      aie.use_lock(%lock_0_2_4, Release, 1)
+      aie.use_lock(%lock_0_2, Release, 1)
+      aie.use_lock(%lock_0_2_7, Release, 1)
+      cf.br ^bb1
+    }
+    %memtile_dma_2_1 = aie.memtile_dma(%mem_tile_2_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_2_1_2, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 1024) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_2_1, Release, 4)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_2_1_2, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_2_1_2, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 512, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_2_1_2, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_2_1_2, Release, 1)
+      aie.next_bd ^bb10
+    }
+    %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1_1, Release, 4)
+      aie.next_bd ^bb10
+    }
+    %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_0, Release, 4)
+      aie.next_bd ^bb10
+    }
+    aie.shim_dma_allocation @air_channel_5(%shim_noc_tile_2_0, S2MM, 0)
+    aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0)
+    aie.shim_dma_allocation @air_channel_1(%shim_noc_tile_1_0, MM2S, 0)
+    aie.runtime_sequence @mul_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
+      %0 = aiex.dma_configure_task_for @air_channel_0 {
+        aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
+        aie.end
+      }
+      aiex.dma_start_task(%0)
+      %1 = aiex.dma_configure_task_for @air_channel_1 {
+        aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
+        aie.end
+      }
+      aiex.dma_start_task(%1)
+      %2 = aiex.dma_configure_task_for @air_channel_5 {
+        aie.dma_bd(%arg2 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
+        aie.end
+      } {issue_token = true}
+      aiex.dma_start_task(%2)
+      aiex.dma_free_task(%0)
+      aiex.dma_await_task(%2)
+      aiex.dma_free_task(%1)
+    }
+    aie.packet_flow(15) {
+      aie.packet_source<%shim_noc_tile_0_0, TileControl : 0>
+      aie.packet_dest<%shim_noc_tile_0_0, South : 0>
+    } {keep_pkt_header = true, priority_route = true}
+    aie.packet_flow(15) {
+      aie.packet_source<%shim_noc_tile_1_0, TileControl : 0>
+      aie.packet_dest<%shim_noc_tile_1_0, South : 0>
+    } {keep_pkt_header = true, priority_route = true}
+    aie.packet_flow(15) {
+      aie.packet_source<%shim_noc_tile_2_0, TileControl : 0>
+      aie.packet_dest<%shim_noc_tile_2_0, South : 0>
+    } {keep_pkt_header = true, priority_route = true}
+    %switchbox_0_0 = aie.switchbox(%shim_noc_tile_0_0) {
+      aie.connect<South : 3, North : 3>
+      %0 = aie.amsel<5> (3)
+      %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true}
+      aie.packet_rules(TileControl : 0) {
+        aie.rule(31, 15, %0)
+      }
+    }
+    %shim_mux_0_0 = aie.shim_mux(%shim_noc_tile_0_0) {
+      aie.connect<DMA : 0, North : 3>
+    }
+    %switchbox_0_1 = aie.switchbox(%mem_tile_0_1) {
+      aie.connect<South : 3, DMA : 0>
+      aie.connect<DMA : 0, North : 1>
+      aie.connect<DMA : 1, North : 5>
+      aie.connect<DMA : 2, North : 0>
+      aie.connect<DMA : 3, North : 3>
+    }
+    %switchbox_1_0 = aie.switchbox(%shim_noc_tile_1_0) {
+      aie.connect<South : 3, North : 1>
+      %0 = aie.amsel<5> (3)
+      %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true}
+      aie.packet_rules(TileControl : 0) {
+        aie.rule(31, 15, %0)
+      }
+    }
+    %shim_mux_1_0 = aie.shim_mux(%shim_noc_tile_1_0) {
+      aie.connect<DMA : 0, North : 3>
+    }
+    %switchbox_1_1 = aie.switchbox(%mem_tile_1_1) {
+      aie.connect<South : 1, DMA : 0>
+      aie.connect<DMA : 0, North : 1>
+      aie.connect<DMA : 1, North : 5>
+      aie.connect<DMA : 2, North : 0>
+      aie.connect<DMA : 3, North : 3>
+    }
+    %switchbox_2_0 = aie.switchbox(%shim_noc_tile_2_0) {
+      aie.connect<North : 2, South : 2>
+      %0 = aie.amsel<5> (3)
+      %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true}
+      aie.packet_rules(TileControl : 0) {
+        aie.rule(31, 15, %0)
+      }
+    }
+    %shim_mux_2_0 = aie.shim_mux(%shim_noc_tile_2_0) {
+      aie.connect<North : 2, DMA : 0>
+    }
+    %switchbox_2_1 = aie.switchbox(%mem_tile_2_1) {
+      aie.connect<DMA : 0, South : 2>
+      aie.connect<North : 2, DMA : 0>
+      aie.connect<North : 1, DMA : 1>
+      aie.connect<North : 0, DMA : 2>
+      aie.connect<North : 3, DMA : 3>
+    }
+    %switchbox_0_2 = aie.switchbox(%tile_0_2) {
+      aie.connect<South : 1, DMA : 0>
+      aie.connect<South : 5, North : 3>
+      aie.connect<South : 0, North : 5>
+      aie.connect<South : 3, North : 4>
+      aie.connect<East : 3, DMA : 1>
+      aie.connect<DMA : 0, East : 0>
+    }
+    %switchbox_0_3 = aie.switchbox(%tile_0_3) {
+      aie.connect<South : 3, DMA : 0>
+      aie.connect<South : 5, North : 0>
+      aie.connect<South : 4, North : 2>
+      aie.connect<East : 2, DMA : 1>
+      aie.connect<East : 0, North : 5>
+      aie.connect<DMA : 0, East : 0>
+    }
+    %switchbox_0_4 = aie.switchbox(%tile_0_4) {
+      aie.connect<South : 0, DMA : 0>
+      aie.connect<South : 2, North : 0>
+      aie.connect<South : 5, DMA : 1>
+      aie.connect<East : 2, North : 5>
+      aie.connect<DMA : 0, East : 0>
+      aie.connect<North : 0, East : 3>
+    }
+    %switchbox_0_5 = aie.switchbox(%tile_0_5) {
+      aie.connect<South : 0, DMA : 0>
+      aie.connect<South : 5, DMA : 1>
+      aie.connect<DMA : 0, South : 0>
+    }
+    %tile_1_2 = aie.tile(1, 2)
+    %switchbox_1_2 = aie.switchbox(%tile_1_2) {
+      aie.connect<South : 1, West : 3>
+      aie.connect<South : 5, North : 1>
+      aie.connect<South : 0, North : 2>
+      aie.connect<South : 3, North : 0>
+      aie.connect<West : 0, East : 1>
+      aie.connect<North : 3, East : 3>
+    }
+    %tile_1_3 = aie.tile(1, 3)
+    %switchbox_1_3 = aie.switchbox(%tile_1_3) {
+      aie.connect<South : 1, West : 2>
+      aie.connect<South : 2, West : 0>
+      aie.connect<South : 0, North : 0>
+      aie.connect<West : 0, East : 1>
+      aie.connect<North : 1, South : 3>
+    }
+    %tile_1_4 = aie.tile(1, 4)
+    %switchbox_1_4 = aie.switchbox(%tile_1_4) {
+      aie.connect<South : 0, West : 2>
+      aie.connect<West : 0, South : 1>
+      aie.connect<West : 3, East : 3>
+    }
+    %tile_2_2 = aie.tile(2, 2)
+    %switchbox_2_2 = aie.switchbox(%tile_2_2) {
+      aie.connect<West : 1, South : 2>
+      aie.connect<North : 3, South : 1>
+      aie.connect<West : 3, South : 0>
+      aie.connect<North : 0, South : 3>
+    }
+    %tile_2_3 = aie.tile(2, 3)
+    %switchbox_2_3 = aie.switchbox(%tile_2_3) {
+      aie.connect<West : 1, South : 3>
+      aie.connect<North : 1, South : 0>
+    }
+    %tile_2_4 = aie.tile(2, 4)
+    %switchbox_2_4 = aie.switchbox(%tile_2_4) {
+      aie.connect<West : 3, South : 1>
+    }
+    aie.wire(%shim_mux_0_0 : North, %switchbox_0_0 : South)
+    aie.wire(%shim_noc_tile_0_0 : DMA, %shim_mux_0_0 : DMA)
+    aie.wire(%mem_tile_0_1 : Core, %switchbox_0_1 : Core)
+    aie.wire(%mem_tile_0_1 : DMA, %switchbox_0_1 : DMA)
+    aie.wire(%switchbox_0_0 : North, %switchbox_0_1 : South)
+    aie.wire(%tile_0_2 : Core, %switchbox_0_2 : Core)
+    aie.wire(%tile_0_2 : DMA, %switchbox_0_2 : DMA)
+    aie.wire(%switchbox_0_1 : North, %switchbox_0_2 : South)
+    aie.wire(%tile_0_3 : Core, %switchbox_0_3 : Core)
+    aie.wire(%tile_0_3 : DMA, %switchbox_0_3 : DMA)
+    aie.wire(%switchbox_0_2 : North, %switchbox_0_3 : South)
+    aie.wire(%tile_0_4 : Core, %switchbox_0_4 : Core)
+    aie.wire(%tile_0_4 : DMA, %switchbox_0_4 : DMA)
+    aie.wire(%switchbox_0_3 : North, %switchbox_0_4 : South)
+    aie.wire(%tile_0_5 : Core, %switchbox_0_5 : Core)
+    aie.wire(%tile_0_5 : DMA, %switchbox_0_5 : DMA)
+    aie.wire(%switchbox_0_4 : North, %switchbox_0_5 : South)
+    aie.wire(%switchbox_0_0 : East, %switchbox_1_0 : West)
+    aie.wire(%shim_mux_1_0 : North, %switchbox_1_0 : South)
+    aie.wire(%shim_noc_tile_1_0 : DMA, %shim_mux_1_0 : DMA)
+    aie.wire(%switchbox_0_1 : East, %switchbox_1_1 : West)
+    aie.wire(%mem_tile_1_1 : Core, %switchbox_1_1 : Core)
+    aie.wire(%mem_tile_1_1 : DMA, %switchbox_1_1 : DMA)
+    aie.wire(%switchbox_1_0 : North, %switchbox_1_1 : South)
+    aie.wire(%switchbox_0_2 : East, %switchbox_1_2 : West)
+    aie.wire(%tile_1_2 : Core, %switchbox_1_2 : Core)
+    aie.wire(%tile_1_2 : DMA, %switchbox_1_2 : DMA)
+    aie.wire(%switchbox_1_1 : North, %switchbox_1_2 : South)
+    aie.wire(%switchbox_0_3 : East, %switchbox_1_3 : West)
+    aie.wire(%tile_1_3 : Core, %switchbox_1_3 : Core)
+    aie.wire(%tile_1_3 : DMA, %switchbox_1_3 : DMA)
+    aie.wire(%switchbox_1_2 : North, %switchbox_1_3 : South)
+    aie.wire(%switchbox_0_4 : East, %switchbox_1_4 : West)
+    aie.wire(%tile_1_4 : Core, %switchbox_1_4 : Core)
+    aie.wire(%tile_1_4 : DMA, %switchbox_1_4 : DMA)
+    aie.wire(%switchbox_1_3 : North, %switchbox_1_4 : South)
+    aie.wire(%switchbox_1_0 : East, %switchbox_2_0 : West)
+    aie.wire(%shim_mux_2_0 : North, %switchbox_2_0 : South)
+    aie.wire(%shim_noc_tile_2_0 : DMA, %shim_mux_2_0 : DMA)
+    aie.wire(%switchbox_1_1 : East, %switchbox_2_1 : West)
+    aie.wire(%mem_tile_2_1 : Core, %switchbox_2_1 : Core)
+    aie.wire(%mem_tile_2_1 : DMA, %switchbox_2_1 : DMA)
+    aie.wire(%switchbox_2_0 : North, %switchbox_2_1 : South)
+    aie.wire(%switchbox_1_2 : East, %switchbox_2_2 : West)
+    aie.wire(%tile_2_2 : Core, %switchbox_2_2 : Core)
+    aie.wire(%tile_2_2 : DMA, %switchbox_2_2 : DMA)
+    aie.wire(%switchbox_2_1 : North, %switchbox_2_2 : South)
+    aie.wire(%switchbox_1_3 : East, %switchbox_2_3 : West)
+    aie.wire(%tile_2_3 : Core, %switchbox_2_3 : Core)
+    aie.wire(%tile_2_3 : DMA, %switchbox_2_3 : DMA)
+    aie.wire(%switchbox_2_2 : North, %switchbox_2_3 : South)
+    aie.wire(%switchbox_1_4 : East, %switchbox_2_4 : West)
+    aie.wire(%tile_2_4 : Core, %switchbox_2_4 : Core)
+    aie.wire(%tile_2_4 : DMA, %switchbox_2_4 : DMA)
+    aie.wire(%switchbox_2_3 : North, %switchbox_2_4 : South)
+  } {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>}
+  aie.device(npu2) {
+    aie.runtime_sequence @mul_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
+      aiex.configure @mul_kernel_0 {
+        aiex.run @mul_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (memref<*xi16>, memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32)
+      }
+    }
+  }
+}
diff --git a/examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir b/examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir
new file mode 100644
index 0000000..918aa51
--- /dev/null
+++ b/examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir
@@ -0,0 +1,431 @@
+#loop_annotation = #llvm.loop_annotation<mustProgress = true>
+module {
+  aie.device(npu2) @square_kernel_0 {
+    %shim_noc_tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
+    %shim_noc_tile_1_0 = aie.tile(1, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
+    %mem_tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
+    %mem_tile_1_1 = aie.tile(1, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
+    %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 27>}
+    %tile_0_3 = aie.tile(0, 3) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 29>}
+    %tile_0_4 = aie.tile(0, 4) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 30>}
+    %tile_0_5 = aie.tile(0, 5) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 31>}
+    %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32}
+    %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32}
+    %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32}
+    %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32}
+    %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32}
+    %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32}
+    %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32}
+    %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32}
+    %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32}
+    %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32}
+    %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32}
+    %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32}
+    %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32}
+    %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32}
+    %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32}
+    %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32}
+    %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32}
+    %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32}
+    %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32}
+    %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32}
+    %buf9 = aie.buffer(%mem_tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf9"} : memref<1024xi16, 1 : i32> 
+    %buf8 = aie.buffer(%mem_tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf8"} : memref<1024xi16, 1> 
+    %buf7 = aie.buffer(%tile_0_5) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf7"} : memref<256xi16, 2> 
+    %buf6 = aie.buffer(%tile_0_5) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf6"} : memref<256xi16, 2> 
+    %buf5 = aie.buffer(%tile_0_4) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<256xi16, 2> 
+    %buf4 = aie.buffer(%tile_0_4) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf4"} : memref<256xi16, 2> 
+    %buf3 = aie.buffer(%tile_0_3) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<256xi16, 2> 
+    %buf2 = aie.buffer(%tile_0_3) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf2"} : memref<256xi16, 2> 
+    %buf1 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<256xi16, 2> 
+    %buf0 = aie.buffer(%tile_0_2) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf0"} : memref<256xi16, 2> 
+    %mem_0_5 = aie.mem(%tile_0_5) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_12, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_11, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_5 = aie.core(%tile_0_5) {
+      %c0_i32 = arith.constant 0 : i32
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb4
+      aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1)
+      cf.br ^bb2(%c0 : index)
+    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
+      %1 = arith.cmpi slt, %0, %c256 : index
+      cf.cond_br %1, ^bb3, ^bb4
+    ^bb3:  // pred: ^bb2
+      %2 = vector.load %buf7[%0] : memref<256xi16, 2>, vector<32xi16>
+      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
+      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
+      vector.store %4, %buf6[%0] : memref<256xi16, 2>, vector<32xi16>
+      %5 = arith.addi %0, %c32 : index
+      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
+    ^bb4:  // pred: ^bb2
+      aie.use_lock(%lock_0_5, Release, 1)
+      aie.use_lock(%lock_0_5_13, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_4 = aie.mem(%tile_0_4) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_9, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_8, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_4 = aie.core(%tile_0_4) {
+      %c0_i32 = arith.constant 0 : i32
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb4
+      aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1)
+      cf.br ^bb2(%c0 : index)
+    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
+      %1 = arith.cmpi slt, %0, %c256 : index
+      cf.cond_br %1, ^bb3, ^bb4
+    ^bb3:  // pred: ^bb2
+      %2 = vector.load %buf5[%0] : memref<256xi16, 2>, vector<32xi16>
+      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
+      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
+      vector.store %4, %buf4[%0] : memref<256xi16, 2>, vector<32xi16>
+      %5 = arith.addi %0, %c32 : index
+      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
+    ^bb4:  // pred: ^bb2
+      aie.use_lock(%lock_0_4, Release, 1)
+      aie.use_lock(%lock_0_4_10, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_3 = aie.mem(%tile_0_3) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_6, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_5, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_3 = aie.core(%tile_0_3) {
+      %c0_i32 = arith.constant 0 : i32
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb4
+      aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1)
+      cf.br ^bb2(%c0 : index)
+    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
+      %1 = arith.cmpi slt, %0, %c256 : index
+      cf.cond_br %1, ^bb3, ^bb4
+    ^bb3:  // pred: ^bb2
+      %2 = vector.load %buf3[%0] : memref<256xi16, 2>, vector<32xi16>
+      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
+      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
+      vector.store %4, %buf2[%0] : memref<256xi16, 2>, vector<32xi16>
+      %5 = arith.addi %0, %c32 : index
+      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
+    ^bb4:  // pred: ^bb2
+      aie.use_lock(%lock_0_3, Release, 1)
+      aie.use_lock(%lock_0_3_7, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_2 = aie.mem(%tile_0_2) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_3, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_2, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_2 = aie.core(%tile_0_2) {
+      %c0_i32 = arith.constant 0 : i32
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb4
+      aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1)
+      cf.br ^bb2(%c0 : index)
+    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
+      %1 = arith.cmpi slt, %0, %c256 : index
+      cf.cond_br %1, ^bb3, ^bb4
+    ^bb3:  // pred: ^bb2
+      %2 = vector.load %buf1[%0] : memref<256xi16, 2>, vector<32xi16>
+      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
+      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
+      vector.store %4, %buf0[%0] : memref<256xi16, 2>, vector<32xi16>
+      %5 = arith.addi %0, %c32 : index
+      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
+    ^bb4:  // pred: ^bb2
+      aie.use_lock(%lock_0_2, Release, 1)
+      aie.use_lock(%lock_0_2_4, Release, 1)
+      cf.br ^bb1
+    }
+    %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1, Release, 4)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb10
+    }
+    %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1_0, Release, 4)
+      aie.next_bd ^bb10
+    }
+    aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0)
+    aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0)
+    aie.runtime_sequence @square_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
+      %0 = aiex.dma_configure_task_for @air_channel_0 {
+        aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
+        aie.end
+      }
+      aiex.dma_start_task(%0)
+      %1 = aiex.dma_configure_task_for @air_channel_3 {
+        aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
+        aie.end
+      } {issue_token = true}
+      aiex.dma_start_task(%1)
+      aiex.dma_free_task(%0)
+      aiex.dma_await_task(%1)
+    }
+    aie.packet_flow(15) {
+      aie.packet_source<%shim_noc_tile_0_0, TileControl : 0>
+      aie.packet_dest<%shim_noc_tile_0_0, South : 0>
+    } {keep_pkt_header = true, priority_route = true}
+    aie.packet_flow(15) {
+      aie.packet_source<%shim_noc_tile_1_0, TileControl : 0>
+      aie.packet_dest<%shim_noc_tile_1_0, South : 0>
+    } {keep_pkt_header = true, priority_route = true}
+    %switchbox_0_0 = aie.switchbox(%shim_noc_tile_0_0) {
+      aie.connect<South : 3, North : 3>
+      %0 = aie.amsel<5> (3)
+      %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true}
+      aie.packet_rules(TileControl : 0) {
+        aie.rule(31, 15, %0)
+      }
+    }
+    %shim_mux_0_0 = aie.shim_mux(%shim_noc_tile_0_0) {
+      aie.connect<DMA : 0, North : 3>
+    }
+    %switchbox_0_1 = aie.switchbox(%mem_tile_0_1) {
+      aie.connect<South : 3, DMA : 0>
+      aie.connect<DMA : 0, North : 1>
+      aie.connect<DMA : 1, North : 5>
+      aie.connect<DMA : 2, North : 0>
+      aie.connect<DMA : 3, North : 3>
+    }
+    %switchbox_1_0 = aie.switchbox(%shim_noc_tile_1_0) {
+      aie.connect<North : 2, South : 2>
+      %0 = aie.amsel<5> (3)
+      %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true}
+      aie.packet_rules(TileControl : 0) {
+        aie.rule(31, 15, %0)
+      }
+    }
+    %shim_mux_1_0 = aie.shim_mux(%shim_noc_tile_1_0) {
+      aie.connect<North : 2, DMA : 0>
+    }
+    %switchbox_1_1 = aie.switchbox(%mem_tile_1_1) {
+      aie.connect<DMA : 0, South : 2>
+      aie.connect<North : 1, DMA : 0>
+      aie.connect<North : 3, DMA : 1>
+      aie.connect<North : 0, DMA : 2>
+      aie.connect<North : 2, DMA : 3>
+    }
+    %switchbox_0_2 = aie.switchbox(%tile_0_2) {
+      aie.connect<South : 1, DMA : 0>
+      aie.connect<South : 5, North : 3>
+      aie.connect<South : 0, North : 5>
+      aie.connect<South : 3, North : 4>
+      aie.connect<DMA : 0, East : 0>
+      aie.connect<North : 0, East : 3>
+    }
+    %switchbox_0_3 = aie.switchbox(%tile_0_3) {
+      aie.connect<South : 3, DMA : 0>
+      aie.connect<South : 5, North : 0>
+      aie.connect<South : 4, North : 2>
+      aie.connect<DMA : 0, East : 0>
+      aie.connect<North : 0, South : 0>
+    }
+    %switchbox_0_4 = aie.switchbox(%tile_0_4) {
+      aie.connect<South : 0, DMA : 0>
+      aie.connect<South : 2, North : 0>
+      aie.connect<DMA : 0, East : 0>
+      aie.connect<North : 0, South : 0>
+    }
+    %switchbox_0_5 = aie.switchbox(%tile_0_5) {
+      aie.connect<South : 0, DMA : 0>
+      aie.connect<DMA : 0, South : 0>
+    }
+    %tile_1_2 = aie.tile(1, 2)
+    %switchbox_1_2 = aie.switchbox(%tile_1_2) {
+      aie.connect<West : 0, South : 1>
+      aie.connect<North : 1, South : 3>
+      aie.connect<North : 3, South : 0>
+      aie.connect<West : 3, South : 2>
+    }
+    %tile_1_3 = aie.tile(1, 3)
+    %switchbox_1_3 = aie.switchbox(%tile_1_3) {
+      aie.connect<West : 0, South : 1>
+      aie.connect<North : 1, South : 3>
+    }
+    %tile_1_4 = aie.tile(1, 4)
+    %switchbox_1_4 = aie.switchbox(%tile_1_4) {
+      aie.connect<West : 0, South : 1>
+    }
+    aie.wire(%shim_mux_0_0 : North, %switchbox_0_0 : South)
+    aie.wire(%shim_noc_tile_0_0 : DMA, %shim_mux_0_0 : DMA)
+    aie.wire(%mem_tile_0_1 : Core, %switchbox_0_1 : Core)
+    aie.wire(%mem_tile_0_1 : DMA, %switchbox_0_1 : DMA)
+    aie.wire(%switchbox_0_0 : North, %switchbox_0_1 : South)
+    aie.wire(%tile_0_2 : Core, %switchbox_0_2 : Core)
+    aie.wire(%tile_0_2 : DMA, %switchbox_0_2 : DMA)
+    aie.wire(%switchbox_0_1 : North, %switchbox_0_2 : South)
+    aie.wire(%tile_0_3 : Core, %switchbox_0_3 : Core)
+    aie.wire(%tile_0_3 : DMA, %switchbox_0_3 : DMA)
+    aie.wire(%switchbox_0_2 : North, %switchbox_0_3 : South)
+    aie.wire(%tile_0_4 : Core, %switchbox_0_4 : Core)
+    aie.wire(%tile_0_4 : DMA, %switchbox_0_4 : DMA)
+    aie.wire(%switchbox_0_3 : North, %switchbox_0_4 : South)
+    aie.wire(%tile_0_5 : Core, %switchbox_0_5 : Core)
+    aie.wire(%tile_0_5 : DMA, %switchbox_0_5 : DMA)
+    aie.wire(%switchbox_0_4 : North, %switchbox_0_5 : South)
+    aie.wire(%switchbox_0_0 : East, %switchbox_1_0 : West)
+    aie.wire(%shim_mux_1_0 : North, %switchbox_1_0 : South)
+    aie.wire(%shim_noc_tile_1_0 : DMA, %shim_mux_1_0 : DMA)
+    aie.wire(%switchbox_0_1 : East, %switchbox_1_1 : West)
+    aie.wire(%mem_tile_1_1 : Core, %switchbox_1_1 : Core)
+    aie.wire(%mem_tile_1_1 : DMA, %switchbox_1_1 : DMA)
+    aie.wire(%switchbox_1_0 : North, %switchbox_1_1 : South)
+    aie.wire(%switchbox_0_2 : East, %switchbox_1_2 : West)
+    aie.wire(%tile_1_2 : Core, %switchbox_1_2 : Core)
+    aie.wire(%tile_1_2 : DMA, %switchbox_1_2 : DMA)
+    aie.wire(%switchbox_1_1 : North, %switchbox_1_2 : South)
+    aie.wire(%switchbox_0_3 : East, %switchbox_1_3 : West)
+    aie.wire(%tile_1_3 : Core, %switchbox_1_3 : Core)
+    aie.wire(%tile_1_3 : DMA, %switchbox_1_3 : DMA)
+    aie.wire(%switchbox_1_2 : North, %switchbox_1_3 : South)
+    aie.wire(%switchbox_0_4 : East, %switchbox_1_4 : West)
+    aie.wire(%tile_1_4 : Core, %switchbox_1_4 : Core)
+    aie.wire(%tile_1_4 : DMA, %switchbox_1_4 : DMA)
+    aie.wire(%switchbox_1_3 : North, %switchbox_1_4 : South)
+  } {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>}
+  aie.device(npu2) {
+    aie.runtime_sequence @square_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
+      aiex.configure @square_kernel_0 {
+        aiex.run @square_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32)
+      }
+    }
+  }
+}
diff --git a/examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh b/examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh
new file mode 100755
index 0000000..b7aa36c
--- /dev/null
+++ b/examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+set -e
+# Repeater script for: resource allocation
+echo "Original MLIR Diagnostics:"
+cat << 'DIAGNOSTICS_EOF'
+failed to legalize operation 'arith.subi' that was explicitly marked illegal: %120 = "arith.subi"(%118, %119) <{overflowFlags = #arith.overflow<none>}> : (vector<32xi16>, vector<32xi16>) -> vector<32xi16>
+DIAGNOSTICS_EOF
+echo ""
+
+MLIR_FILE='air_project/aiecc_failure_1775797115_856352.mlir'
+PASS_PIPELINE='any(canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},unknown<RedundantLoadStoreOptimizationPass>,unknown<ReorderOperationsPass>,unknown<{anonymous}::CopyRemovalPass>,unknown<VectorBroadcastLoweringPass>,test-canonicalize-vector-for-aievec{aie-target=aie2p target-backend=llvmir},test-lower-vector-to-aievec{aie-target=aie2p target-backend=llvmir},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},unknown<ExtendUPDOpsPass>,cse,unknown<SimplifyUPDOpsPass>,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},test-aievec-optimize{aie-target=aie2p target-backend=llvmir},cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},aievec-convolution-analysis{print=false},test-aievec-convolution-optimize{aie-target=aie2p shift=0 target-backend=llvmir},cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},loop-invariant-code-motion,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},lower-affine,aie-canonicalize-device,aie.device(aie-assign-lock-ids,aie-register-objectFifos,aie-objectFifo-stateful-transform{dynamic-objFifos=false packet-sw-objFifos=false},aie-assign-bd-ids,aie-lower-cascade-flows,aie-lower-broadcast-packet,aie-lower-multicast,aie-assign-tile-controller-ids{column-wise-unique-ids=true},aie-generate-column-control-overlay{route-shim-to-tct=shim-only route-shim-to-tile-ctrl=false},aie-assign-buffer-addresses{alloc-scheme=},aie-assign-core-link-files,aie-vector-transfer-lowering{max-transfer-rank=4294967295}),convert-scf-to-cf{allow-pattern-rollback=true})'
+aie-opt --mlir-print-ir-after-all --mlir-disable-threading --pass-pipeline="$PASS_PIPELINE" "$MLIR_FILE"
diff --git a/examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh b/examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh
new file mode 100755
index 0000000..2f765b5
--- /dev/null
+++ b/examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+# Repeater script for: LLVM lowering
+echo "Original MLIR Diagnostics:"
+cat << 'DIAGNOSTICS_EOF'
+aievec.mul_elem conversion is not supported for AIE2p.
+
+failed to legalize operation 'aievec.mul_elem' that was explicitly marked illegal: %28 = "aievec.mul_elem"(%24, %27) : (vector<32xi16>, vector<32xi16>) -> vector<32xi32>
+DIAGNOSTICS_EOF
+echo ""
+
+MLIR_FILE='air_project/aiecc_failure_1775797139_858651.mlir'
+PASS_PIPELINE='any(aie.device(aie-localize-locks,aie-normalize-address-spaces,aie-transform-bfp-types),aie-standard-lowering{device=mul_kernel_0 tilecol=0 tilerow=5},aiex-standard-lowering,convert-aievec-to-llvm{aie-target=aie2p aie2-fp32-emulation-strategy=accuracy-safe},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse,expand-strided-metadata,lower-affine,arith-expand{include-bf16=false include-f4e2m1=false include-f8e8m0=false},finalize-memref-to-llvm{index-bitwidth=0 use-aligned-alloc=false use-generic-functions=false},convert-func-to-llvm{index-bitwidth=0 use-bare-ptr-memref-call-conv=true},convert-to-llvm{allow-pattern-rollback=true dynamic=true },convert-vector-to-llvm{enable-arm-bf16=false enable-arm-i8mm=false enable-arm-neon=false enable-arm-sve=false enable-x86=false force-32bit-vector-indices=true reassociate-fp-reductions=false use-vector-alignment=false vector-contract-lowering=dot vector-transpose-lowering=eltwise},convert-ub-to-llvm{index-bitwidth=0},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse)'
+aie-opt --mlir-print-ir-after-all --mlir-disable-threading --pass-pipeline="$PASS_PIPELINE" "$MLIR_FILE"
diff --git a/examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh b/examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh
new file mode 100755
index 0000000..e9fc1e4
--- /dev/null
+++ b/examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+set -e
+# Repeater script for: LLVM lowering
+echo "Original MLIR Diagnostics:"
+cat << 'DIAGNOSTICS_EOF'
+aievec.mul_elem conversion is not supported for AIE2p.
+
+failed to legalize operation 'aievec.mul_elem' that was explicitly marked illegal: %21 = "aievec.mul_elem"(%20, %20) : (vector<32xi16>, vector<32xi16>) -> vector<32xi32>
+DIAGNOSTICS_EOF
+echo ""
+
+MLIR_FILE='air_project/aiecc_failure_1775797174_862028.mlir'
+PASS_PIPELINE='any(aie.device(aie-localize-locks,aie-normalize-address-spaces,aie-transform-bfp-types),aie-standard-lowering{device=square_kernel_0 tilecol=0 tilerow=5},aiex-standard-lowering,convert-aievec-to-llvm{aie-target=aie2p aie2-fp32-emulation-strategy=accuracy-safe},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse,expand-strided-metadata,lower-affine,arith-expand{include-bf16=false include-f4e2m1=false include-f8e8m0=false},finalize-memref-to-llvm{index-bitwidth=0 use-aligned-alloc=false use-generic-functions=false},convert-func-to-llvm{index-bitwidth=0 use-bare-ptr-memref-call-conv=true},convert-to-llvm{allow-pattern-rollback=true dynamic=true },convert-vector-to-llvm{enable-arm-bf16=false enable-arm-i8mm=false enable-arm-neon=false enable-arm-sve=false enable-x86=false force-32bit-vector-indices=true reassociate-fp-reductions=false use-vector-alignment=false vector-contract-lowering=dot vector-transpose-lowering=eltwise},convert-ub-to-llvm{index-bitwidth=0},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse)'
+aie-opt --mlir-print-ir-after-all --mlir-disable-threading --pass-pipeline="$PASS_PIPELINE" "$MLIR_FILE"
diff --git a/examples/elementwise_arith/air_project/airinput.mlir b/examples/elementwise_arith/air_project/airinput.mlir
new file mode 100644
index 0000000..d0b7377
--- /dev/null
+++ b/examples/elementwise_arith/air_project/airinput.mlir
@@ -0,0 +1,41 @@
+#map = affine_map<()[s0] -> (s0 * 256)>
+module {
+  func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
+    %c1 = arith.constant 1 : index
+    air.launch (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> {
+      air.segment @square_kernel_0  args(%arg16=%arg8, %arg17=%arg14, %arg18=%arg15) : index, memref<*xi16>, memref<*xi16> {
+        %c1024 = arith.constant 1024 : index
+        %c4 = arith.constant 4 : index
+        %c1_0 = arith.constant 1 : index
+        %0 = arith.muli %arg16, %c1024 : index
+        %alloc = memref.alloc() : memref<1024xi16, 1 : i32>
+        air.dma_memcpy_nd (%alloc[] [] [], %arg17[%0] [%c1024] [%c1_0]) {id = 1 : i32} : (memref<1024xi16, 1 : i32>, memref<*xi16>)
+        %alloc_1 = memref.alloc() : memref<1024xi16, 1>
+        air.herd @herd_0  tile (%arg19, %arg20) in (%arg21=%c4, %arg22=%c1_0) args(%arg23=%alloc, %arg24=%alloc_1) : memref<1024xi16, 1 : i32>, memref<1024xi16, 1> {
+          %1 = ub.poison : i16
+          %c1_2 = arith.constant 1 : index
+          %c0 = arith.constant 0 : index
+          %c256 = arith.constant 256 : index
+          %c32 = arith.constant 32 : index
+          %2 = affine.apply #map()[%arg19]
+          %alloc_3 = memref.alloc() : memref<256xi16, 2>
+          air.dma_memcpy_nd (%alloc_3[] [] [], %arg23[%2] [%c256] [%c1_2]) {id = 1 : i32} : (memref<256xi16, 2>, memref<1024xi16, 1 : i32>)
+          %alloc_4 = memref.alloc() : memref<256xi16, 2>
+          scf.for %arg25 = %c0 to %c256 step %c32 {
+            %subview = memref.subview %alloc_3[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+            %subview_5 = memref.subview %alloc_4[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+            %3 = vector.transfer_read %subview[%c0], %1 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+            %4 = arith.muli %3, %3 : vector<32xi16>
+            vector.transfer_write %4, %subview_5[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+          }
+          air.dma_memcpy_nd (%arg24[%2] [%c256] [%c1_2], %alloc_4[] [] []) {id = 2 : i32} : (memref<1024xi16, 1>, memref<256xi16, 2>)
+          memref.dealloc %alloc_3 : memref<256xi16, 2>
+          memref.dealloc %alloc_4 : memref<256xi16, 2>
+        }
+        air.dma_memcpy_nd (%arg18[%0] [%c1024] [%c1_0], %alloc_1[] [] []) {id = 2 : i32} : (memref<*xi16>, memref<1024xi16, 1>)
+        memref.dealloc %alloc_1 : memref<1024xi16, 1>
+      }
+    }
+    return
+  }
+}
diff --git a/examples/elementwise_arith/air_project/asm_air_output.mlir b/examples/elementwise_arith/air_project/asm_air_output.mlir
new file mode 100644
index 0000000..d0b7377
--- /dev/null
+++ b/examples/elementwise_arith/air_project/asm_air_output.mlir
@@ -0,0 +1,41 @@
+#map = affine_map<()[s0] -> (s0 * 256)>
+module {
+  func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
+    %c1 = arith.constant 1 : index
+    air.launch (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> {
+      air.segment @square_kernel_0  args(%arg16=%arg8, %arg17=%arg14, %arg18=%arg15) : index, memref<*xi16>, memref<*xi16> {
+        %c1024 = arith.constant 1024 : index
+        %c4 = arith.constant 4 : index
+        %c1_0 = arith.constant 1 : index
+        %0 = arith.muli %arg16, %c1024 : index
+        %alloc = memref.alloc() : memref<1024xi16, 1 : i32>
+        air.dma_memcpy_nd (%alloc[] [] [], %arg17[%0] [%c1024] [%c1_0]) {id = 1 : i32} : (memref<1024xi16, 1 : i32>, memref<*xi16>)
+        %alloc_1 = memref.alloc() : memref<1024xi16, 1>
+        air.herd @herd_0  tile (%arg19, %arg20) in (%arg21=%c4, %arg22=%c1_0) args(%arg23=%alloc, %arg24=%alloc_1) : memref<1024xi16, 1 : i32>, memref<1024xi16, 1> {
+          %1 = ub.poison : i16
+          %c1_2 = arith.constant 1 : index
+          %c0 = arith.constant 0 : index
+          %c256 = arith.constant 256 : index
+          %c32 = arith.constant 32 : index
+          %2 = affine.apply #map()[%arg19]
+          %alloc_3 = memref.alloc() : memref<256xi16, 2>
+          air.dma_memcpy_nd (%alloc_3[] [] [], %arg23[%2] [%c256] [%c1_2]) {id = 1 : i32} : (memref<256xi16, 2>, memref<1024xi16, 1 : i32>)
+          %alloc_4 = memref.alloc() : memref<256xi16, 2>
+          scf.for %arg25 = %c0 to %c256 step %c32 {
+            %subview = memref.subview %alloc_3[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+            %subview_5 = memref.subview %alloc_4[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+            %3 = vector.transfer_read %subview[%c0], %1 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+            %4 = arith.muli %3, %3 : vector<32xi16>
+            vector.transfer_write %4, %subview_5[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+          }
+          air.dma_memcpy_nd (%arg24[%2] [%c256] [%c1_2], %alloc_4[] [] []) {id = 2 : i32} : (memref<1024xi16, 1>, memref<256xi16, 2>)
+          memref.dealloc %alloc_3 : memref<256xi16, 2>
+          memref.dealloc %alloc_4 : memref<256xi16, 2>
+        }
+        air.dma_memcpy_nd (%arg18[%0] [%c1024] [%c1_0], %alloc_1[] [] []) {id = 2 : i32} : (memref<*xi16>, memref<1024xi16, 1>)
+        memref.dealloc %alloc_1 : memref<1024xi16, 1>
+      }
+    }
+    return
+  }
+}
diff --git a/examples/elementwise_arith/air_project/asm_src.mlir b/examples/elementwise_arith/air_project/asm_src.mlir
new file mode 100644
index 0000000..aa0162c
--- /dev/null
+++ b/examples/elementwise_arith/air_project/asm_src.mlir
@@ -0,0 +1,34 @@
+#loc = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)
+#loc5 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:9)
+#map = affine_map<(d0) -> (d0)>
+#loc8 = loc("X"(#loc))
+#loc9 = loc("OUT"(#loc))
+#loc12 = loc("x"(#loc5))
+module {
+  func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32} loc("X"(#loc)), %arg1: memref<*xi16> {tt.divisibility = 16 : i32} loc("OUT"(#loc)), %arg2: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg3: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg4: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg5: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg6: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg7: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)) {
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %0 = arith.muli %arg5, %c1024_i32 : i32 loc(#loc10)
+    %1 = arith.index_cast %0 : i32 to index loc(#loc3)
+    %reinterpret_cast = memref.reinterpret_cast %arg0 to offset: [%1], sizes: [1024], strides: [1] : memref<*xi16> to memref<1024xi16, strided<[1], offset: ?>> loc(#loc11)
+    %alloc = memref.alloc() : memref<1024xi16> loc(#loc12)
+    memref.copy %reinterpret_cast, %alloc : memref<1024xi16, strided<[1], offset: ?>> to memref<1024xi16> loc(#loc12)
+    %2 = bufferization.to_tensor %alloc restrict writable : memref<1024xi16> to tensor<1024xi16> loc(#loc12)
+    %reinterpret_cast_0 = memref.reinterpret_cast %arg1 to offset: [%1], sizes: [1024], strides: [1] : memref<*xi16> to memref<1024xi16, strided<[1], offset: ?>> loc(#loc3)
+    %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%2, %2 : tensor<1024xi16>, tensor<1024xi16>) outs(%2 : tensor<1024xi16>) {
+    ^bb0(%in: i16 loc("x"(#loc5)), %in_1: i16 loc("x"(#loc5)), %out: i16 loc("x"(#loc5))):
+      %4 = arith.muli %in, %in_1 : i16 loc(#loc6)
+      linalg.yield %4 : i16 loc(#loc6)
+    } -> tensor<1024xi16> loc(#loc6)
+    bufferization.materialize_in_destination %3 in writable %reinterpret_cast_0 : (tensor<1024xi16>, memref<1024xi16, strided<[1], offset: ?>>) -> () loc(#loc7)
+    return loc(#loc)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:15)
+#loc3 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:14)
+#loc4 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:17)
+#loc6 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:32)
+#loc7 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:5)
+#loc10 = loc("offsets"(#loc2))
+#loc11 = loc("x"(#loc4))
+
diff --git a/examples/elementwise_arith/air_project/div_kernel_0.pdi b/examples/elementwise_arith/air_project/div_kernel_0.pdi
new file mode 100644
index 0000000000000000000000000000000000000000..3681781c43b9bf80414d1863d6cae3111aa21810
GIT binary patch
literal 15904
zcmeHOZ)_aJ6@UB3z9V(Gy~Iuz+aVj<5JN6-i49F4fpzRwF16ANA(h)k#Yjzowy3UD
zT7OA7h$;UPVbUTEP=t$ws-k{4S8WuzUyc$GAZSeqQmR69g;Z^&qMi|zOmJ|2v%9nN
z?))!lrHT(X(%QfGn>TOXdow$J-rIYdi2AzP()a)18+%{<?X!;&*<^t0z$Jfo<C5*i
z`Bxf46#o2tUoPi49%VThVL9%<X(N|E_qd#okH6*i{Wsr!$1S(+*9KcPtVB<AZd==(
zBHDsW{^o5iYBzzbCo=gQT<?Fu4S$?FYr{Zq-xo0h(I)ut(6fcPG$u0*i^yw_W0SIT
zYQ0noFUW3{OjUA{$<n60>Wn%1rZf_?HnI&8HuX{If`lUJQe^C%8!V5?8JhHoenb+v
z_TA2LSV)rCQG}<bd&Ud*B>$pDebw;DOU$@Lj-rTM>Wm~ga%PU`heI3bLMT8RHpP=x
zN~}$!#E#CCXu2dNu3DcGd)KDKofoCVqfM!6(zK;$P~qx#zVA@s4I}^N%0v$VXWupQ
z|Gpq$CNr>AnsbwFYXH#P<jx3X9DAkpapv0mH0JThnep<j`)k_jcGLBtE@Fpw;?82q
zoGR8Cg?nc3yT=$>Gv{`FkUHIS`9#h~e7@Y{B;$j}VG~U+p?~AVjr~VM!lU8DoVfOb
zD2btEqGJcG=3@sBHjAcLm$5f>Zuahly?d<|(R2d(nI-%?iT+K?`*$Dg-FFc7-h%!W
zOV}%+|H|d<Jpg+T$aqgff2sWa9sTE!zlRPsAA87xfB%GjX5yyY!s{{PjWx@7zcjea
zdK|V|jvYq4b!!|M@8Hm4_M8?O?^kYH#@@pRTaG<!HH*4t*z26wH@v7l_;=aF*G49z
z{j%yq_mNi<?juvUPT)F;>n&WR#ED&|cVren+g6=Dv8%1)$k~w%>sv-PY;GOd(6e@A
z!|U#eU8~wJoEh2hSo6q+U$l(;e6n@qZ?B-|&udTr{^-U*ck<Zk_nNoVd-u<r-c@+Q
zaz_n+ING@6I=;Mr<X!cKS>Hpeh@yS%l7YKtD7t&x<T%hM(U2VJCMn_(pF>HLyy$*7
zDlJ2VnX_W$=Nz#z@9gt_Gja8Y`@C(1$r-UG@0_ulD8Kj3gjdQKKdJ94iGhLiHMfxS
zOKK=wH*?J+bN`xMM}@>J?)FZzw65^<M?n3iM}<p2+V_t`g)7dS`QC$+fAqCPijIyv
zW7yOho*rKyXB4MHe4N3Rz`y&A(mLgz5AO{XqqyW{XVRfg1OAdRIY_sCn^sU~d2;aM
z+ctILbfnB>@8K@bJa@Rh?X1{ze2pl-=m-y2{uM`j`K49J<w<xJH_QiVCO#i<Lg-WC
zh5Rb?kInsIc24<o<m>&B@4Fgd%eA&DQ++VxOaB+QSLZvfXHe#eF}ieC+|l5O{mFEu
z!!g&BxM(1aTnt&+(4JT`>WEFYGdVW*9b^|i_0hORdC^^;78_rJu0Mto9w+ptNv!nK
zq7E@WQ*@S^-%XGs^WU-B4r6E;kTGmkbIO^Q8ZYYd!T95y$zVLj7A%~=VUzI}??MbE
zE2+3;>^0rJ>d0>2!(V7yd}{^vf`8J&DuykL*X<lnWPWzYs3&3M(_+Vij@W?=tQ>XR
z@(pQm$#Lvh$a|5kAS;xVSqWQ|4@7*t-H!GX<|t{O=liGr{)xulIl>{CPdUER)Z@vH
zW5yjO<BeyWZ?9rMAzo>3Yz6#_?8(2ULG#GFZpMB6g@n7UlyE0r##!=O!u{^+33vCK
z33v0K5^nnK#QeX<nBT{0#tm}LuFOCD&l~m8d?NFibmqktYBP{e$2~KecF$al-1W%p
z51WylKJx1GcZp3eI!9sqr*cnY2fA55?Y?U{!YHm5Yi~%q8!ldQzs-0}F5-oSh3~JZ
zU;0<4*2gWF>C97yjKs&{z|)R6(2*9Y0Y_Zfmlo@K9g)g6iggb(iUZ?mw{R#evRHS=
zY!p6z9~g4n!cIrfPDkb`W2`V%n_@=~dcf`{Xx=0J^+{GRQ9dTa^AFL9C!bH5!{uLD
zh5Kg2t+R=YY5SJEAK)LxMEYuf$V9o@j_aa}+{d0@V10PqC&x%x)qP4<b)S+|-KS*o
zy2y|MT4--G$(0JOz+R<Hrz+(nlhcMA6}I{H<d#}JS*s^&_2j(&7iPR#Jz1+KYxN{5
zYtr>`t)9%F{;SoKwR-aZT~CJfovh-*da_bh^`Gij^`DZ->#eFMd#dWmN|{bo%1I_K
zT~F?;)swY)vQ|&l>PZ|uwda$y=aaSPleOoQwda$cyyuf)eW&U}H>&THtm;1{tNKsL
z<n5`dCwEuXla(@^s+5yVUb>#Vu~tvk>d9I?S*s^;^wjFfT0L2-Cu{X&t)Bek)sta;
zr|LsDs_&Gn>OUo``cKOI-AVo#w|tjUeR6Hd&#=qur8-IMFzhj$W!Psp$M6ut!wlyc
z9%Fc%;R3@&hNl=VF<g$Yd}b>@)31y#lvPYr-pu0D*kgQ_VV~g~!$S-YGn{95jNx&H
z3k(+-o?^Jfa5=(Ze7#lig|d#XhsCF{SJpVou+MOg;UR{H8O}32#_%}91%`_ZPcd9#
zxE$dyzCBg(g|d!sH;YeWudH#FVV~g~!$S-YGn{95jNx&H3k(+-o?^JfaG7C;T21+G
zb8xQ#tGK|+>QnJLMrySgFJYjQ!=5)H?6Z3z%Xp9R4&#aOIqWTMN8b-w#(QipI*cdA
z=dgy_PUT)xyvNquVfw`Q9CD@feU`T@<2}YZjHigN{9jjElkay&FK<m@pBQ*qyQ-D0
zx9j;hrSAp$ew8-SyEoexE~M{=_JeiegZ}!Iy+*EiBhGBu<X(3otm3sHOB~z>RecSR
zgL4hz<hd@J)J+@J@%5Z%d{W<+<|^9a@dfsmh!6c}VSFJrAj{ZoJpYh5jIXk<L4EcW
z#Hah_$A>+7Uh60Ieau?f7LPBmzeIfKM+@T%v5qgim%{ie`x?}{VnKYmZ+?91&oe%$
z?*~5dAB)Eq*k2+(^rMCGh1h@`u3vcOgz;7OHK^}Y1o7!U9iMzTQ?_P1IOy<x@j}g?
z?HNsD!a5`D_oTjbfSMOlr=B_gZmv&Oq;EVJjH(_~t?w)Q=H=C@fn2TcDf`yri`tjD
z>aK?$XfjvX2+ILM9wAE{<gu!+0W#i&4EmB)eT|US8Q@g)wL-oq8t1aU1@bTV7uq8E
zpX;g4XQ;nOKK&|vtXHUSF1%jz^sPvLk$hI`W4(f}oe(XuUbFsalz$nd%71`W{<qgR
z<0(hjw-s16)z=4Et&LyRHvn1XxWB5;hOE|xSl<HqH_tQwq5iq&Kh!_>{D=DIo_`$V
zGGoi;Kd5b@{U2bJW8~jt`LF0h{)6w7$#KYkMIZ8Sviw){A^)Z%qI;^MZ+qFoHy9`{
z_eJA0{zr&#68vY0ijt?05ApjJU1h9G1w6ILJ0%#~X|%*+v*48-V!RI?aR&M}<1^qf
zOTZ@?zZE<@3it-bcb6@b`7GlJ$AV`*OTHNR`MzZ`pG_1vfxc(7#AD%ejNx;P;j_$n
zpbwv89zM&QhVwEWKAX(vSl=mI@)iC--=DVRE(yniXFf~582M}~pF5}})`xEzgU6TZ
z0z2CoZ-d9%K>@#&@k#J2BfgvQ4d7$n7;l%`3EyK3?6}kIa4p0>_;}qJ-w$4|i`MT0
zALpFuC&A+@LV^83#t(pxbIy3X+{Ns4#r|}I*ZydJJNOGC`yJDoPl9iX_?sBt4}NvT
z4={cJy!J=ivCF+I-d^TUFX1KqAg<PF%_pI+{nUIvc<qno2f*ujH9rVGUUz2SF7ILf
z?1}wJg4gqEz7f2(ula1b-DK+?@8PZBwVn2u-wvLwd(3x(XX_sGIq;u~@{^g?b_T)Y
zyJbP%`Wc@ocQJpuVmlq+b)Gek{lNUud=9*xSMwhDc>hmpK2z>xc6#G^UGUnD=CLkj
zeBVuLzO~$LH$-t|r`zNE*9Fhwjrm6K+K$%u!0UDAd>6Cd#q4zH`+8dQ@SoZ54fKP*
u4Q1>GI7(n5+2pcaHta0Ak*kVlW*8;+Q8*_0-iVgN_x`jTer_ve)&2uCkqQU^

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_elfs.bin
new file mode 100644
index 0000000000000000000000000000000000000000..3542f70d63d78dc642ddc7dc29acd12d643307f5
GIT binary patch
literal 10704
zcmeHLZEO@p7=Cy6?)GRXH{15=aM#*xDfF;gC?6spDejfVAT||HOx4EJL?ZDc)I`kv
zsT{P(R|$nkG6ad0gcuVOw?>=hepM+12q=V-)QHUqP1HnUO)xo1%lXdTZo7EbmVfeN
zCz;;vGc(V;?>jT^J{=bTtgCw-Km^#H1MuuuMp6*~WWj|kSk!rduc=~TWT=#PPgM(}
zSun_gd2JY<$1Nn`El!pyW#AHkxu6c>vt$c|1w{ZiH%KasXb{r@J_7zkUAss>GI)X2
z^!WgW!(GO1@2}RVR|OSRIY9x41p!b9nQwe#A_?$eb0y5d4%izx*-6RAie(?G3Hg|R
zzK=aw>SJ4qeQd{EA3NsvtyW-Zp8&>+n;#G`t_jiWxpElrx%!q6{b!aedNt&VC#kor
z5FfBmZ>{;H5m&BsUt1IX8|P7hB-ALS-@r<`9G*`1Mc>gnwxiD{Ug>iS#_k0EhK1(B
zBwcpLHynPVd!vs2=>WJk+&{E9U6}towpVs<Uf+`jRA`ZtY|R}rh-qek-Zm-F+twCf
z{!26PU3oaZJ;=953Nrry_SdGdZxGw7X0~rH^6hOyz8lzo(KLK<Y+pPx-+tuV&;1?7
z{?o<pcWi%%_#JEu^d6MZzCW?Q*8Sqf?CX(GI|JO`ml|hSk3&+h_YnH)E+pLF#^$N`
zNRa#c%C;H!4z~q+4@&{&4j^BsyQyVLKD2LP_iK@uxnJ_`(j%8-dgKb$0jz^qZ(xnf
z0}Y~jbPWHN<&6(Cl$9SHk5rTfBNa=EA{F7{NX1n;(2zH8P9joqJP@h)A{hBRRuuW|
zBDVZoJp4mXWh0IC=HCu1^{D$2!wtp>iM9)Ri@9-u5}ltPeyXmCo-pJAnERT7fTyR~
zJU!Igg-`7O&3q*Ff{CMl1iT`s=6UGh%=n5l<1F_X!g8aeN&U8a#oZ=#l@Uv@!YCPa
z`60UHx~#@E;bTu-oYmJWt6vB5CCoO~CRQIw{xwzthCGI+JtW3Uj8pgUX}zd|vEW|Q
zA05V`(b4x0K=jyU*#|w5)0ioX(%IuVAnlkPCej(2lCp1~5HGRp)6sgt@{!HIKw<<!
z0@@2gtP!@o133^Hi8bEeRu#hR2-?Eic*?c!4tdJPSyg`_8#zaqiZyzXu$Ry0VJrvH
zvWbH8;ag%NKVXKK%o1m#dDz~W{BbO4*>m)b^``9&1;|CEl@?VU66xIjsrlSGp{fAd
ziB4EB#@?Mx*jBHil@qZP*xY&rW6>;Cr}<c6J7HBW66;KE$LOL>b<ib2lr8rttnxhe
z)jKi6V}^bVSgx)xH~M(Gk4!f|m4Wd1m*>0EJ_PHz4=b%X`OMReXYOcfe1FKB8joWQ
z2{UlIaR2*uq7QM&Yhg3+`FG_V-K47k9v6E%2ls+LDB&umOWI!=>XNn3Is^|eA*!&N
z1BBIJ1ajL69oe9;`Te+K@qG(MD-{*+@~EVX84vWhf1b<SPdG<>(&kM2tnVKy_?|EV
zJf3`fNSquGt_h#<gmHf-X3XS!U_YV1oUapggY`Y)-_ul4L0y}mSI^3HRa~as7ciGx
zmg##}WxDCQOqcv5Q{@*q^Lvc*>$sW|1>ZNi9*jTQ9~C^N{Y-mG(ar^-Ou%>&I@+$#
z(Z?}%VIKV!(d4NE)f3&xs?L!f<o<;3Y21OdT36^Ti7=rrpA~OVXvO2x?ze>Mr|2&-
zWItbKJGakmtq(OsMf<cvkngkBQ-rmaE6i6<*y1{cm8>Jo7cF2V`wLiWmqLvWg;nFa
z6S08l_}kh{s8LH8)Dj*iO~?`QMOG8W7DTrK_T0_w)qyJ*Gaj*Y{zLEB$umB&W#ntg
z*qdN)j>($n(j|T#&^{bv_P6#24-{S1KPkF^9zT<1eJs6aX+S>C!YmDF$1DwK$1DwK
z$3R^M0tc{Id}L?|@D=3M!e|x@vS3~trU;xNT^`h%OC7!G=uJm&X8b>qP#wML=uJm&
zqVk4G{p{#X4fUa;HyyqCf9TD$7UU}Kzgp0aSz6GJSz6GJfx0e>-VA5ao0%94Wnx|%
zHnrZYb@ZmAHyyp{=uJ#T&i$ryzv<j>I`^B-{pO>2znRu}cFjmljc3O!EojFqEojF;
zeIbk9+>}LcW@0duiFs|<)OvHXqc<JB>F7;IZ(=HP^roXX9lh!3O-FA&ntC&>@$8zB
Rni|iJSz6GJSz3@|{{p#ZAFu!b

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_enable.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7cc1818bce4d6ce1226fc5fda519967a4842b99e
GIT binary patch
literal 104
zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>))
B1+xGE

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_init.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d4549ba181167c6e6d7475f23335e34fea2c3376
GIT binary patch
literal 6032
zcmb7{&rak<5XQ@%hINw3u1Uy>EJ*QcI7AT#M?)kINHH2BaX`vM<iIUF0*^qXJ?)8e
zF5wYI$_vaRaLt7vMY;Fvc_s37+iiD!;}I|tf4=&CUGA#tw!nzU<oIWiROFvEk-z_%
zJ!~W*FA@WSPcd@He{sy=EE%{TBKEES5p+(vAYGEKNKZ*$kiH~+Mf#fb4e1%_Iq6%{
z_oQv3yPk}D`c%hUV?tlVeouYUx*%SXu1HTwUy!~eeMS13^bP45={f0J()Xloqq{y$
zdiqqybf1o>Pg)mN>ymUudP@3&^d;#l($}PKNY6;mN#BybCv6+u_35mqPjyW9=@;sg
z)`iu&Bwdl7lD;5)N&1TPHR&7DGtzU?x1{e$Thh7gr0z}C-Y@95Kj1=K_e*vRasT2A
z`KGw%NtC~$_drR!Af6K!;#1rYonOBXO5z3Gubj9LpW<BU{LA+y<^`S0oYo8RDdrKc
zuV_9c@q&0xTpGTS_%ZrfttFZNDYrPwi8P`Px$pNsn)N<^;r&%-Y<~OqH}kU3U)A~j
z{NqLc{b^o@^RRhW-=}a-^G3%#Fo-!FJksMCBIYU#o-Fco^&UO5Ugxjkt^40s*X#2?
zQZGFCgnj0BwN4NVJu!$m9X!(G86u|Z?ecW>;_N?bzRq9a$@;UpUZ4Mwdf~w*)RN!T
zTGw0O+2D~L&k!+PZ<nX5_w6(5b^gllFaEZ=UZ4Mwdf~w*)RN!TIzg<PZ~YDo9_jH6
z5!3Z{dDfCo{WC<~82>Vspqp^>qxyLCZUYYv{=)g)M&yk)HtOi1`aSl`pz$Z~{VHC>
z{wm~8zhkS{>@Pz8!5=I0ho0@PV=wrGo|TPu3xmfX=5+8#k7tP3R^-WgJe!Ecz0Z3*
zJBWQ9<=Kinq2IpGzOohbc{t*JyYq{${(e{F$2@C)+CR@F|Df@&=r{W@PpQw>ig|vl
z-i6*{KG=BId(6k{cs_@S#l1i1@f;%-&#>z8oFW$Y{-no)v!Lz|&O*DNXg+CuAD%p)
z+W++V)c&W>r}jU6KIt>&<BoKDH!vTsX+9~<XUT*4q%@x;59X87e3m?zPwIlvU0w1V
z+JWF-=&NwXHW&VX<5p+jKZ(o(mzsC_!$5vWzIoMh*>-QrcKx@>wrfeNOYm3=@d_NZ
zw|+x>4;)3bJR^PpjtW{nB))G4Db?$G()v+vO7%KEZ|nOoNU7cwf84d}i)35nL%m9)
zUZqj5n{~S$^(u~f-3(ifdKE{#Db=fZZU-ih)>n&xd&hVAz^Pux=TW^Tu6I|q)q2#U
zHTbJ2{~>V${xaeR#53^EBfd|32(IcOZtO_bqxD;h5z6W6t-#g!Bz^*}&xc-r46f!)
z>of4z(euuUpMtA-6E}8D`NwMi``~*2njeCH5<P!+p?L=WRm6WKegghw#7~Kzg6sY3
z{KihGe-qmOL|#PscNUswSg-5T`~+O@U-MINy+6&*!PWVs=No%Q`#)3r&%pKmG~Wc*
z&)2-Pqm<64x~~V|I{!%VLvT8uitmHd`BZ!g{#n%jy@k$y4*o^NPl)f?G3|e>^6!G{
z`D%{yOZ(S+3a<C3c>%8O|3dRUJE8m&wZAQJonLdDk5s*%7Mkzakr_t)DHkL4{@Vhl
z{wcl*uJh~l1-L$+%*XWnG36iY_v1oy)KAZ!`1S4I2o|Ro1qz~3EajN7i78>jEaZQL
C^4EL-

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.elf
new file mode 100755
index 0000000000000000000000000000000000000000..33f5143237b2608b0fcdae4c633076f2bc60cb03
GIT binary patch
literal 4132
zcma)9eN0=|6+h3<_&Hgquld?7gz`d2a2f+PI0>esHH=kJ=AdL-BDO^}#x}tU7={gV
zHf4DYv}u||B~c^Ys*TDjb<?CtGt<Tpe|am3(yHAOl}M@9NnXmdO<Of0wUSFoymS5D
zGY&~1N4mb}ch0%zo^$TUyFU3X@3Vp+fKCQ5=a379n=Ak#_AZdY05*`o2;5r1rqkpG
z0Nx5vK(HPBM=Yr8@_SQ0(0HQ&%m^Kj+92h$+cyK;-Y3eiXop!3z>B~+YJ8r|;nZee
ztx1e(p=~m8%KUeI)DHz2WGPM65Hu+T8xIJHk4$F(e%8Gc9z!f-y|c@rg;h!x)@rk`
z(rp&DYm0>)s<g0?$1UtqsinmVjY$C#yKj#YNPH+LHw&dUK;H3*pnSeTGMMe?E0v+<
z>M~@IsJUI^?8I0o-m^cY+_3|bfuv=H(%(Tdb-^>LE#{8Vv5}<3a6MTdBu=HVZxg!9
zGPLP4%R<|eQwKblp9iq!!1Kc@sKiRIp?>GoclJ)J0vU#+414M`t%#~;fSEC|d}eH{
zoRz-6j=$@V=I;gc_kvi#O6SquzD9hvP+zxxe5cXh=`r;8G1}L!;V*^yhV}iOL4Rj>
zz6)r-R{j2k`bVhW*|GAOvm(a#U$nPReeb}-*CQ=Yl=FNKcdfG?=f#Sd^O$cz8R7YM
zb$^vVQo-~6LEk$5UKy*Hc||N|1?A|^Hg#m^EB#@7JEoqC&uaUn=nHz`eTiPUj%^;>
zEo>iSOG)#c2KnML{#F;=o$su!xp+5TyQLyt>)af#ZL5se-k|fHMOz<B$7^3JkJtXH
zBL2(S&GCP}hnl}uF8q0VXBVBFDgLy)(Imf=Ug%7`F4A$qGo)=?pv1$^4?k5O8cc0a
z1fcC}3Id*<Zte6?^CWWP0Ns2fHG?L{{0Nv0AZzDgntMiwZtk+eX9+7*$PxMXQ@g)7
zBD)i_X;!9?C1WWlhi*!8$}YTW>PRuK*V^I-@@v?Xcskv3A@jfGDo9Anc-n1-R8``w
zFOl18kRh@C%On3jm#AM_ioFcVr4J+vOvm3AjIdd~J-!WO9B&5$4^g9Rd@l*9Dm^|A
zMla|&vTg5@SpqgeMss1d3;OzDBiI&ayYBVX+4v1=-|;D)a{C|8o2u`!y16p8c$F|2
zoAMrE-@jIbwY-Iq<p}3Pl+N7`ctfn*5?7QW)K6sovYgT5xftB5#ok$hzNpx&yXrtk
z9$3>~frn680Q>6`uzi^gY$7aVw%TimVGFRwy;iJ6x9Cv)v9fW(>WpM|A~T59#h5x^
zQUry$Ojfq@8rpg$@P-EpUk6s`v9bcp@$Dp8Ykn#L;q|X6HmW&PczF)Z`kZ{`wZ^jo
zB|Co3X3mbsu?Z1xV3m0O$&;8vN;K=Tb^Mh+Uv%-HM+Pt%*|Ckd7d*E_Tt$`Ce5q|x
zvj6;?U;-v6R@VA5VXat!!f`?u_gUGtIoz>`4`H>kRRJ@vib`5NFypzcMr}Xg9I2I<
zAB<1`{;`rj5k`R5laIF<R>p%dmouI)o^Ni(2mRf*pD<tUZ(<|Hr}^XG(=5p#Kb@vG
zu1M6KlIYaCc$a)2(I4HA=)s#3b$%pK>))gYzsESg2Ujykhz}U=uRq4G2qtYlX5VAA
zU#);@0qaTV(zul_J%P1r<JBKBXns0CHYq1r-BmJ;zTe<y1a}~Hc&zjjkuV`y%qsU;
zY3&nh?zgmDN-<yb@bL4c>3Mtw`ufm>!D@f=oFLs}C*C6LM2(eMyo5D$SXtFK39~3A
ztm;e&J27dciE~!wz;!2v66V4FL^q*{cEX^Y@H*LrjY6@3wYH%Kr6EAeQ#@V|@E43$
zk6HEp!|d4QcPxgX#ovmF(`j~eS+X099+961j1R|X_WJ(dg`)1cl}8uQ*WP)^`gp^U
zW1-;|+3W3)o$h)^J)ZACzdv$R9t{mg!-Ek`D%<*Ev7u;7L&MQ<tZ$^JJ}@}Y&=u|v
zN8%0r{i6eQ{%}a{X$k}#?qE|>$RBKSxOcmngQ38lpexwq>}l$81)T0s(CzAV@8Hqh
zPeL9v!ktVp;$%)HKn3!tJiIl>^Vidz41YOw^9O2{1bok;JwB^c65t!iB=Ynfh?BYM
z2(TZqBlmz6;4s(cVIN{S&%O^a$<y;?8p^}R5qIa=pF*6-!)FoWQ&%GaUPVsH!<P|H
z=HXxHrr7g*f3phz0kH$$=qd?t1sNIJ0c=Gez(?3SupSfGIKGXT$D9;Z{2%1Os0K*D
z1dAMdR`BN>D=T;jVG`rxKDa+oTR>UEU(4bZ`x?YNKHa_nG3Pn)@h-%27V!sw0Bx)A
zcUR%(R^iAh{L@wVMZ{oK7l`L~mSfKfp5Rzn!B8Iy#bdy?`9LUwO!W9b%-@53Uo<wX
za=G3gIOdCne-erUc)f@BcZB`@p+Ic^U~r^A1ohtbcBjv&^^Gtv(jV<@st;rZ4&N@V
zZ_@fbTHmM(oDNOm)J>hbja%0=>WnUF)Fq8B-CwiLxOoV{Fb^Rx9COwOcqqLik$|r!
zbTk}sg3pJCxpBxB4fy;0!@k~Ncob!7+z=QX4*49uM(7#o&Hgw!=THsMD>BiTe>eud
zD0<+WZ`9vE67qHI=P+;#d;|V)#21c)W57A^1!9B4QGFDb`~1Vh{^PntRfi%$$PUt!
zSsctCsRAy?fOhd87yoI>?mG`I@1qjm3cRVi0k;M6{Qs{`L}U2w=5`JZ%lO^=vBvXG
z_rvWCd2)t!lPKWV5r6jLk<9+D#=9HvIOeFi5MWOZss1>u#+Gk4h<2CwWE$!0lZZL5
PXmTU|tLAg+dcFTY7d$5=

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script
new file mode 100644
index 0000000..fc4f0cf
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script
@@ -0,0 +1,72 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+/* No tile with memory exists to the south. */
+. = 0x40000;
+. += 0x10000;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+. = 0x60400;
+buf5 = .;
+. += 0x400;
+. = 0x64000;
+buf4 = .;
+. += 0x400;
+. = 0x68000;
+buf3 = .;
+. += 0x400;
+. = 0x70400;
+buf2 = .;
+. += 0x400;
+. = 0x74000;
+buf1 = .;
+. += 0x400;
+. = 0x78000;
+buf0 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_2);
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll
new file mode 100644
index 0000000..bf98238
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll
@@ -0,0 +1,158 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+; Function Attrs: noinline
+define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
+  %3 = call float @llvm.aie2p.inv(float %1)
+  %4 = fmul float %0, %3
+  ret float %4
+}
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_2() {
+  br label %1
+
+1:                                                ; preds = %76, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %76
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf2, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = getelementptr float, ptr @buf1, i32 %3
+  %9 = load <16 x float>, ptr %8, align 4
+  %10 = extractelement <16 x float> %7, i64 0
+  %11 = extractelement <16 x float> %9, i64 0
+  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
+  %13 = insertelement <16 x float> poison, float %12, i64 0
+  %14 = extractelement <16 x float> %7, i64 1
+  %15 = extractelement <16 x float> %9, i64 1
+  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
+  %17 = insertelement <16 x float> %13, float %16, i64 1
+  %18 = extractelement <16 x float> %7, i64 2
+  %19 = extractelement <16 x float> %9, i64 2
+  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
+  %21 = insertelement <16 x float> %17, float %20, i64 2
+  %22 = extractelement <16 x float> %7, i64 3
+  %23 = extractelement <16 x float> %9, i64 3
+  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
+  %25 = insertelement <16 x float> %21, float %24, i64 3
+  %26 = extractelement <16 x float> %7, i64 4
+  %27 = extractelement <16 x float> %9, i64 4
+  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
+  %29 = insertelement <16 x float> %25, float %28, i64 4
+  %30 = extractelement <16 x float> %7, i64 5
+  %31 = extractelement <16 x float> %9, i64 5
+  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
+  %33 = insertelement <16 x float> %29, float %32, i64 5
+  %34 = extractelement <16 x float> %7, i64 6
+  %35 = extractelement <16 x float> %9, i64 6
+  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
+  %37 = insertelement <16 x float> %33, float %36, i64 6
+  %38 = extractelement <16 x float> %7, i64 7
+  %39 = extractelement <16 x float> %9, i64 7
+  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
+  %41 = insertelement <16 x float> %37, float %40, i64 7
+  %42 = extractelement <16 x float> %7, i64 8
+  %43 = extractelement <16 x float> %9, i64 8
+  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
+  %45 = insertelement <16 x float> %41, float %44, i64 8
+  %46 = extractelement <16 x float> %7, i64 9
+  %47 = extractelement <16 x float> %9, i64 9
+  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
+  %49 = insertelement <16 x float> %45, float %48, i64 9
+  %50 = extractelement <16 x float> %7, i64 10
+  %51 = extractelement <16 x float> %9, i64 10
+  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
+  %53 = insertelement <16 x float> %49, float %52, i64 10
+  %54 = extractelement <16 x float> %7, i64 11
+  %55 = extractelement <16 x float> %9, i64 11
+  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
+  %57 = insertelement <16 x float> %53, float %56, i64 11
+  %58 = extractelement <16 x float> %7, i64 12
+  %59 = extractelement <16 x float> %9, i64 12
+  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
+  %61 = insertelement <16 x float> %57, float %60, i64 12
+  %62 = extractelement <16 x float> %7, i64 13
+  %63 = extractelement <16 x float> %9, i64 13
+  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
+  %65 = insertelement <16 x float> %61, float %64, i64 13
+  %66 = extractelement <16 x float> %7, i64 14
+  %67 = extractelement <16 x float> %9, i64 14
+  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
+  %69 = insertelement <16 x float> %65, float %68, i64 14
+  %70 = extractelement <16 x float> %7, i64 15
+  %71 = extractelement <16 x float> %9, i64 15
+  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
+  %73 = insertelement <16 x float> %69, float %72, i64 15
+  %74 = getelementptr float, ptr @buf0, i32 %3
+  store <16 x float> %73, ptr %74, align 4
+  %75 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+76:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare float @llvm.aie2p.inv(float)
+
+attributes #0 = { noinline }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.o
new file mode 100644
index 0000000000000000000000000000000000000000..728d41f24858dcddcc40661dd9b21afbd5fbd4eb
GIT binary patch
literal 2048
zcma)7VQ3R)7=AC8G*_738A<t;SU9`QRGZ$~>WV00Qk?Tg4MC=qDW<WetSs8nB<1|G
zbxpO3lUCW_M1=ku`!f;{3H{X{V~inESvCayQ6;E96e=i0&}Hv?_g#CG+4P0)-uHc;
z=brn%yYKG4`@Hv{AP8WG0Ft8vFd6_vBzKvDR%nD3EkSb;lUND^bAPY=hNJ|>zIhp-
z5}hqfNRLb!561-+RBx35C^iJZpp|!YX0-<JZGRhVL;<SN)3_<hMzdEo_O;4}{}b8x
zY^!V>ZI+GPhqCdLU*59`I#68bemtf?L4@(YH+Z9fbqCh38@(<mjPzoSNi9uSU@)nN
zS7F7yLHswocl;$@M+K#-jx%-(cC%fu-_p9xr5d?;*|jk55egGk%u%7gsm6A`k}J_2
z#drkQ7cteeKAf=?hW`)b+logH-L?P~M!Yp+?<=p>TFwBaycj6u^8v$ur;gvk`uxtL
z-+3`;_{*pduQ9*-$hX&@-xT^y<<ajU>a{ie7Lot7zTYMEyF}})pnk3U_YC=U+`r5D
zK<ToG^Lvi^aPiCd|38naIuoGv9_y>~Jg$hr(iL2<r%9pp_VvHdPYKd`zxuk4-*i4$
znid0wCxCvf#S<g%^~3pXFP_xr_<r%dVOQ^X+0_LcWgPc$JjAi+EyrBywYPX|@x3d@
zT0Xw^P7iGj>Y>gpdMMhghwia*%=gj8svf!?&_llj^`Gaq=uiJd=0S7i?(McdHdosG
zBGA>KUaVGPg&QK96e8AD@yL{j`Tk(}m#ZNPJ^;S28NT^U>gR6{lV-7+<mWL7IKFJe
zi*5(thL^P|?BmOr2L`boar7VXnlwKA^7Jip)`#O9@8|KO<O2^RSMd|8`sr0YwJwhZ
zYM9#i8@U_ncFDpIP~B+NKVTBvR$ZXP>n9G}{3LR`TUgP@v9|RW9QdpQ|HpwZIq+?+
z6`u&!w$GRYpK;*7JMcvZp2hnl&<7l25_3CW@2`ZDjzSwvrIT7VrDwH7A~}@m8cAdZ
zlIi4VV(|3P7?=!bXNR*X?Tgsa_DnW8@D23#9y@$sD49+TWDgIY&ZSeJWwN8$<SESO
z&QMMy&g9aW!JTj_H`oPQZ%<EW0`tnn28Ksdi4Me2(8&dz)O1+$TF3DaYWVU@wHyCe
zKRV8%TS}qgpc-j2B0Sqk;RW%lb+i}luyv%NM;_tC5;nAVX$#SOh@(D(!|6AJJ?&jP
uY`?PzIQ`yYBjJhUN88o*+lDpy*%2qg5jID2wjVFiq?^S|Je@*=uKgdN+&{(u

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.opt.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.opt.ll
new file mode 100644
index 0000000..9cfe6d1
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.opt.ll
@@ -0,0 +1,129 @@
+; ModuleID = 'air_project/div_kernel_0_core_0_2.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf0 = external local_unnamed_addr global [256 x float]
+@buf1 = external local_unnamed_addr global [256 x float]
+@buf2 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nofree noinline nosync nounwind memory(none)
+define float @__aie2p_scalar_fdiv(float %0, float %1) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.aie2p.inv(float %1)
+  %4 = fmul float %3, %0
+  ret float %4
+}
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #1
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #1
+
+; Function Attrs: noreturn nounwind
+define void @core_0_2() local_unnamed_addr #2 {
+  br label %1
+
+1:                                                ; preds = %76, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %74, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf2, i20 %4
+  %6 = load <16 x float>, ptr %5, align 64
+  %7 = getelementptr float, ptr @buf1, i20 %4
+  %8 = load <16 x float>, ptr %7, align 64
+  %9 = extractelement <16 x float> %6, i64 0
+  %10 = extractelement <16 x float> %8, i64 0
+  %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10)
+  %12 = insertelement <16 x float> poison, float %11, i64 0
+  %13 = extractelement <16 x float> %6, i64 1
+  %14 = extractelement <16 x float> %8, i64 1
+  %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14)
+  %16 = insertelement <16 x float> %12, float %15, i64 1
+  %17 = extractelement <16 x float> %6, i64 2
+  %18 = extractelement <16 x float> %8, i64 2
+  %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18)
+  %20 = insertelement <16 x float> %16, float %19, i64 2
+  %21 = extractelement <16 x float> %6, i64 3
+  %22 = extractelement <16 x float> %8, i64 3
+  %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22)
+  %24 = insertelement <16 x float> %20, float %23, i64 3
+  %25 = extractelement <16 x float> %6, i64 4
+  %26 = extractelement <16 x float> %8, i64 4
+  %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26)
+  %28 = insertelement <16 x float> %24, float %27, i64 4
+  %29 = extractelement <16 x float> %6, i64 5
+  %30 = extractelement <16 x float> %8, i64 5
+  %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30)
+  %32 = insertelement <16 x float> %28, float %31, i64 5
+  %33 = extractelement <16 x float> %6, i64 6
+  %34 = extractelement <16 x float> %8, i64 6
+  %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34)
+  %36 = insertelement <16 x float> %32, float %35, i64 6
+  %37 = extractelement <16 x float> %6, i64 7
+  %38 = extractelement <16 x float> %8, i64 7
+  %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38)
+  %40 = insertelement <16 x float> %36, float %39, i64 7
+  %41 = extractelement <16 x float> %6, i64 8
+  %42 = extractelement <16 x float> %8, i64 8
+  %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42)
+  %44 = insertelement <16 x float> %40, float %43, i64 8
+  %45 = extractelement <16 x float> %6, i64 9
+  %46 = extractelement <16 x float> %8, i64 9
+  %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46)
+  %48 = insertelement <16 x float> %44, float %47, i64 9
+  %49 = extractelement <16 x float> %6, i64 10
+  %50 = extractelement <16 x float> %8, i64 10
+  %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50)
+  %52 = insertelement <16 x float> %48, float %51, i64 10
+  %53 = extractelement <16 x float> %6, i64 11
+  %54 = extractelement <16 x float> %8, i64 11
+  %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54)
+  %56 = insertelement <16 x float> %52, float %55, i64 11
+  %57 = extractelement <16 x float> %6, i64 12
+  %58 = extractelement <16 x float> %8, i64 12
+  %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58)
+  %60 = insertelement <16 x float> %56, float %59, i64 12
+  %61 = extractelement <16 x float> %6, i64 13
+  %62 = extractelement <16 x float> %8, i64 13
+  %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62)
+  %64 = insertelement <16 x float> %60, float %63, i64 13
+  %65 = extractelement <16 x float> %6, i64 14
+  %66 = extractelement <16 x float> %8, i64 14
+  %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66)
+  %68 = insertelement <16 x float> %64, float %67, i64 14
+  %69 = extractelement <16 x float> %6, i64 15
+  %70 = extractelement <16 x float> %8, i64 15
+  %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70)
+  %72 = insertelement <16 x float> %68, float %71, i64 15
+  %73 = getelementptr float, ptr @buf0, i20 %4
+  store <16 x float> %72, ptr %73, align 64
+  %74 = add nuw nsw i32 %3, 16
+  %75 = icmp ult i32 %3, 240
+  br i1 %75, label %2, label %76, !llvm.loop !1
+
+76:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 53, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: nofree nosync nounwind memory(none)
+declare float @llvm.aie2p.inv(float) #3
+
+attributes #0 = { nofree noinline nosync nounwind memory(none) }
+attributes #1 = { nounwind }
+attributes #2 = { noreturn nounwind }
+attributes #3 = { nofree nosync nounwind memory(none) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll
new file mode 100644
index 0000000..61bace1
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll
@@ -0,0 +1,158 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+; Function Attrs: noinline
+define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
+  %3 = call float @llvm.aie2p.inv(float %1)
+  %4 = fmul float %0, %3
+  ret float %4
+}
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_2() {
+  br label %1
+
+1:                                                ; preds = %76, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %76
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf2, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = getelementptr float, ptr @buf1, i32 %3
+  %9 = load <16 x float>, ptr %8
+  %10 = extractelement <16 x float> %7, i64 0
+  %11 = extractelement <16 x float> %9, i64 0
+  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
+  %13 = insertelement <16 x float> poison, float %12, i64 0
+  %14 = extractelement <16 x float> %7, i64 1
+  %15 = extractelement <16 x float> %9, i64 1
+  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
+  %17 = insertelement <16 x float> %13, float %16, i64 1
+  %18 = extractelement <16 x float> %7, i64 2
+  %19 = extractelement <16 x float> %9, i64 2
+  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
+  %21 = insertelement <16 x float> %17, float %20, i64 2
+  %22 = extractelement <16 x float> %7, i64 3
+  %23 = extractelement <16 x float> %9, i64 3
+  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
+  %25 = insertelement <16 x float> %21, float %24, i64 3
+  %26 = extractelement <16 x float> %7, i64 4
+  %27 = extractelement <16 x float> %9, i64 4
+  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
+  %29 = insertelement <16 x float> %25, float %28, i64 4
+  %30 = extractelement <16 x float> %7, i64 5
+  %31 = extractelement <16 x float> %9, i64 5
+  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
+  %33 = insertelement <16 x float> %29, float %32, i64 5
+  %34 = extractelement <16 x float> %7, i64 6
+  %35 = extractelement <16 x float> %9, i64 6
+  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
+  %37 = insertelement <16 x float> %33, float %36, i64 6
+  %38 = extractelement <16 x float> %7, i64 7
+  %39 = extractelement <16 x float> %9, i64 7
+  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
+  %41 = insertelement <16 x float> %37, float %40, i64 7
+  %42 = extractelement <16 x float> %7, i64 8
+  %43 = extractelement <16 x float> %9, i64 8
+  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
+  %45 = insertelement <16 x float> %41, float %44, i64 8
+  %46 = extractelement <16 x float> %7, i64 9
+  %47 = extractelement <16 x float> %9, i64 9
+  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
+  %49 = insertelement <16 x float> %45, float %48, i64 9
+  %50 = extractelement <16 x float> %7, i64 10
+  %51 = extractelement <16 x float> %9, i64 10
+  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
+  %53 = insertelement <16 x float> %49, float %52, i64 10
+  %54 = extractelement <16 x float> %7, i64 11
+  %55 = extractelement <16 x float> %9, i64 11
+  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
+  %57 = insertelement <16 x float> %53, float %56, i64 11
+  %58 = extractelement <16 x float> %7, i64 12
+  %59 = extractelement <16 x float> %9, i64 12
+  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
+  %61 = insertelement <16 x float> %57, float %60, i64 12
+  %62 = extractelement <16 x float> %7, i64 13
+  %63 = extractelement <16 x float> %9, i64 13
+  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
+  %65 = insertelement <16 x float> %61, float %64, i64 13
+  %66 = extractelement <16 x float> %7, i64 14
+  %67 = extractelement <16 x float> %9, i64 14
+  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
+  %69 = insertelement <16 x float> %65, float %68, i64 14
+  %70 = extractelement <16 x float> %7, i64 15
+  %71 = extractelement <16 x float> %9, i64 15
+  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
+  %73 = insertelement <16 x float> %69, float %72, i64 15
+  %74 = getelementptr float, ptr @buf0, i32 %3
+  store <16 x float> %73, ptr %74
+  %75 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+76:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare float @llvm.aie2p.inv(float)
+
+attributes #0 = { noinline }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.elf
new file mode 100755
index 0000000000000000000000000000000000000000..a92e4ae85366d8708c15091ef67871349c6409af
GIT binary patch
literal 4192
zcma)9eQZ<L6+h2*{0wXA>p-&G1j-|P#1tp7V*@cAtwXa4F@s<mDBDmS$2P<wakAK9
zW)tfxNlPi922`Zmb%?B5H%*!}XWCelzq~adTD2RfAW^N8Jj%39TeTpylA8vybNt@3
zO`r)!y6>LfIp>~x&bc4He*WvhXC+AjgG^v6A|DF3*#XSh`#=E`I6(#rkhO}<pwUGD
z)GAQGu<iawDr)G;duut+d5Z+X0_~7nBp0=7mjT?|ZdPE*1rq^)mw-5Kd4Wyh)HV=X
zvKZB^w$aQ{+uw~*-;)$j<UH3x(4`!l6c7@P%+CV+xN|)`idZUm7w61&v0S!`El#_r
zUSk&<R@%j$<#sXnm|dK$wr_SoOICu+#+!#2WImA8>y>gFAn*QIQa@WHn`|!hm7C?Z
z`Wj@AxNWP>xtOKWeAo4)dfNp;0hU)(&VL83+y_r<wwOCcCkC^2)0J$6lsTHmzD??^
zndRP3?YG*V7~2)V`~rYA2g(ntppmG)jQaIs-`sXV6DZIx&x$8M)r+Wk1{fbQ*NzVj
z)r#u(7VvlF;rzXb{$4cKiRx=;@0usR8>nwwIKE@(@7NIf`v~ou=JA(9ee=Tpj-$Wh
zl<zIHpRa!ZLj6P3@5E5;_z5${_g}PkjeUF9gV!UkjMP%Tdpj0bk5lHl@l%*@MGd2T
zJ37D2AFHE$zuUclzgLIq#$Pqpii%qF=N#MD|E2ygzI9{Ir6=_LQuR4M{hrKEU%_?_
z+YM|VVav(acAAtkGx)8qx^r!3{pvG!(hV!?(hZ(v>4vuD>4vNP+Rmy~kLJ@2uh*s<
zepQ$L<;1e|Ki@^oUzgwd^M&;td}4gbC$%kB<>maXotZbxd{_$f>l+t13()zYQ}uz#
z+6Gks`o88M;pyqrPY<_^A~y`sNh7%pbUEh7z-9tPKMxnkGeUH8M^rw`M5W61DZd}v
z`1wA?pP9&u8kJ33szKdzT~=}~={0M6P6UIF&0%1_hDS0_<u{+6{ol+A$jCEz+MTA{
zip-l|Ah*q=KxXY1`~H10(=<JudIi+8@5^?$kbX<Dz%uRj_y(|Hyd6vwqE3bQUY2qz
zjQ9c=y=3Gl*1W?e7&s*b&83MB=<b2V;GCN1xZB<6q#M+=?h`!au0Ni#*5470lQm-M
z5)%qG^<5^ubGZs@c>^OW63&MxU%VgihA7<<7u71%kIepMX4Z)3Omv$b`_854i<?^w
zSMBI10Q35*2r#Zl;Cf>O*3O9DN0^A&9In;Ov=YQ)K?l~N)9lv#iJD<18ZB&MWVR2h
zi!rsss2Nn@vpU53%V--I!5bbpd=*4xz#%Fy$G5U<zWKQfjOxF7iABqyE=W1F8gtUj
z^Nkl3YGM4O(^eRdV>4#Ffi<H1v(IA=IkU}>E#R;Eg{m{V0}6oEA`ULby%4xz##Piv
z%a=PxW!KM6N>&h(>JVFAVPXqbpmLb;sqGH2W)gQS;yqZcLRG*<RnbVV2WC9E%A)Tl
zoFi9?`TqEf-#@YRPfReNdeV5OsWcue#hmekQNG0)@Ar4le!_go-^gN&PxnW^rv*|#
zc`DDZUX-~%C-bp)@Gf~@=HI_6^WE2F?)gyWj=#zG{~qJ~0bI=@A>L)VxBeKvDp~dU
zglm(-b*T>OC9Ego)58uv{W#XHjjG>o(*3l9VpX3PjhEO3^!-yhBe(;(JK*3So0*WZ
zOT_Z+4&Lziy!$P$RCCN1Jv{h)>3WK<!dM@kF*#i4PfGG#apX-Vj;wYFdyt9dc86H;
z3=?*BsaSD*sW>v~;F*&S;l_1mrllf){gF<_Gh3N}t&HmAk`_x#OkztLYET*jv^+}j
z27q2LdOaqz`wz1d=iauP`lo(t&K%2&12eMAWC@t*JYak{Mz=Ti2NjC@Crghm;IF^^
zfb|K+69;1ho0VX&UGeyv+)a4CBR%270p(C^AQ|sV=u*Ymol5m5H#avQh^M*-yP6_>
zz0Do*o_HeN+|zTYw=o=#DP7)3)a{RYy|HlA>-KN-wMJu+O;KOe>*?}#`63>FEb8~|
z_phVq?j@-VTHyBV62#e}OoBS(b7go{k*C+w?OA#`b<zX1KoY)Z(H@^w8cFaqWHM#?
zcEs6YbtKq<*j;?UO0bvoWjKUbDYNfJ%*yn%O#Nl}FyhWK`=f|6W%vYQeCp~X!E4B=
zW%wN8(K7rC!xVeU_cwF!9}v6ojjoXd7m-o0?ZQ?C5`2ig8|yKGjqpvx6m!<B;r}2H
z7A-&qR+u6jDB;ftt0g>*FpKe#5AtW$7f{jhR|>dfzZx;cXV^C*CZ34K`w%MyL=ON7
z+UDSI&B4#j!HGHeM|1E?h{2+52+HpS;XnzG5LQe0Y{6bzP_q9?!M=pg7wk(Ino_ZJ
z3g}q$#uCUR5BH|RUD$UgQv(`T?8A|Rp=A7ru_Qn+xOYc;JlqqDq;~X02YX`B6x_Pi
z6Y}VN3-k{5B=>upA_ajvv_bE^dcR5UTMU86txG(HsmHMK8=4k_F(fU9q{V0WYc&`@
zg%FKX2$6x5rzt|A>>o@-LS3-~@q`CLAw2vo{h?$e+!G!M?T^L}p-hV#B7Fm~kUQjs
zuEG5q=;Nc0S2IAb$RtzYffR(2=z+M<p>WS&EY!Y(VB{c#dc*NVD4vL?fH(+6QhftS
zV-(3l;emnhVMC&+V~Hpf2I<NIj+T$q0Ld|+7W#PT<EBq5eKyf&V`19{aCbZu(YBzb
z?F`&f$kTskgP6|H$sjwojurgP{ejL?XZRs|Q<+?#-4ztjRYp%{+OcGV_deiJE>d?T
nz@{S7{1L3jR&IA5?S4*^>141^ASPbb<<H<ha+=f78~y(Q$lEV}

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ld.script
new file mode 100644
index 0000000..6120a88
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ld.script
@@ -0,0 +1,78 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+. = 0x40400;
+buf2 = .;
+. += 0x400;
+. = 0x44000;
+buf1 = .;
+. += 0x400;
+. = 0x48000;
+buf0 = .;
+. += 0x400;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+. = 0x60400;
+buf8 = .;
+. += 0x400;
+. = 0x64000;
+buf7 = .;
+. += 0x400;
+. = 0x68000;
+buf6 = .;
+. += 0x400;
+. = 0x70400;
+buf5 = .;
+. += 0x400;
+. = 0x74000;
+buf4 = .;
+. += 0x400;
+. = 0x78000;
+buf3 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_3);
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll
new file mode 100644
index 0000000..666390f
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll
@@ -0,0 +1,158 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+; Function Attrs: noinline
+define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
+  %3 = call float @llvm.aie2p.inv(float %1)
+  %4 = fmul float %0, %3
+  ret float %4
+}
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_3() {
+  br label %1
+
+1:                                                ; preds = %76, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %76
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf5, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = getelementptr float, ptr @buf4, i32 %3
+  %9 = load <16 x float>, ptr %8, align 4
+  %10 = extractelement <16 x float> %7, i64 0
+  %11 = extractelement <16 x float> %9, i64 0
+  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
+  %13 = insertelement <16 x float> poison, float %12, i64 0
+  %14 = extractelement <16 x float> %7, i64 1
+  %15 = extractelement <16 x float> %9, i64 1
+  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
+  %17 = insertelement <16 x float> %13, float %16, i64 1
+  %18 = extractelement <16 x float> %7, i64 2
+  %19 = extractelement <16 x float> %9, i64 2
+  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
+  %21 = insertelement <16 x float> %17, float %20, i64 2
+  %22 = extractelement <16 x float> %7, i64 3
+  %23 = extractelement <16 x float> %9, i64 3
+  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
+  %25 = insertelement <16 x float> %21, float %24, i64 3
+  %26 = extractelement <16 x float> %7, i64 4
+  %27 = extractelement <16 x float> %9, i64 4
+  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
+  %29 = insertelement <16 x float> %25, float %28, i64 4
+  %30 = extractelement <16 x float> %7, i64 5
+  %31 = extractelement <16 x float> %9, i64 5
+  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
+  %33 = insertelement <16 x float> %29, float %32, i64 5
+  %34 = extractelement <16 x float> %7, i64 6
+  %35 = extractelement <16 x float> %9, i64 6
+  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
+  %37 = insertelement <16 x float> %33, float %36, i64 6
+  %38 = extractelement <16 x float> %7, i64 7
+  %39 = extractelement <16 x float> %9, i64 7
+  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
+  %41 = insertelement <16 x float> %37, float %40, i64 7
+  %42 = extractelement <16 x float> %7, i64 8
+  %43 = extractelement <16 x float> %9, i64 8
+  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
+  %45 = insertelement <16 x float> %41, float %44, i64 8
+  %46 = extractelement <16 x float> %7, i64 9
+  %47 = extractelement <16 x float> %9, i64 9
+  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
+  %49 = insertelement <16 x float> %45, float %48, i64 9
+  %50 = extractelement <16 x float> %7, i64 10
+  %51 = extractelement <16 x float> %9, i64 10
+  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
+  %53 = insertelement <16 x float> %49, float %52, i64 10
+  %54 = extractelement <16 x float> %7, i64 11
+  %55 = extractelement <16 x float> %9, i64 11
+  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
+  %57 = insertelement <16 x float> %53, float %56, i64 11
+  %58 = extractelement <16 x float> %7, i64 12
+  %59 = extractelement <16 x float> %9, i64 12
+  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
+  %61 = insertelement <16 x float> %57, float %60, i64 12
+  %62 = extractelement <16 x float> %7, i64 13
+  %63 = extractelement <16 x float> %9, i64 13
+  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
+  %65 = insertelement <16 x float> %61, float %64, i64 13
+  %66 = extractelement <16 x float> %7, i64 14
+  %67 = extractelement <16 x float> %9, i64 14
+  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
+  %69 = insertelement <16 x float> %65, float %68, i64 14
+  %70 = extractelement <16 x float> %7, i64 15
+  %71 = extractelement <16 x float> %9, i64 15
+  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
+  %73 = insertelement <16 x float> %69, float %72, i64 15
+  %74 = getelementptr float, ptr @buf3, i32 %3
+  store <16 x float> %73, ptr %74, align 4
+  %75 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+76:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare float @llvm.aie2p.inv(float)
+
+attributes #0 = { noinline }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.o
new file mode 100644
index 0000000000000000000000000000000000000000..0cbf9dffbcb51eefa1ba59e76899a9a17c429aac
GIT binary patch
literal 2048
zcma)7VQ3R)7=AC8G*_73EhFVyV&Uwj>DDxBYb`R7CB<$2s38oMGB>Yjjayk<Lz|Qv
zgRL`lPMla}gBxSef3iO#0g=#O{UKuv869Rrm_Mon^@l<Q2N~$H_r3eB{V22P3*Yy>
z@AEwO-1psm_uc1j4!tEw61X9OB2)lY3xJGdzb)v5R#?*#v=Fz6l_)Up7uq!>6|nX#
zssOdbeCd>O+orKQD(Rs6>kL41Api!QyrL`X4S>%^x?n2`P>r6}53*{t`&DaSr)mXv
zsMhZ7s&%+swF)n)*0(|R_2;1n#ifBe(;AdynE7LiKLMEcVE%Wj-=jp4UT!d@qYV=b
zrVR2ds(H7_|3>%DJi`0vpw;ym#;(8~)(>wuTCcrStFWkgmKJ?d=~NxZgf!CDV7nfx
zwZyCCR2<hAx7D;hoUs#D@E7E}%I_Sw>Hs?A{0(dGW4}{6&H$C79I6zHAuD*jiQm%Z
z{7$3aX*p~KtEi7|Fu$9~cQ>Ek8T30-M88|8k8R+$jQnfO{m!D_Sz2!m^&36E`^azN
z`JF3<D(7UJ-viV~%ZF0`|32#aT!_|tWVp%uI4_4Q=W)HhHjUOhJn}3*Ellfu_roTB
z7mDG^1vzB-Lg?37elPb-Kb+qy<zwao-!FlG*v0F9c5w+?728d0x3Deyt4WW3=?V57
zfv44E$IF+Vnvv~cGt#@wj3nC4$nUJ047{|pZbrTfnUNpE=H-QL=DlB$`K!J5+tsdN
zworNDVW_V~|EgX~mcEzStQ2?diapcf_V<J74Npq~0swqpGyL+IGQz(erp#kD%dcY!
zaD3H>Cj$cDg-4CAu}-aGKHDKD)X(vpGyyz$K3L_PgX2?PFXEvT03Qsm<KJIbkFKk!
z^!OYwhC>(sBKKh4tvL98%3GbB<|}!fyhH=f?+UnmByxN}TGOX6clBQi_`HDsDc~yt
zzSFbj6UW^3nHKOl0smRRmjyhJ&q<;K9Ag&8Zob~12qzsyY%-Hg$MP97A2W>fc&0CB
zOpT?p=}F_n@$qS}8Hjy6k<Y~5P9E-_%BROZf}x=!2ltPsvzf8{!HMIAYzAUe`N@2G
z6vt0a(l}Z;v75Sn>UM!)oGfIgFcEGyHZhqodJM#d1_yhMK0fw3^V&pz05yDhw%UvT
zt8aws=#f(B*eFNZjyNCPLE{PW>vgmj-EeiJp;sQ~#0nO)cWDdJe2AmEfKBw9!<zQ4
v8?Ijt0nzUX77AWSezaX(znz$qpBpJ6gmAeOV(#lDnsoCx5>J;<qkI1cP#r(5

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.opt.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.opt.ll
new file mode 100644
index 0000000..7485372
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.opt.ll
@@ -0,0 +1,129 @@
+; ModuleID = 'air_project/div_kernel_0_core_0_3.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf3 = external local_unnamed_addr global [256 x float]
+@buf4 = external local_unnamed_addr global [256 x float]
+@buf5 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nofree noinline nosync nounwind memory(none)
+define float @__aie2p_scalar_fdiv(float %0, float %1) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.aie2p.inv(float %1)
+  %4 = fmul float %3, %0
+  ret float %4
+}
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #1
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #1
+
+; Function Attrs: noreturn nounwind
+define void @core_0_3() local_unnamed_addr #2 {
+  br label %1
+
+1:                                                ; preds = %76, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %74, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf5, i20 %4
+  %6 = load <16 x float>, ptr %5, align 64
+  %7 = getelementptr float, ptr @buf4, i20 %4
+  %8 = load <16 x float>, ptr %7, align 64
+  %9 = extractelement <16 x float> %6, i64 0
+  %10 = extractelement <16 x float> %8, i64 0
+  %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10)
+  %12 = insertelement <16 x float> poison, float %11, i64 0
+  %13 = extractelement <16 x float> %6, i64 1
+  %14 = extractelement <16 x float> %8, i64 1
+  %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14)
+  %16 = insertelement <16 x float> %12, float %15, i64 1
+  %17 = extractelement <16 x float> %6, i64 2
+  %18 = extractelement <16 x float> %8, i64 2
+  %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18)
+  %20 = insertelement <16 x float> %16, float %19, i64 2
+  %21 = extractelement <16 x float> %6, i64 3
+  %22 = extractelement <16 x float> %8, i64 3
+  %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22)
+  %24 = insertelement <16 x float> %20, float %23, i64 3
+  %25 = extractelement <16 x float> %6, i64 4
+  %26 = extractelement <16 x float> %8, i64 4
+  %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26)
+  %28 = insertelement <16 x float> %24, float %27, i64 4
+  %29 = extractelement <16 x float> %6, i64 5
+  %30 = extractelement <16 x float> %8, i64 5
+  %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30)
+  %32 = insertelement <16 x float> %28, float %31, i64 5
+  %33 = extractelement <16 x float> %6, i64 6
+  %34 = extractelement <16 x float> %8, i64 6
+  %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34)
+  %36 = insertelement <16 x float> %32, float %35, i64 6
+  %37 = extractelement <16 x float> %6, i64 7
+  %38 = extractelement <16 x float> %8, i64 7
+  %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38)
+  %40 = insertelement <16 x float> %36, float %39, i64 7
+  %41 = extractelement <16 x float> %6, i64 8
+  %42 = extractelement <16 x float> %8, i64 8
+  %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42)
+  %44 = insertelement <16 x float> %40, float %43, i64 8
+  %45 = extractelement <16 x float> %6, i64 9
+  %46 = extractelement <16 x float> %8, i64 9
+  %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46)
+  %48 = insertelement <16 x float> %44, float %47, i64 9
+  %49 = extractelement <16 x float> %6, i64 10
+  %50 = extractelement <16 x float> %8, i64 10
+  %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50)
+  %52 = insertelement <16 x float> %48, float %51, i64 10
+  %53 = extractelement <16 x float> %6, i64 11
+  %54 = extractelement <16 x float> %8, i64 11
+  %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54)
+  %56 = insertelement <16 x float> %52, float %55, i64 11
+  %57 = extractelement <16 x float> %6, i64 12
+  %58 = extractelement <16 x float> %8, i64 12
+  %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58)
+  %60 = insertelement <16 x float> %56, float %59, i64 12
+  %61 = extractelement <16 x float> %6, i64 13
+  %62 = extractelement <16 x float> %8, i64 13
+  %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62)
+  %64 = insertelement <16 x float> %60, float %63, i64 13
+  %65 = extractelement <16 x float> %6, i64 14
+  %66 = extractelement <16 x float> %8, i64 14
+  %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66)
+  %68 = insertelement <16 x float> %64, float %67, i64 14
+  %69 = extractelement <16 x float> %6, i64 15
+  %70 = extractelement <16 x float> %8, i64 15
+  %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70)
+  %72 = insertelement <16 x float> %68, float %71, i64 15
+  %73 = getelementptr float, ptr @buf3, i20 %4
+  store <16 x float> %72, ptr %73, align 64
+  %74 = add nuw nsw i32 %3, 16
+  %75 = icmp ult i32 %3, 240
+  br i1 %75, label %2, label %76, !llvm.loop !1
+
+76:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 53, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: nofree nosync nounwind memory(none)
+declare float @llvm.aie2p.inv(float) #3
+
+attributes #0 = { nofree noinline nosync nounwind memory(none) }
+attributes #1 = { nounwind }
+attributes #2 = { noreturn nounwind }
+attributes #3 = { nofree nosync nounwind memory(none) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll
new file mode 100644
index 0000000..0c167b0
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll
@@ -0,0 +1,158 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+; Function Attrs: noinline
+define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
+  %3 = call float @llvm.aie2p.inv(float %1)
+  %4 = fmul float %0, %3
+  ret float %4
+}
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_3() {
+  br label %1
+
+1:                                                ; preds = %76, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %76
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf5, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = getelementptr float, ptr @buf4, i32 %3
+  %9 = load <16 x float>, ptr %8
+  %10 = extractelement <16 x float> %7, i64 0
+  %11 = extractelement <16 x float> %9, i64 0
+  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
+  %13 = insertelement <16 x float> poison, float %12, i64 0
+  %14 = extractelement <16 x float> %7, i64 1
+  %15 = extractelement <16 x float> %9, i64 1
+  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
+  %17 = insertelement <16 x float> %13, float %16, i64 1
+  %18 = extractelement <16 x float> %7, i64 2
+  %19 = extractelement <16 x float> %9, i64 2
+  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
+  %21 = insertelement <16 x float> %17, float %20, i64 2
+  %22 = extractelement <16 x float> %7, i64 3
+  %23 = extractelement <16 x float> %9, i64 3
+  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
+  %25 = insertelement <16 x float> %21, float %24, i64 3
+  %26 = extractelement <16 x float> %7, i64 4
+  %27 = extractelement <16 x float> %9, i64 4
+  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
+  %29 = insertelement <16 x float> %25, float %28, i64 4
+  %30 = extractelement <16 x float> %7, i64 5
+  %31 = extractelement <16 x float> %9, i64 5
+  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
+  %33 = insertelement <16 x float> %29, float %32, i64 5
+  %34 = extractelement <16 x float> %7, i64 6
+  %35 = extractelement <16 x float> %9, i64 6
+  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
+  %37 = insertelement <16 x float> %33, float %36, i64 6
+  %38 = extractelement <16 x float> %7, i64 7
+  %39 = extractelement <16 x float> %9, i64 7
+  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
+  %41 = insertelement <16 x float> %37, float %40, i64 7
+  %42 = extractelement <16 x float> %7, i64 8
+  %43 = extractelement <16 x float> %9, i64 8
+  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
+  %45 = insertelement <16 x float> %41, float %44, i64 8
+  %46 = extractelement <16 x float> %7, i64 9
+  %47 = extractelement <16 x float> %9, i64 9
+  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
+  %49 = insertelement <16 x float> %45, float %48, i64 9
+  %50 = extractelement <16 x float> %7, i64 10
+  %51 = extractelement <16 x float> %9, i64 10
+  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
+  %53 = insertelement <16 x float> %49, float %52, i64 10
+  %54 = extractelement <16 x float> %7, i64 11
+  %55 = extractelement <16 x float> %9, i64 11
+  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
+  %57 = insertelement <16 x float> %53, float %56, i64 11
+  %58 = extractelement <16 x float> %7, i64 12
+  %59 = extractelement <16 x float> %9, i64 12
+  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
+  %61 = insertelement <16 x float> %57, float %60, i64 12
+  %62 = extractelement <16 x float> %7, i64 13
+  %63 = extractelement <16 x float> %9, i64 13
+  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
+  %65 = insertelement <16 x float> %61, float %64, i64 13
+  %66 = extractelement <16 x float> %7, i64 14
+  %67 = extractelement <16 x float> %9, i64 14
+  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
+  %69 = insertelement <16 x float> %65, float %68, i64 14
+  %70 = extractelement <16 x float> %7, i64 15
+  %71 = extractelement <16 x float> %9, i64 15
+  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
+  %73 = insertelement <16 x float> %69, float %72, i64 15
+  %74 = getelementptr float, ptr @buf3, i32 %3
+  store <16 x float> %73, ptr %74
+  %75 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+76:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare float @llvm.aie2p.inv(float)
+
+attributes #0 = { noinline }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.elf
new file mode 100755
index 0000000000000000000000000000000000000000..b9ef327d81371a030c01aa887037ee689da34b56
GIT binary patch
literal 4196
zcma)9eQZ<L6+h2*{0wXA>p-&G1e!<qh$&8DCkA3VT8E_-q6Wb>P`05uj%|oV;$*SI
z%qGSwNlPi922`Y5G1RnDTTPlYXWCelzq~adTD2RfAW^N8Jj%39TeTpylA8vybNt@3
zO`r)!y6>LfIp>~x&bc4HKJrZ9SxJ(>AQRXM$cw^&9l(se7Zfmo6J)ReS*zF#8eIfH
ztqcVW+pd2kBZjWDx0V8(w@4r?&<@!}azVRh8NkhLW(B5PFyRMy35esC7uY0DZ3D46
zgHhdT8%-az{oNS#V@Ux;&T%aSUCO~p0U^=I+$_K^I@iIYh^4%Dao%hf%VoRR?6ix@
z)poIdg<b4kZWn`(*~QsP`z8l8XCz2(xOtF4`eRAGUM{x*^6t+h^~*)F$>u^|*;#I@
zt3n2e+qUSOi&@IecU@1aw_P9<U^zwQ{CCjGz3{YVi@9TTVlZPjUCES5>7zO9+oaB_
zS>E!6{Z`u(V>|trpC7R1K>1-6G!m7UQNM2Nds{DP0tNczS@Gl-dJ#3x0OLdE>hYnW
zYEk*&0{*T%oWB>*-;3rNQF#sRUGv0u1N99H$9D|<9UDS_pQ3%^JpQt%Z(7*jarAeb
z^1X%j^VRQPsDFt1ofxVfKVioB{)_gmu^;Sw@OtEwk!s3!Psal5amrjXehTv~t74RI
zN9Q;BV>OiTkGdD|_v%p1_^ak>QC5xqoMU_YztJDYw|4CL)P%lYD!$^UKa~0DE7-1K
zyMgUfY+3o*4wG_b2ETO`cdqTITXp76s(wXHs@}6KRo}KeRezOV+flLd(Ojzj_3Bjp
zuWC~7PAp6P^8?iUb@{D7Us%_{C&rh2Ufpa}Ue4Xxkv?bU!;-&W-?+e;pUw}Rs*g?9
zHmCs5_caFzPfw?Qdbn*AxnY1#8p&;-%P~I&HWMiNdALBH5u%$rqWoDV%2l>k`Tf|2
zul6dw^h8cnschO(3F_|avXXU4uUXr(A`ozF3IY2yJd%DYx9Rlk|7L0-EzjU-cbc-b
z>9@W{ZmUUw^qQ~t{`+LQae6xW3aDp4lI?II^|oYzW!mlW9I#=$9ZVFWPWkv=ma?@*
ze142xGIA8F-)9pHoRWg((nJSz_rPLsPEB;&?QU?=4eDC^Ii7OYA5U59?udrTDlv75
z2?d+_0TVyGT!FQ`fsqvm=R=e$+z)s|6mN-(Y6a>?X8$rXYs7OVvQ>|L$5Qmg&8>#3
zc68*2dHt378CN85og0BQGotqqCZaZnYZWuC0P$GBfwkx~yET8JYM6-z3!503?ZfI~
zOzkjg232^i4zcbs+WJTEh6fJc22t*Jh%(Ia?F^f5el7!}`mb7I(Q>E>P!6reoHX-%
z<3*X8A3y1|<;UaLv>9(;jVS-j5zHZLwi&Vo{8heCab}la0kB%cfyKBN{5Q<FiW+J8
za_6Y*dh?`Y1tF;pvH2AyHe&_KhZ&#R<`AnVamOOwjn&Fm1#DCmjr4k8#*-^8`hLPW
zvc;J1kI(r16HEWZ1Ouukjdz-g<H1tM8BZAHTbS{FfA{Ps%$NL)EXMeBfAo8rCk2$J
za{TH=nftObAA29~l8<EmldCe{bzSD3Ph{@+n|%N8G0yMD)hrO=otAs+kMXOLRi96|
zHac9FYM@TSdNMve?BLUnW9{0g`u!%|Pdg}9^@wP=#4e!kU(y-D9mw5&2mj2>gp^q#
zmTz<L`p4(pZ#kusW4`F&!RJfYQ+#E{`tY>L;d<kwB;OT>-(uqMDu=KKm}qKuh}!Qm
zVON)m+T%;b;ZX-qpL7T}t~)a=6@KgwcQT&d!USw#R413TSXyEdo7+%>(kP(iQHs|O
z^n%grF`?amn4LKPp54?x^;>iLSWfJpkzFQ>-%RHL<HIq!y|F*2P~10Jd~^YS{k;dQ
zPaqaQ5FOa01On}f$Jgj?#Pc2Q3B~s-2crXtSYKS1D$ed?vOlq@scC;K**)0R81CzB
z>WKBk;;E*do`by&p;%PuY6(Z&zDP?;G!$uZ`!;x6Bhm22h&R&W>1yfnhCRM$#OK}T
zTT9X1OHv86!0p*3h%*J51U1NKOYq79Pp_xjv-EQ6qz7uABz(`JJwB^6lHfbYq)YVe
zh%<%iNU$BTyYPUOU=Qg_a1gOlV&9FJmFQ`i`b+R3#GNJfM-it>@Cn5D)YVCX*N{_7
z@Oi|eCHNh~6no0|H*@eG5WDe>u8{;6kx{Vi#8v?ke1g3j>oJ0j@J+-NbH=RU{~!+*
zEkFiVm?G>i;x7rSMLdl#gYl6M@@Li;P|@+X^0;Wf3NgiJ*f$|2o`}YK5i5B_4*&_;
z=HTzo!OzdZ@j3YCbMQ-u!J=&l%I^eWe-V!mR*U#--d<Z!vOkx%FXA`z_C*Yh$!IDG
zbS!$KabyyQdXu3p?7I`m0gWs4q40rVBKEUr0{?#n_H1vDg?ggl<o3SEU{4ep16#Iu
zf*!qZhTg%R#J-lsa9-dJuGjk(z2B(!&4$3^)+HXp)MMEA3{A7a7?Nf~((E<-wHl0%
zLWsmDgz!Mp(-@{u_6^3v!LI23Slk1_ARhka{$L^;>In@5_eEj{QKrQW;l6=r&>i$b
z*WkWZ`fQ}n2F(DyB9lml29gj=pa<fD2SYuB(O~;_g5d)Y><z`@!B{+&1mYkVPWBBX
zj8P;Hh6V;ghYX3Pj>aR9AEYbuI8r)N10=_Q*3%~+WD9W~PyXwsuP?uG{kT6Kl4xsC
z({=}LE9B|_vq4N}=yZ^sTgM9i?*3HgsWbeLy{SYl(C!Kf=sKfkGwoWk!Mh*uI2Wk9
n9AIMsY5oY-VJo$J1MS|S$#gQ<ClC{_>hfpsUpdWb=#BpWGCVKz

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script
new file mode 100644
index 0000000..ddda3c2
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script
@@ -0,0 +1,78 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+. = 0x40400;
+buf5 = .;
+. += 0x400;
+. = 0x44000;
+buf4 = .;
+. += 0x400;
+. = 0x48000;
+buf3 = .;
+. += 0x400;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+. = 0x60400;
+buf11 = .;
+. += 0x400;
+. = 0x64000;
+buf10 = .;
+. += 0x400;
+. = 0x68000;
+buf9 = .;
+. += 0x400;
+. = 0x70400;
+buf8 = .;
+. += 0x400;
+. = 0x74000;
+buf7 = .;
+. += 0x400;
+. = 0x78000;
+buf6 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_4);
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll
new file mode 100644
index 0000000..678847a
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll
@@ -0,0 +1,158 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+; Function Attrs: noinline
+define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
+  %3 = call float @llvm.aie2p.inv(float %1)
+  %4 = fmul float %0, %3
+  ret float %4
+}
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_4() {
+  br label %1
+
+1:                                                ; preds = %76, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %76
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf8, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = getelementptr float, ptr @buf7, i32 %3
+  %9 = load <16 x float>, ptr %8, align 4
+  %10 = extractelement <16 x float> %7, i64 0
+  %11 = extractelement <16 x float> %9, i64 0
+  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
+  %13 = insertelement <16 x float> poison, float %12, i64 0
+  %14 = extractelement <16 x float> %7, i64 1
+  %15 = extractelement <16 x float> %9, i64 1
+  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
+  %17 = insertelement <16 x float> %13, float %16, i64 1
+  %18 = extractelement <16 x float> %7, i64 2
+  %19 = extractelement <16 x float> %9, i64 2
+  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
+  %21 = insertelement <16 x float> %17, float %20, i64 2
+  %22 = extractelement <16 x float> %7, i64 3
+  %23 = extractelement <16 x float> %9, i64 3
+  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
+  %25 = insertelement <16 x float> %21, float %24, i64 3
+  %26 = extractelement <16 x float> %7, i64 4
+  %27 = extractelement <16 x float> %9, i64 4
+  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
+  %29 = insertelement <16 x float> %25, float %28, i64 4
+  %30 = extractelement <16 x float> %7, i64 5
+  %31 = extractelement <16 x float> %9, i64 5
+  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
+  %33 = insertelement <16 x float> %29, float %32, i64 5
+  %34 = extractelement <16 x float> %7, i64 6
+  %35 = extractelement <16 x float> %9, i64 6
+  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
+  %37 = insertelement <16 x float> %33, float %36, i64 6
+  %38 = extractelement <16 x float> %7, i64 7
+  %39 = extractelement <16 x float> %9, i64 7
+  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
+  %41 = insertelement <16 x float> %37, float %40, i64 7
+  %42 = extractelement <16 x float> %7, i64 8
+  %43 = extractelement <16 x float> %9, i64 8
+  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
+  %45 = insertelement <16 x float> %41, float %44, i64 8
+  %46 = extractelement <16 x float> %7, i64 9
+  %47 = extractelement <16 x float> %9, i64 9
+  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
+  %49 = insertelement <16 x float> %45, float %48, i64 9
+  %50 = extractelement <16 x float> %7, i64 10
+  %51 = extractelement <16 x float> %9, i64 10
+  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
+  %53 = insertelement <16 x float> %49, float %52, i64 10
+  %54 = extractelement <16 x float> %7, i64 11
+  %55 = extractelement <16 x float> %9, i64 11
+  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
+  %57 = insertelement <16 x float> %53, float %56, i64 11
+  %58 = extractelement <16 x float> %7, i64 12
+  %59 = extractelement <16 x float> %9, i64 12
+  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
+  %61 = insertelement <16 x float> %57, float %60, i64 12
+  %62 = extractelement <16 x float> %7, i64 13
+  %63 = extractelement <16 x float> %9, i64 13
+  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
+  %65 = insertelement <16 x float> %61, float %64, i64 13
+  %66 = extractelement <16 x float> %7, i64 14
+  %67 = extractelement <16 x float> %9, i64 14
+  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
+  %69 = insertelement <16 x float> %65, float %68, i64 14
+  %70 = extractelement <16 x float> %7, i64 15
+  %71 = extractelement <16 x float> %9, i64 15
+  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
+  %73 = insertelement <16 x float> %69, float %72, i64 15
+  %74 = getelementptr float, ptr @buf6, i32 %3
+  store <16 x float> %73, ptr %74, align 4
+  %75 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+76:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare float @llvm.aie2p.inv(float)
+
+attributes #0 = { noinline }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.o
new file mode 100644
index 0000000000000000000000000000000000000000..75208a26da516d5325930c6a9cff6520d51ab84d
GIT binary patch
literal 2048
zcma)7Z)h837=JF8G*@uDGm`R}tZ=KkRGXe{S8H($lQMN5x)5X=#edhdOSPEUk|yOu
z(7JTaF~_>0a3Vs#4ZlbNGD1J~!w``n6{8H?hblq+pwPjD2%GVF?%r!(ifj77`#yhu
zzx&<$+`aeSd(VwNFG&)3CV?Vk0CpRIjNqU{=!SOK;!<fP;Q$+PVE*^D>j)}fA6``f
z>dEEu8RZuT#_45A2R+ze0Gd|<V9?Doy0Xy(cz3)9_96k<sA+#Lt9EBlwGVfzcK9*X
zermsJpXgNW;=`)_Nmza6LFhwrdFa=94azbs{J1BW1dRJI{<}TsQ{o7(H<{AafdLv*
zMtBt0{Cnho<A)Y*<9&3{8u|ibU%@aNglAo@-)Yq@uByJZ)qqq!(?Fk;#ygtq;2pJ|
ze4>(0V1EgRP5t4DU9-dAAl_4X@#r<Tpu=piX&<^1bW>Lupjwh6)lw;9hi|mdTic!9
zS=2i#N9}M8`SBgrcN6j6*6TZmdgn^0cMJK(4tnc|KiOLEJnEgNe(T8J>G}PE_%5E`
zg;Jz?LB{p{iTrrwc>4d}M?+tVP`@X~TD*^oa<qC8`wetx)bH5%{q(db_50ErE%Yvx
zqSZ@s#12GIue<Wf?0xlceUDdOvsU<i3H`$^-w3kHYdC5+ZsNFwV?9_)`SdGyaqbG;
ztEIXgy>icr?T=cq{(V*~*=fbTXSG!5k-ZHo_G!e5eG#>;uI#gZ{}z#-JL}(F>ltG!
z)rbCy47BMVHtMPJXEIxq67F4bW?I7eelYyO*Or730N>XPzkH^Q^RI^~%NQ;4>zD$(
zzA40mA)(-h+s%(KPjA)&j>4>vKfq(+gz)5fXA^S|jy%s-@lXmu0LHfAuWrjHw&j%i
z0&dYj*TcWaVT^kfw|<1;b~mQ=N`5ykljHGkLfttMd3{LQ;-@k8_#X)MWug9)P~Q;h
z2Yg#P35-3Rd7-`})V~($>q5PN&q<;K9BmPOFYotDsuPdQ$Y*mIqmZ=<hG}M|vjel{
z++-$~$(vKBrsu&yz<6t>kTqUNo#>q_WG3H)(b1E~j!b8A*~!AOnN!7F7L2(<zL1$f
z|MqF}6UC`vnx3X<NW9DBOg?M&nS)@Or;E9{se@pQj*Rr11Kjt!>)OS96gj*-hwaDz
z)yKkh^hn8c926t$Sb}@cX*?i&JCF9FXC99@^vV+)*uaGLE^Q%N4`Gz&aEN+KnA6_%
v%+q@v3!>g#OccD3^k}<!dIvBjJ<rkv2<Bn5X7BY9OuS|EgwrK7=-&SUdbvNa

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.opt.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.opt.ll
new file mode 100644
index 0000000..849c352
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.opt.ll
@@ -0,0 +1,129 @@
+; ModuleID = 'air_project/div_kernel_0_core_0_4.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf6 = external local_unnamed_addr global [256 x float]
+@buf7 = external local_unnamed_addr global [256 x float]
+@buf8 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nofree noinline nosync nounwind memory(none)
+define float @__aie2p_scalar_fdiv(float %0, float %1) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.aie2p.inv(float %1)
+  %4 = fmul float %3, %0
+  ret float %4
+}
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #1
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #1
+
+; Function Attrs: noreturn nounwind
+define void @core_0_4() local_unnamed_addr #2 {
+  br label %1
+
+1:                                                ; preds = %76, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %74, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf8, i20 %4
+  %6 = load <16 x float>, ptr %5, align 64
+  %7 = getelementptr float, ptr @buf7, i20 %4
+  %8 = load <16 x float>, ptr %7, align 64
+  %9 = extractelement <16 x float> %6, i64 0
+  %10 = extractelement <16 x float> %8, i64 0
+  %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10)
+  %12 = insertelement <16 x float> poison, float %11, i64 0
+  %13 = extractelement <16 x float> %6, i64 1
+  %14 = extractelement <16 x float> %8, i64 1
+  %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14)
+  %16 = insertelement <16 x float> %12, float %15, i64 1
+  %17 = extractelement <16 x float> %6, i64 2
+  %18 = extractelement <16 x float> %8, i64 2
+  %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18)
+  %20 = insertelement <16 x float> %16, float %19, i64 2
+  %21 = extractelement <16 x float> %6, i64 3
+  %22 = extractelement <16 x float> %8, i64 3
+  %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22)
+  %24 = insertelement <16 x float> %20, float %23, i64 3
+  %25 = extractelement <16 x float> %6, i64 4
+  %26 = extractelement <16 x float> %8, i64 4
+  %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26)
+  %28 = insertelement <16 x float> %24, float %27, i64 4
+  %29 = extractelement <16 x float> %6, i64 5
+  %30 = extractelement <16 x float> %8, i64 5
+  %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30)
+  %32 = insertelement <16 x float> %28, float %31, i64 5
+  %33 = extractelement <16 x float> %6, i64 6
+  %34 = extractelement <16 x float> %8, i64 6
+  %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34)
+  %36 = insertelement <16 x float> %32, float %35, i64 6
+  %37 = extractelement <16 x float> %6, i64 7
+  %38 = extractelement <16 x float> %8, i64 7
+  %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38)
+  %40 = insertelement <16 x float> %36, float %39, i64 7
+  %41 = extractelement <16 x float> %6, i64 8
+  %42 = extractelement <16 x float> %8, i64 8
+  %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42)
+  %44 = insertelement <16 x float> %40, float %43, i64 8
+  %45 = extractelement <16 x float> %6, i64 9
+  %46 = extractelement <16 x float> %8, i64 9
+  %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46)
+  %48 = insertelement <16 x float> %44, float %47, i64 9
+  %49 = extractelement <16 x float> %6, i64 10
+  %50 = extractelement <16 x float> %8, i64 10
+  %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50)
+  %52 = insertelement <16 x float> %48, float %51, i64 10
+  %53 = extractelement <16 x float> %6, i64 11
+  %54 = extractelement <16 x float> %8, i64 11
+  %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54)
+  %56 = insertelement <16 x float> %52, float %55, i64 11
+  %57 = extractelement <16 x float> %6, i64 12
+  %58 = extractelement <16 x float> %8, i64 12
+  %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58)
+  %60 = insertelement <16 x float> %56, float %59, i64 12
+  %61 = extractelement <16 x float> %6, i64 13
+  %62 = extractelement <16 x float> %8, i64 13
+  %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62)
+  %64 = insertelement <16 x float> %60, float %63, i64 13
+  %65 = extractelement <16 x float> %6, i64 14
+  %66 = extractelement <16 x float> %8, i64 14
+  %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66)
+  %68 = insertelement <16 x float> %64, float %67, i64 14
+  %69 = extractelement <16 x float> %6, i64 15
+  %70 = extractelement <16 x float> %8, i64 15
+  %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70)
+  %72 = insertelement <16 x float> %68, float %71, i64 15
+  %73 = getelementptr float, ptr @buf6, i20 %4
+  store <16 x float> %72, ptr %73, align 64
+  %74 = add nuw nsw i32 %3, 16
+  %75 = icmp ult i32 %3, 240
+  br i1 %75, label %2, label %76, !llvm.loop !1
+
+76:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 53, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: nofree nosync nounwind memory(none)
+declare float @llvm.aie2p.inv(float) #3
+
+attributes #0 = { nofree noinline nosync nounwind memory(none) }
+attributes #1 = { nounwind }
+attributes #2 = { noreturn nounwind }
+attributes #3 = { nofree nosync nounwind memory(none) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll
new file mode 100644
index 0000000..9a0f789
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll
@@ -0,0 +1,158 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+; Function Attrs: noinline
+define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
+  %3 = call float @llvm.aie2p.inv(float %1)
+  %4 = fmul float %0, %3
+  ret float %4
+}
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_4() {
+  br label %1
+
+1:                                                ; preds = %76, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %76
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf8, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = getelementptr float, ptr @buf7, i32 %3
+  %9 = load <16 x float>, ptr %8
+  %10 = extractelement <16 x float> %7, i64 0
+  %11 = extractelement <16 x float> %9, i64 0
+  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
+  %13 = insertelement <16 x float> poison, float %12, i64 0
+  %14 = extractelement <16 x float> %7, i64 1
+  %15 = extractelement <16 x float> %9, i64 1
+  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
+  %17 = insertelement <16 x float> %13, float %16, i64 1
+  %18 = extractelement <16 x float> %7, i64 2
+  %19 = extractelement <16 x float> %9, i64 2
+  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
+  %21 = insertelement <16 x float> %17, float %20, i64 2
+  %22 = extractelement <16 x float> %7, i64 3
+  %23 = extractelement <16 x float> %9, i64 3
+  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
+  %25 = insertelement <16 x float> %21, float %24, i64 3
+  %26 = extractelement <16 x float> %7, i64 4
+  %27 = extractelement <16 x float> %9, i64 4
+  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
+  %29 = insertelement <16 x float> %25, float %28, i64 4
+  %30 = extractelement <16 x float> %7, i64 5
+  %31 = extractelement <16 x float> %9, i64 5
+  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
+  %33 = insertelement <16 x float> %29, float %32, i64 5
+  %34 = extractelement <16 x float> %7, i64 6
+  %35 = extractelement <16 x float> %9, i64 6
+  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
+  %37 = insertelement <16 x float> %33, float %36, i64 6
+  %38 = extractelement <16 x float> %7, i64 7
+  %39 = extractelement <16 x float> %9, i64 7
+  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
+  %41 = insertelement <16 x float> %37, float %40, i64 7
+  %42 = extractelement <16 x float> %7, i64 8
+  %43 = extractelement <16 x float> %9, i64 8
+  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
+  %45 = insertelement <16 x float> %41, float %44, i64 8
+  %46 = extractelement <16 x float> %7, i64 9
+  %47 = extractelement <16 x float> %9, i64 9
+  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
+  %49 = insertelement <16 x float> %45, float %48, i64 9
+  %50 = extractelement <16 x float> %7, i64 10
+  %51 = extractelement <16 x float> %9, i64 10
+  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
+  %53 = insertelement <16 x float> %49, float %52, i64 10
+  %54 = extractelement <16 x float> %7, i64 11
+  %55 = extractelement <16 x float> %9, i64 11
+  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
+  %57 = insertelement <16 x float> %53, float %56, i64 11
+  %58 = extractelement <16 x float> %7, i64 12
+  %59 = extractelement <16 x float> %9, i64 12
+  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
+  %61 = insertelement <16 x float> %57, float %60, i64 12
+  %62 = extractelement <16 x float> %7, i64 13
+  %63 = extractelement <16 x float> %9, i64 13
+  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
+  %65 = insertelement <16 x float> %61, float %64, i64 13
+  %66 = extractelement <16 x float> %7, i64 14
+  %67 = extractelement <16 x float> %9, i64 14
+  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
+  %69 = insertelement <16 x float> %65, float %68, i64 14
+  %70 = extractelement <16 x float> %7, i64 15
+  %71 = extractelement <16 x float> %9, i64 15
+  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
+  %73 = insertelement <16 x float> %69, float %72, i64 15
+  %74 = getelementptr float, ptr @buf6, i32 %3
+  store <16 x float> %73, ptr %74
+  %75 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+76:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare float @llvm.aie2p.inv(float)
+
+attributes #0 = { noinline }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.elf
new file mode 100755
index 0000000000000000000000000000000000000000..8162c282e6181e1320623e660451ca081a79c550
GIT binary patch
literal 4132
zcma)9eQZ<L6+h2*{0t)Xb-s3+KzW3ZnBpXMk|r@7okOz<F%!WwP`05tj_nYOd@Xh|
zGb!Vhqyq}50kzt#I<%}(H%*!}XWFtTe|c*_v}!j{gG9AX@+i|bZPkL*X>J<G&hdNC
zF@Yu=>Arh@=bU@)Ip==(`s7#rFG!LE2ARN?Lk%e0U<WW`-vA1jzzH%~fUH$)22HF3
zpjLnahV9Vb;$cIV-&^y6&RZl97HEa^IytA^u@T_*ezOAeE|~EF{1AxamKWJ9PHh9R
zCxubnX_-i#vi;Q<^=(N3Mb2<71YOF(NdY0z$jlPJ4?1?i6Nsg(cX8Ql7n@|e*yFT|
z((QKf%oe*iyvZ(xp0tbCO6|KHP?wS**?4=HLGlAhy;&%?0P^mSCH24SWRuN>zS2wF
zR#}D&61VNuITy1On(w=wRqwh$D8Mp`%K2}gnK!^!G+WFaqZ30ZyXktWKuVs<VBaEj
zlr8c4PwjVFo}N7D!~A@JH3!NMtDuo6y@vW-lV9IAr3n-mke9@>pXx=_JOfORn#-q0
zN6SU&du#Z+{&@agLVqurD@5rW+PhYX?-uH-*N*Qr`a3;}{yswcnpOOzQD3{Zzq9D?
zEaiI#?N_VcKT!V|^*c9OK7G!N@%<a^U6bE9_~`Y>DC6an?~(R3*5iV?V)_E+TTsR*
z-}a6#^2aJD-*0uV;qR5vis@I(<)WY*{W&L(4t$|MjBn@UcM>!DekuBlUwlvI7q4TR
z!*&bXN7&NxT$@R`w20rzqI+{~mD?`eOH^&CNK|<?CaPLCC8}=lxwfLMPh=8Rua_sP
zeo>M5`OL<|-`_>epEuq4<J7KpJ~LhXNqL=BIg`25mVCp^$0Xl?zHxywADtgMRUeqF
zEl>oY?`sYco}Lc<^l;k*a$^7;G?Lpumt%emY$j0j^Dsr85u%%WqVNSK3RQMg`R!!m
zXGaxpawa3nRJLF#1@-VvSxLL3SFNpS;rBas2Z8+(HYA_R?7q14pT*6Plo#=|J5A}$
z$u~boZl6hk<c`me{_}jYW?>=zGN`Y;FWX@%@s?zPjoR(;HDF_SJD4a$owD(rk<y!u
z_<R_>WaKEezr$u2I3)$mrI~i<?t%5-oS$jG-(Bsb8`QP)6FlXv-(Rp+-V@ccWn%s+
z6ACu<T_(PHtq5y*3nR-B&W9+IyC3j|SiU8$s70tBU;5MHk`d3P@IF2Ewi5Kk&CQ0Z
zR&?ZpRs9wC7*`~4y)h0u7Dev{CL%V6Ya26d0r8~Yfwkx`yET8JY>bI&3!52V>c{G0
zOsz0s230gz9b(rtwDpbS4G$c?1ftOA5CxdyTPe2M{9Fb`_1{)((Q>HpQx46>oHX-l
z<3)j*9Y5=|Wyj;#q#18ujVS-rNz5T_wi&WD{FS~~bm@>!0kB%c@%6YDe7DTFiW+J8
za_5BX`ssPe3PMsHV$aJ=?7<2Yjxj#J-yyco;*Ldp7^{`73fQPB8tL`GjAyr6^!<c$
zq?coUI6mX|Pn7(D2?kV88t*hMj|WRGXFOq)Z*Ini{XMXsFkkXFz8>S#{n77fmK0E)
z%kUdlWbRGNeDWQ<OWv3HcW=o2&`p_pK9srRFY?2`$2h+aS2IV54_Y3qKgO?0R((F<
zYI3-)RzRhM^<;cu%)u9)!rHY^^#@G4pH@(;>Pb<3l}(}VAJZAZ9mw522mjd2gp?{4
zoAx_+)l;kPw~SKCF<<oX=<}uPDZT<@eR$I3aQ)=GB;OY!Z!$5m%^~c5CTd$9V)OG%
z*wqrT`D}?8nQ-vrd53W0x-(OW@L@mF!FX~n6R?+2om|p-sn{g;w4es15kSjR6t55H
z1*6wvM!WwoJ8}7KyJ=wl*XHEuj5xL^yG$0Jna%^ohhubmV}DSgxOaB>(FOeVw;!=S
z{%GIv$lz|p?{8H+-Wqoep6^gku<w{M92ty7`}=gM;_Qyc2V%QxYmY_a-9w!<q5j_5
z_GnMEFHzgmGu&Gpj7F5s`cT;I4cFI4g5i3%x3Qr)90@gr8^ZOT&ic-VkjEPddmFmE
zJ1M#cNy>v3xVu!0IF*x0P=S0p4{y!!^m@9xL@%cfdZ1=W!uKrN<FiU53BHU>GEd)%
zIF+l81P2hia}QVvj*vbN2M{ZH_T7kCo}QLzAP=8F+>vL03UM+IpF@mKU7aL&6*)Bz
zUq(EUhks_6Vo&-0Y6bosVmH3gHIm>8G77eX*or`c53zS+J;t#SzKxh-PMJ0QFXX|Z
z1<1e(cM1EJ@&5>`%XkrC3gaUm<gY+qKt;!eS-fn&12M&C*n1EYPekJz5i40l4*&_C
zUx5#=z#S{_zzRIF0-r$)7HvaNeisP)mhmOR>N19!cq9=A+UC8HK4fAidgH-P?7L&}
zL5<7x!O-zQEc*RO48ZR{a-cOD?1_Zp2l~T9JrSt!@7?POc=WywdWU*qUG+7gtiT<3
zM(^wOzDe)v41ve3OFV|D$FT7lnmU6qBz1<QuEFrvY%pF5AsnR;LW6NnO^8D28tMxL
zIwQxTeI5t|@G#d61Y)6JPjE2M6^;(0Op6;r{ezJJh21&S<?&>{Zu)vP6ZDHrEFK(;
zLm-Aehzkq{dxj!`)&m4X$05)gjP?bhebG1&2Z2z$e=uf@B6%P<I2b%(NHle%FAUj1
zx-yHy`6D$zatx@MK27v#%<eoNF7V?KZ3Sw^aaNJ1|Gx$?ouS=Lc5WRj_`CT%ou|(5
zL-wXTxj?%K6wq}<&tBSfWP^7%;Bm}Rb0I)e4r%@fR$|My>qomE(PTOq?9+&eS9Q4+
M|5ekRhTiD^55T1-w*UYD

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script
new file mode 100644
index 0000000..51c13db
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script
@@ -0,0 +1,72 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+. = 0x40400;
+buf8 = .;
+. += 0x400;
+. = 0x44000;
+buf7 = .;
+. += 0x400;
+. = 0x48000;
+buf6 = .;
+. += 0x400;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+/* No tile with memory exists to the north. */
+. = 0x60000;
+. += 0x10000;
+. = 0x70400;
+buf11 = .;
+. += 0x400;
+. = 0x74000;
+buf10 = .;
+. += 0x400;
+. = 0x78000;
+buf9 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_5);
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll
new file mode 100644
index 0000000..e652b65
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll
@@ -0,0 +1,158 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+; Function Attrs: noinline
+define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
+  %3 = call float @llvm.aie2p.inv(float %1)
+  %4 = fmul float %0, %3
+  ret float %4
+}
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_5() {
+  br label %1
+
+1:                                                ; preds = %76, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %76
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf11, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = getelementptr float, ptr @buf10, i32 %3
+  %9 = load <16 x float>, ptr %8, align 4
+  %10 = extractelement <16 x float> %7, i64 0
+  %11 = extractelement <16 x float> %9, i64 0
+  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
+  %13 = insertelement <16 x float> poison, float %12, i64 0
+  %14 = extractelement <16 x float> %7, i64 1
+  %15 = extractelement <16 x float> %9, i64 1
+  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
+  %17 = insertelement <16 x float> %13, float %16, i64 1
+  %18 = extractelement <16 x float> %7, i64 2
+  %19 = extractelement <16 x float> %9, i64 2
+  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
+  %21 = insertelement <16 x float> %17, float %20, i64 2
+  %22 = extractelement <16 x float> %7, i64 3
+  %23 = extractelement <16 x float> %9, i64 3
+  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
+  %25 = insertelement <16 x float> %21, float %24, i64 3
+  %26 = extractelement <16 x float> %7, i64 4
+  %27 = extractelement <16 x float> %9, i64 4
+  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
+  %29 = insertelement <16 x float> %25, float %28, i64 4
+  %30 = extractelement <16 x float> %7, i64 5
+  %31 = extractelement <16 x float> %9, i64 5
+  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
+  %33 = insertelement <16 x float> %29, float %32, i64 5
+  %34 = extractelement <16 x float> %7, i64 6
+  %35 = extractelement <16 x float> %9, i64 6
+  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
+  %37 = insertelement <16 x float> %33, float %36, i64 6
+  %38 = extractelement <16 x float> %7, i64 7
+  %39 = extractelement <16 x float> %9, i64 7
+  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
+  %41 = insertelement <16 x float> %37, float %40, i64 7
+  %42 = extractelement <16 x float> %7, i64 8
+  %43 = extractelement <16 x float> %9, i64 8
+  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
+  %45 = insertelement <16 x float> %41, float %44, i64 8
+  %46 = extractelement <16 x float> %7, i64 9
+  %47 = extractelement <16 x float> %9, i64 9
+  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
+  %49 = insertelement <16 x float> %45, float %48, i64 9
+  %50 = extractelement <16 x float> %7, i64 10
+  %51 = extractelement <16 x float> %9, i64 10
+  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
+  %53 = insertelement <16 x float> %49, float %52, i64 10
+  %54 = extractelement <16 x float> %7, i64 11
+  %55 = extractelement <16 x float> %9, i64 11
+  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
+  %57 = insertelement <16 x float> %53, float %56, i64 11
+  %58 = extractelement <16 x float> %7, i64 12
+  %59 = extractelement <16 x float> %9, i64 12
+  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
+  %61 = insertelement <16 x float> %57, float %60, i64 12
+  %62 = extractelement <16 x float> %7, i64 13
+  %63 = extractelement <16 x float> %9, i64 13
+  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
+  %65 = insertelement <16 x float> %61, float %64, i64 13
+  %66 = extractelement <16 x float> %7, i64 14
+  %67 = extractelement <16 x float> %9, i64 14
+  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
+  %69 = insertelement <16 x float> %65, float %68, i64 14
+  %70 = extractelement <16 x float> %7, i64 15
+  %71 = extractelement <16 x float> %9, i64 15
+  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
+  %73 = insertelement <16 x float> %69, float %72, i64 15
+  %74 = getelementptr float, ptr @buf9, i32 %3
+  store <16 x float> %73, ptr %74, align 4
+  %75 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+76:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare float @llvm.aie2p.inv(float)
+
+attributes #0 = { noinline }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.o
new file mode 100644
index 0000000000000000000000000000000000000000..78cc76657161c0618a24de8080e832261028cff6
GIT binary patch
literal 2052
zcma)7VQ5=b6h1F6X<otYJtHaCWQDitmTJ>y+ihh!hDn*aA6*FIMsezE+NG?R+1e)M
z=?|?<=bUq_8z@hl(0{T&l7NiRU;QB>GNfWA1NWm!P=6?NFd@QbJm<ap+Dmaw58Qjs
z`ObI0-1FXjH}5_-_`D=Z;F$!9kO5q60g$n{-yw8DD{OEHT1q&DwKy>Ud)n7nRKPyG
ztOC@Mi={c`7pIKVNl6DiSZ4s52LUkX<Qc83Hvrxl?t-02KsIXHpUbM<9#rkaovIz)
zquK{{tM>7B)h;}$+8>A2XC8(gB$xVsozb8q!|cr+!6abZgYn<3L7x)G@@j)A9c>t(
zF=c>9am~L&{x^PT_747z4q9EGW$Y_B$o9jtF4yn0Y8RGO-^y}8D$Ui=C#B)G2HSU6
zttFo<rxVy;!eLW?xMSDs@V8j+D!+L2x(nzq8En{x?grh|RR*XO<w&JijM(8DP4rf_
zr*{_h&dO0cTt$9-i~ZfgdUx~vokP8IMbx{Ed}9l}RjfbNT<<*Uou__l$lvPu{eksu
zJiiOYNaccz`}-65@$yUQ|9>BKeIY{qo)~KKJ}%19%0=uq(56wpL&Fc!)1uVx%cq;@
zT`EQ^m*j{Yh@f6)`IX5B>f!#LD8FVc@$(Y;hh4rAWS3WPRB_zGaT~{Ku$uDeSMK55
z5xQSZbv%CMz7^XYwPL-ytXQ($iv7T<snBCP>sIX3h!wjQwZ2%|W&QpgR(@`;eSf`c
zh%Hqf`76@bqJL1YrAnX4Y+g#Zzlt-{63+L7;g7zSB!mF?xn}s~Gi8{6Jxp1|Xr8ZQ
z3UGW~hzI=w;D<YnIn2}RSODI}cv8q8;xTbTc=NouUUM&w(>!0sODO~a_;6EwVN*V|
zDW}vIa6toI7ylv;V%)8`_z{X*-S{JPlHZL><aqp>fIC+r$NQxXei~zs|Gt1P3iwX~
zz9!&%eH%Inj6Izh0bdaCZv=c*!1MT=B)Y)S=FxZae!nD~cx1*@HkUE-Su1atW@bFw
zH)&3fW^$P+bL`ak3^)}q-k8W|jTchKyQlM+(br*c@Winr<C$D`G=FU3R3VoIV>&;T
z&y1jd>kRpk!r0TI_UOb^*6cA4fN7p7<fg~=fiXBR&};T_-%Bfc-MPoz*|z4R$l@(J
zd_VqYeIl%+$4aK-pcu=JCAjyT#sjKv=A}IndXGmOdhH1uWNVnv`K4n-`=J`;BRE98
z1<dLEdgkf9ih!th4-*9!k{%sdPj4^Aq~}?h3W9kx+OzkHsZ6{@^i-!MH0Zbg1Bs+S
AEdT%j

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.opt.ll
new file mode 100644
index 0000000..6e22dce
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.opt.ll
@@ -0,0 +1,129 @@
+; ModuleID = 'air_project/div_kernel_0_core_0_5.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf9 = external local_unnamed_addr global [256 x float]
+@buf10 = external local_unnamed_addr global [256 x float]
+@buf11 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nofree noinline nosync nounwind memory(none)
+define float @__aie2p_scalar_fdiv(float %0, float %1) local_unnamed_addr #0 {
+  %3 = tail call float @llvm.aie2p.inv(float %1)
+  %4 = fmul float %3, %0
+  ret float %4
+}
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #1
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #1
+
+; Function Attrs: noreturn nounwind
+define void @core_0_5() local_unnamed_addr #2 {
+  br label %1
+
+1:                                                ; preds = %76, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %74, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf11, i20 %4
+  %6 = load <16 x float>, ptr %5, align 64
+  %7 = getelementptr float, ptr @buf10, i20 %4
+  %8 = load <16 x float>, ptr %7, align 64
+  %9 = extractelement <16 x float> %6, i64 0
+  %10 = extractelement <16 x float> %8, i64 0
+  %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10)
+  %12 = insertelement <16 x float> poison, float %11, i64 0
+  %13 = extractelement <16 x float> %6, i64 1
+  %14 = extractelement <16 x float> %8, i64 1
+  %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14)
+  %16 = insertelement <16 x float> %12, float %15, i64 1
+  %17 = extractelement <16 x float> %6, i64 2
+  %18 = extractelement <16 x float> %8, i64 2
+  %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18)
+  %20 = insertelement <16 x float> %16, float %19, i64 2
+  %21 = extractelement <16 x float> %6, i64 3
+  %22 = extractelement <16 x float> %8, i64 3
+  %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22)
+  %24 = insertelement <16 x float> %20, float %23, i64 3
+  %25 = extractelement <16 x float> %6, i64 4
+  %26 = extractelement <16 x float> %8, i64 4
+  %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26)
+  %28 = insertelement <16 x float> %24, float %27, i64 4
+  %29 = extractelement <16 x float> %6, i64 5
+  %30 = extractelement <16 x float> %8, i64 5
+  %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30)
+  %32 = insertelement <16 x float> %28, float %31, i64 5
+  %33 = extractelement <16 x float> %6, i64 6
+  %34 = extractelement <16 x float> %8, i64 6
+  %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34)
+  %36 = insertelement <16 x float> %32, float %35, i64 6
+  %37 = extractelement <16 x float> %6, i64 7
+  %38 = extractelement <16 x float> %8, i64 7
+  %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38)
+  %40 = insertelement <16 x float> %36, float %39, i64 7
+  %41 = extractelement <16 x float> %6, i64 8
+  %42 = extractelement <16 x float> %8, i64 8
+  %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42)
+  %44 = insertelement <16 x float> %40, float %43, i64 8
+  %45 = extractelement <16 x float> %6, i64 9
+  %46 = extractelement <16 x float> %8, i64 9
+  %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46)
+  %48 = insertelement <16 x float> %44, float %47, i64 9
+  %49 = extractelement <16 x float> %6, i64 10
+  %50 = extractelement <16 x float> %8, i64 10
+  %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50)
+  %52 = insertelement <16 x float> %48, float %51, i64 10
+  %53 = extractelement <16 x float> %6, i64 11
+  %54 = extractelement <16 x float> %8, i64 11
+  %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54)
+  %56 = insertelement <16 x float> %52, float %55, i64 11
+  %57 = extractelement <16 x float> %6, i64 12
+  %58 = extractelement <16 x float> %8, i64 12
+  %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58)
+  %60 = insertelement <16 x float> %56, float %59, i64 12
+  %61 = extractelement <16 x float> %6, i64 13
+  %62 = extractelement <16 x float> %8, i64 13
+  %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62)
+  %64 = insertelement <16 x float> %60, float %63, i64 13
+  %65 = extractelement <16 x float> %6, i64 14
+  %66 = extractelement <16 x float> %8, i64 14
+  %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66)
+  %68 = insertelement <16 x float> %64, float %67, i64 14
+  %69 = extractelement <16 x float> %6, i64 15
+  %70 = extractelement <16 x float> %8, i64 15
+  %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70)
+  %72 = insertelement <16 x float> %68, float %71, i64 15
+  %73 = getelementptr float, ptr @buf9, i20 %4
+  store <16 x float> %72, ptr %73, align 64
+  %74 = add nuw nsw i32 %3, 16
+  %75 = icmp ult i32 %3, 240
+  br i1 %75, label %2, label %76, !llvm.loop !1
+
+76:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 53, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: nofree nosync nounwind memory(none)
+declare float @llvm.aie2p.inv(float) #3
+
+attributes #0 = { nofree noinline nosync nounwind memory(none) }
+attributes #1 = { nounwind }
+attributes #2 = { noreturn nounwind }
+attributes #3 = { nofree nosync nounwind memory(none) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll
new file mode 100644
index 0000000..5ef9373
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll
@@ -0,0 +1,158 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+; Function Attrs: noinline
+define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
+  %3 = call float @llvm.aie2p.inv(float %1)
+  %4 = fmul float %0, %3
+  ret float %4
+}
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_5() {
+  br label %1
+
+1:                                                ; preds = %76, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %76
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf11, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = getelementptr float, ptr @buf10, i32 %3
+  %9 = load <16 x float>, ptr %8
+  %10 = extractelement <16 x float> %7, i64 0
+  %11 = extractelement <16 x float> %9, i64 0
+  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
+  %13 = insertelement <16 x float> poison, float %12, i64 0
+  %14 = extractelement <16 x float> %7, i64 1
+  %15 = extractelement <16 x float> %9, i64 1
+  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
+  %17 = insertelement <16 x float> %13, float %16, i64 1
+  %18 = extractelement <16 x float> %7, i64 2
+  %19 = extractelement <16 x float> %9, i64 2
+  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
+  %21 = insertelement <16 x float> %17, float %20, i64 2
+  %22 = extractelement <16 x float> %7, i64 3
+  %23 = extractelement <16 x float> %9, i64 3
+  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
+  %25 = insertelement <16 x float> %21, float %24, i64 3
+  %26 = extractelement <16 x float> %7, i64 4
+  %27 = extractelement <16 x float> %9, i64 4
+  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
+  %29 = insertelement <16 x float> %25, float %28, i64 4
+  %30 = extractelement <16 x float> %7, i64 5
+  %31 = extractelement <16 x float> %9, i64 5
+  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
+  %33 = insertelement <16 x float> %29, float %32, i64 5
+  %34 = extractelement <16 x float> %7, i64 6
+  %35 = extractelement <16 x float> %9, i64 6
+  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
+  %37 = insertelement <16 x float> %33, float %36, i64 6
+  %38 = extractelement <16 x float> %7, i64 7
+  %39 = extractelement <16 x float> %9, i64 7
+  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
+  %41 = insertelement <16 x float> %37, float %40, i64 7
+  %42 = extractelement <16 x float> %7, i64 8
+  %43 = extractelement <16 x float> %9, i64 8
+  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
+  %45 = insertelement <16 x float> %41, float %44, i64 8
+  %46 = extractelement <16 x float> %7, i64 9
+  %47 = extractelement <16 x float> %9, i64 9
+  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
+  %49 = insertelement <16 x float> %45, float %48, i64 9
+  %50 = extractelement <16 x float> %7, i64 10
+  %51 = extractelement <16 x float> %9, i64 10
+  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
+  %53 = insertelement <16 x float> %49, float %52, i64 10
+  %54 = extractelement <16 x float> %7, i64 11
+  %55 = extractelement <16 x float> %9, i64 11
+  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
+  %57 = insertelement <16 x float> %53, float %56, i64 11
+  %58 = extractelement <16 x float> %7, i64 12
+  %59 = extractelement <16 x float> %9, i64 12
+  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
+  %61 = insertelement <16 x float> %57, float %60, i64 12
+  %62 = extractelement <16 x float> %7, i64 13
+  %63 = extractelement <16 x float> %9, i64 13
+  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
+  %65 = insertelement <16 x float> %61, float %64, i64 13
+  %66 = extractelement <16 x float> %7, i64 14
+  %67 = extractelement <16 x float> %9, i64 14
+  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
+  %69 = insertelement <16 x float> %65, float %68, i64 14
+  %70 = extractelement <16 x float> %7, i64 15
+  %71 = extractelement <16 x float> %9, i64 15
+  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
+  %73 = insertelement <16 x float> %69, float %72, i64 15
+  %74 = getelementptr float, ptr @buf9, i32 %3
+  store <16 x float> %73, ptr %74
+  %75 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+76:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare float @llvm.aie2p.inv(float)
+
+attributes #0 = { noinline }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_design.bif b/examples/elementwise_arith/air_project/div_kernel_0_design.bif
new file mode 100644
index 0000000..11c5e21
--- /dev/null
+++ b/examples/elementwise_arith/air_project/div_kernel_0_design.bif
@@ -0,0 +1,10 @@
+all:
+{
+  id_code = 0x14ca8093
+  extended_id_code = 0x01
+  image
+  {
+    name=aie_image, id=0x1c000000
+    { type=cdo file=air_project/div_kernel_0_aie_cdo_elfs.bin file=air_project/div_kernel_0_aie_cdo_init.bin file=air_project/div_kernel_0_aie_cdo_enable.bin }
+  }
+}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_div_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/div_kernel_0_div_kernel_0_sequence.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f2eb383b7eb903e8ad8809b7d44fb0a4d660bfb6
GIT binary patch
literal 3248
zcmcJQ-Aw~A5QNu<g?K=M2OfApq6JF0GAPG%L<ue-N=1yF_0OCH8j;via_gU++3a_@
zvk>neg4w5;eLtEZyo^(@j}krmd|WcZ&8OQJf_lyu+9e|Yukmso`>#0IXQ!}T>04{-
zdWRz5+iJIiZ@khs7OO7qzIV<49O}M3Vd35Et*zr4+n!k}H6?s5ZHHP2b8iNd9f!uq
z2%ELeWFG=0l9+H#Mw;SV&vYiNQGA=m?@TyH^<^fmv5ZNwd1Or5@6ws1SwdvOIT;Fy
zFEe2cjS1(dzRbimmN6++jgb*5y%`xL$~=|+C!AAeiZ3%^4UGxssJ_g^HI^|MPc*|&
zYb<A{(Vr%DO=9vtUuMD@8WYaRC|J1{n2BpFW3p2})f#K>`xk4J-t?`*yLP<eIY;$n
zCaj?`;T+YMnYhL>CTa8!jZ|yI(P^-T#)NY+)Rg;`nXrb&gmYA1X5t#lnB*HUVGWH5
W=cv75Caj?`;T+YMnYhL+nEU|x-i@&U

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/empty_0.pdi b/examples/elementwise_arith/air_project/empty_0.pdi
new file mode 100644
index 0000000000000000000000000000000000000000..a2347424a644d017f5e8ac814673b9061a6becd0
GIT binary patch
literal 368
zcmcc1z`)>QtR&b`o)&s$_o|r;3@i+cKrs~{2GJ%!3<8rIPKiKR5Yp2nAV5Jtfx!S-
z5XuJWBL`Gy9h?QS4_SX=W@>z9Zen^W!ax}W6O}bv!%~(5C<MdEvj3qV1SrVHz`%rT
k2eLX!*nR*1|78Ih;OycLw1I(v8OYwcbHjf@pk*)&03)a&DF6Tf

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/empty_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/empty_0_aie_cdo_elfs.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cba6b8778c42200ab6ec35c68cb3586f8fb4e055
GIT binary patch
literal 24
dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/empty_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/empty_0_aie_cdo_enable.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cba6b8778c42200ab6ec35c68cb3586f8fb4e055
GIT binary patch
literal 24
dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/empty_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/empty_0_aie_cdo_init.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cba6b8778c42200ab6ec35c68cb3586f8fb4e055
GIT binary patch
literal 24
dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/empty_0_design.bif b/examples/elementwise_arith/air_project/empty_0_design.bif
new file mode 100644
index 0000000..b22ae3c
--- /dev/null
+++ b/examples/elementwise_arith/air_project/empty_0_design.bif
@@ -0,0 +1,10 @@
+all:
+{
+  id_code = 0x14ca8093
+  extended_id_code = 0x01
+  image
+  {
+    name=aie_image, id=0x1c000000
+    { type=cdo file=air_project/empty_0_aie_cdo_elfs.bin file=air_project/empty_0_aie_cdo_init.bin file=air_project/empty_0_aie_cdo_enable.bin }
+  }
+}
diff --git a/examples/elementwise_arith/air_project/full_elf_config.json b/examples/elementwise_arith/air_project/full_elf_config.json
new file mode 100644
index 0000000..eab4fdb
--- /dev/null
+++ b/examples/elementwise_arith/air_project/full_elf_config.json
@@ -0,0 +1,134 @@
+{
+  "xrt-kernels": [
+    {
+      "PDIs": [
+        {
+          "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/empty_0.pdi",
+          "id": 1
+        },
+        {
+          "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/square_kernel_0.pdi",
+          "id": 2
+        },
+        {
+          "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/main.pdi",
+          "id": 3
+        }
+      ],
+      "arguments": [
+        {
+          "name": "arg_0",
+          "offset": "0x0",
+          "type": "char *"
+        },
+        {
+          "name": "arg_1",
+          "offset": "0x8",
+          "type": "char *"
+        },
+        {
+          "name": "arg_2",
+          "offset": "0x10",
+          "type": "char *"
+        },
+        {
+          "name": "arg_3",
+          "offset": "0x18",
+          "type": "char *"
+        },
+        {
+          "name": "arg_4",
+          "offset": "0x20",
+          "type": "char *"
+        },
+        {
+          "name": "arg_5",
+          "offset": "0x28",
+          "type": "char *"
+        },
+        {
+          "name": "arg_6",
+          "offset": "0x30",
+          "type": "char *"
+        },
+        {
+          "name": "arg_7",
+          "offset": "0x38",
+          "type": "char *"
+        }
+      ],
+      "instance": [
+        {
+          "TXN_ctrl_code_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin",
+          "id": "square_kernel_0_sequence"
+        }
+      ],
+      "name": "square_kernel_0"
+    },
+    {
+      "PDIs": [
+        {
+          "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/empty_0.pdi",
+          "id": 1
+        },
+        {
+          "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/square_kernel_0.pdi",
+          "id": 2
+        },
+        {
+          "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/main.pdi",
+          "id": 3
+        }
+      ],
+      "arguments": [
+        {
+          "name": "arg_0",
+          "offset": "0x0",
+          "type": "char *"
+        },
+        {
+          "name": "arg_1",
+          "offset": "0x8",
+          "type": "char *"
+        },
+        {
+          "name": "arg_2",
+          "offset": "0x10",
+          "type": "char *"
+        },
+        {
+          "name": "arg_3",
+          "offset": "0x18",
+          "type": "char *"
+        },
+        {
+          "name": "arg_4",
+          "offset": "0x20",
+          "type": "char *"
+        },
+        {
+          "name": "arg_5",
+          "offset": "0x28",
+          "type": "char *"
+        },
+        {
+          "name": "arg_6",
+          "offset": "0x30",
+          "type": "char *"
+        },
+        {
+          "name": "arg_7",
+          "offset": "0x38",
+          "type": "char *"
+        }
+      ],
+      "instance": [
+        {
+          "TXN_ctrl_code_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/main_square_kernel.bin",
+          "id": "square_kernel"
+        }
+      ],
+      "name": "main"
+    }
+  ]
+}
diff --git a/examples/elementwise_arith/air_project/input_with_addresses.mlir b/examples/elementwise_arith/air_project/input_with_addresses.mlir
new file mode 100644
index 0000000..f2c48f0
--- /dev/null
+++ b/examples/elementwise_arith/air_project/input_with_addresses.mlir
@@ -0,0 +1,328 @@
+#loop_annotation = #llvm.loop_annotation<mustProgress = true>
+module {
+  aie.device(npu2) @square_kernel_0 {
+    %shim_noc_tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
+    %shim_noc_tile_1_0 = aie.tile(1, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
+    %mem_tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
+    %mem_tile_1_1 = aie.tile(1, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
+    %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 27>}
+    %tile_0_3 = aie.tile(0, 3) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 29>}
+    %tile_0_4 = aie.tile(0, 4) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 30>}
+    %tile_0_5 = aie.tile(0, 5) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 31>}
+    %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32}
+    %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32}
+    %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32}
+    %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32}
+    %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32}
+    %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32}
+    %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32}
+    %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32}
+    %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32}
+    %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32}
+    %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32}
+    %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32}
+    %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32}
+    %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32}
+    %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32}
+    %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32}
+    %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32}
+    %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32}
+    %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32}
+    %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32}
+    %buf9 = aie.buffer(%mem_tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf9"} : memref<1024xi16, 1 : i32> 
+    %buf8 = aie.buffer(%mem_tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf8"} : memref<1024xi16, 1> 
+    %buf7 = aie.buffer(%tile_0_5) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf7"} : memref<256xi16, 2> 
+    %buf6 = aie.buffer(%tile_0_5) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf6"} : memref<256xi16, 2> 
+    %buf5 = aie.buffer(%tile_0_4) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<256xi16, 2> 
+    %buf4 = aie.buffer(%tile_0_4) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf4"} : memref<256xi16, 2> 
+    %buf3 = aie.buffer(%tile_0_3) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<256xi16, 2> 
+    %buf2 = aie.buffer(%tile_0_3) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf2"} : memref<256xi16, 2> 
+    %buf1 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<256xi16, 2> 
+    %buf0 = aie.buffer(%tile_0_2) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf0"} : memref<256xi16, 2> 
+    %mem_0_5 = aie.mem(%tile_0_5) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_12, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_11, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_5 = aie.core(%tile_0_5) {
+      %c0_i32 = arith.constant 0 : i32
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb4
+      aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1)
+      cf.br ^bb2(%c0 : index)
+    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
+      %1 = arith.cmpi slt, %0, %c256 : index
+      cf.cond_br %1, ^bb3, ^bb4
+    ^bb3:  // pred: ^bb2
+      %2 = vector.load %buf7[%0] : memref<256xi16, 2>, vector<32xi16>
+      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
+      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
+      vector.store %4, %buf6[%0] : memref<256xi16, 2>, vector<32xi16>
+      %5 = arith.addi %0, %c32 : index
+      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
+    ^bb4:  // pred: ^bb2
+      aie.use_lock(%lock_0_5, Release, 1)
+      aie.use_lock(%lock_0_5_13, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_4 = aie.mem(%tile_0_4) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_9, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_8, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_4 = aie.core(%tile_0_4) {
+      %c0_i32 = arith.constant 0 : i32
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb4
+      aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1)
+      cf.br ^bb2(%c0 : index)
+    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
+      %1 = arith.cmpi slt, %0, %c256 : index
+      cf.cond_br %1, ^bb3, ^bb4
+    ^bb3:  // pred: ^bb2
+      %2 = vector.load %buf5[%0] : memref<256xi16, 2>, vector<32xi16>
+      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
+      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
+      vector.store %4, %buf4[%0] : memref<256xi16, 2>, vector<32xi16>
+      %5 = arith.addi %0, %c32 : index
+      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
+    ^bb4:  // pred: ^bb2
+      aie.use_lock(%lock_0_4, Release, 1)
+      aie.use_lock(%lock_0_4_10, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_3 = aie.mem(%tile_0_3) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_6, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_5, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_3 = aie.core(%tile_0_3) {
+      %c0_i32 = arith.constant 0 : i32
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb4
+      aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1)
+      cf.br ^bb2(%c0 : index)
+    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
+      %1 = arith.cmpi slt, %0, %c256 : index
+      cf.cond_br %1, ^bb3, ^bb4
+    ^bb3:  // pred: ^bb2
+      %2 = vector.load %buf3[%0] : memref<256xi16, 2>, vector<32xi16>
+      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
+      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
+      vector.store %4, %buf2[%0] : memref<256xi16, 2>, vector<32xi16>
+      %5 = arith.addi %0, %c32 : index
+      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
+    ^bb4:  // pred: ^bb2
+      aie.use_lock(%lock_0_3, Release, 1)
+      aie.use_lock(%lock_0_3_7, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_2 = aie.mem(%tile_0_2) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_3, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_2, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_2 = aie.core(%tile_0_2) {
+      %c0_i32 = arith.constant 0 : i32
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb4
+      aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1)
+      cf.br ^bb2(%c0 : index)
+    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
+      %1 = arith.cmpi slt, %0, %c256 : index
+      cf.cond_br %1, ^bb3, ^bb4
+    ^bb3:  // pred: ^bb2
+      %2 = vector.load %buf1[%0] : memref<256xi16, 2>, vector<32xi16>
+      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
+      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
+      vector.store %4, %buf0[%0] : memref<256xi16, 2>, vector<32xi16>
+      %5 = arith.addi %0, %c32 : index
+      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
+    ^bb4:  // pred: ^bb2
+      aie.use_lock(%lock_0_2, Release, 1)
+      aie.use_lock(%lock_0_2_4, Release, 1)
+      cf.br ^bb1
+    }
+    aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0)
+    aie.flow(%mem_tile_1_1, DMA : 0, %shim_noc_tile_1_0, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0)
+    aie.flow(%tile_0_2, DMA : 0, %mem_tile_1_1, DMA : 0)
+    aie.flow(%tile_0_3, DMA : 0, %mem_tile_1_1, DMA : 1)
+    aie.flow(%tile_0_4, DMA : 0, %mem_tile_1_1, DMA : 2)
+    aie.flow(%tile_0_5, DMA : 0, %mem_tile_1_1, DMA : 3)
+    %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1, Release, 4)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb10
+    }
+    %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
+      aie.use_lock(%lock_0_1_0, Release, 4)
+      aie.next_bd ^bb10
+    }
+    aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0)
+    aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0)
+    aie.runtime_sequence @square_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
+      %0 = aiex.dma_configure_task_for @air_channel_0 {
+        aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
+        aie.end
+      }
+      aiex.dma_start_task(%0)
+      %1 = aiex.dma_configure_task_for @air_channel_3 {
+        aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
+        aie.end
+      } {issue_token = true}
+      aiex.dma_start_task(%1)
+      aiex.dma_free_task(%0)
+      aiex.dma_await_task(%1)
+    }
+    aie.packet_flow(15) {
+      aie.packet_source<%shim_noc_tile_0_0, TileControl : 0>
+      aie.packet_dest<%shim_noc_tile_0_0, South : 0>
+    } {keep_pkt_header = true, priority_route = true}
+    aie.packet_flow(15) {
+      aie.packet_source<%shim_noc_tile_1_0, TileControl : 0>
+      aie.packet_dest<%shim_noc_tile_1_0, South : 0>
+    } {keep_pkt_header = true, priority_route = true}
+  } {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>}
+  aie.device(npu2) {
+    aie.runtime_sequence @square_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
+      aiex.configure @square_kernel_0 {
+        aiex.run @square_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32)
+      }
+    }
+  }
+}
diff --git a/examples/elementwise_arith/air_project/main.pdi b/examples/elementwise_arith/air_project/main.pdi
new file mode 100644
index 0000000000000000000000000000000000000000..a2347424a644d017f5e8ac814673b9061a6becd0
GIT binary patch
literal 368
zcmcc1z`)>QtR&b`o)&s$_o|r;3@i+cKrs~{2GJ%!3<8rIPKiKR5Yp2nAV5Jtfx!S-
z5XuJWBL`Gy9h?QS4_SX=W@>z9Zen^W!ax}W6O}bv!%~(5C<MdEvj3qV1SrVHz`%rT
k2eLX!*nR*1|78Ih;OycLw1I(v8OYwcbHjf@pk*)&03)a&DF6Tf

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/main_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/main_aie_cdo_elfs.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cba6b8778c42200ab6ec35c68cb3586f8fb4e055
GIT binary patch
literal 24
dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/main_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/main_aie_cdo_enable.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cba6b8778c42200ab6ec35c68cb3586f8fb4e055
GIT binary patch
literal 24
dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/main_aie_cdo_init.bin b/examples/elementwise_arith/air_project/main_aie_cdo_init.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cba6b8778c42200ab6ec35c68cb3586f8fb4e055
GIT binary patch
literal 24
dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/main_design.bif b/examples/elementwise_arith/air_project/main_design.bif
new file mode 100644
index 0000000..27149ca
--- /dev/null
+++ b/examples/elementwise_arith/air_project/main_design.bif
@@ -0,0 +1,10 @@
+all:
+{
+  id_code = 0x14ca8093
+  extended_id_code = 0x01
+  image
+  {
+    name=aie_image, id=0x1c000000
+    { type=cdo file=air_project/main_aie_cdo_elfs.bin file=air_project/main_aie_cdo_init.bin file=air_project/main_aie_cdo_enable.bin }
+  }
+}
diff --git a/examples/elementwise_arith/air_project/main_div_kernel.bin b/examples/elementwise_arith/air_project/main_div_kernel.bin
new file mode 100644
index 0000000000000000000000000000000000000000..e44b65c166f6fc2f297dc1df988f5b8b540f6252
GIT binary patch
literal 22460
zcmeHPZ)_aJ6@PoT_8qCs?KyV3*bZJN1{-pL3o+C{0(*&DS=5Rg{<LlzwXRfDq_(On
zmC9dIF2t07L6``np%mdFp{i0poU1m9tbVwbfB->jAV{f7qXSa4m5RDXR62r#`(}3L
zc4v3S-XbFU;YPZ<-QWD?&71d|nSHn3GZ<!T)&QvD>vvxU$O0aON?IH{Tg4G$^x~PQ
zvrhN3x)i?CSGdkzqYl=n%PQ&u&$dA20szwhV!|lYF37O{>D#5)`P(<89q<aU=mPu%
z=#gz6vQA+0ERfyR9Ke~<eC6)!TcXw13<sP{-3G7(1YpAuKcg+{s{lWq+6EUQ0oiDi
zJekgs&6yn8J(MF|m*mLR{W)^m<{Y{6q8xdwD|cNUuEMBRu6g@M7F1q0%5Su1N`S}3
zcZ~9XF36;^1+-OLwX<6`;Q<yqyN4eYEVDiRQQ`XXGWO$uRd>p^{Tmpuhv7z_i(^Oc
z$eq<(YN^^{RPL_hXUUk_v}*5sKX;~d`ND*Y<8v`e0Carlv0q5n3s}Ex;pVYpKH$J~
zW|dt3J`bXA8Q}QgbocSYhr3DF%j?)%N@njKw0BRshjg7peqjy&PGSA__5HgS?cIAA
z?Y)WoE7q`A!}=@Nw|76<yPuBt4D#2C-#@TELHr&(+<pAPH2U{1<QEoholxshcjmh3
zc)v8cjvf!CdyYSZ<89ew(eX}B>FilObi7}^V;y@BAMQE+aJrkcbfdkYg#*)Cd+6U~
z3tyjE<nyKD1N)JeGxj4(_&SNNQ}}ulU$xB1eJSVY3chdYICpa2mVu+^W(NCvW(Ifk
z&J30|&kVk5pWN56^}_nh;N#shgTL&V`Nd-I%->(an!jv5^T%V`Chf)J8{h4|s@1u#
zer8|giL^ayxYK;%f^E5U{m@nQdaAVq9RPe@+hE}8nc}O*&d%e}EWi|PX=j1sI6e!q
zDRB6DI7ThwDVjM)+P`3t_Of-r`R&3r9~^M5tt{5brm}U`?1J)bZ)BWW!FaTFv_{6q
z^VfY7tY1M}<%arokF5TCr4K5Z6<qB@sajv<=@0Q}EagDu(hm>(^GM~2vuA(!0F)nl
zC6j|=GtU?%^!mHU7r~mv-GT0xqA>sNGirUpKNr0>#2CpXFItNh3>gmcjm1g0<GZi{
zhP=hekM7t$1keJ7%ihIRUike(ty|8K?I$)7?|F+j_$t3-k*~bafw4S=o;3>ghiCQ1
z{(w6~uuD8w?!fxF)jzGQ3V)7%YmEE8uM=(A=@DUS6b-qsro9%|vK<2oPt3ukE984^
z7P&o}FAP|zejpc(=P?#j>7s9sY?`&mcGFs%Tm3#p7kwIq`81TtaBH4ydjYxb9PaSA
zLw^=XyPGF1IL2qH)>{4B8L(*l2R54i7<$I(7)C^&wCA<jla?}Te_|-h+GE>F8h2p7
z(D7IA!ZFm+S%Ix%uj^39(fzIi&}x$JY{0qTo=T&Ne(}d^56x!^KR;r$0x`;Yvikvx
z?8XSR&sw&3Q=VLM0%t6q--gj*QGqOtieGp<aKtCJntVQCAGN^exPRjMM>_vt5esNM
zY5Sp6&>qajm~n;C@iuyl+mq%Kj+fe-+kpOYd-QtBiUZCKb^F!lGWNB#jJ@z8?j^5e
z>>s?EvG>1`v3LAAW9R>xiC>Sge;3tk6g;0W<@lrjWuuk%M+%?I7oP8dEe6KZvd_-u
z?XwqS>`FBH(<yFe6r9%bU1a<7)-kmGGrC4_2HHh8Z@-hah*8~0Hs6%D2QOZ8zSW&B
z8^?<l)a#4aQ~z3oK6WLQFFbX`$b3W&K5dbM19_4gx5$;FdD6GnBDr!W>ASy^9GuVF
zl_PmlMBS}aCvoxf;FM)o_E-e=STs%rV}r3VMRu342GQFAnRiovU7#5ZkH@0F|KaG!
zlh5W-)84Psm3!;t+bfwu%5>9oJ)nQshV#Yzpn<ZlJrN{bU_bt>LZ4`UNZZjUtMWrt
zoyZSWbs|4h)q%5<!mJqh9`4UNy@TP@kGL=TI#{D!R#8X$^~+VwT(z02Hgi?B_e8&o
z)SYIo+RRm(xhiJu5aiy?T(y9?WiwZ8=Bl4YuIlH2G<VbGhpIY}AFAp^eyFMgXD@|W
ziRY>%hO2W`eI1<E*JTxTv|qpMY38cUT(z02Hgi>W&x77!Yu;CF-dBBVjr&;5T(z02
zHt(x~`{~?Q_47QMyTx-=eVxb;RqaH6sHy|!77DXMb4K~T>VAf+b5(sEoYvQ66?L><
zzuer+RhzkLGgocqs?A)r`3ykw8Gz<904yzNJ_GRo`V4@d=TVX3xvIWS<cF$uB0p5s
z0e@aqm5++La65X;iq|iW@a{)YEDi2x@S+BHHTZ-EpVHvd8oaE*=QQ}d2Cr!Fss>-u
z;57~IMR4(~h<J8Gty}P1hnPbRg}fb&Ls>U9?r8W$4eo022@O7_!KXENS%c4M@Oceh
z(co1LzNEox8r+NEfo}faG1R&Rf8&Vj7JN^YbyMSxr^bsK+|}R{8hlEFPiye92A|X5
z^BTOO!K)g4NrTrkxEH|#-Td$LYTbhG`*GcZJzLgIjXRziFKTdCgHLGiDGffY!OI$a
zPJ_>D@QMbnYVaiuUen;72DhM>N<yFRm#$NISBBxVPSS<ltx@tRm6ZsenpW9oIIRow
zC-{DDL~u8<&(n6a6g7ND!?!ei(C{btJ{9*zeLW~@_)cWM^S#Fh$u)e?@F)2Ci~F<o
zeir3Bk#!vS<J*byLBpTm^D4@BHS?{g;X4|>rQt(_Py1(OX+N;XpAGzxPNdXsG+PDF
zY4F?#3*en^m=_A3+u*S&EOGz#l*xY)0MN<rF;YT!K3g1*#>dp)B2SI-+d}s+!}dk)
zD)_>0SSlrqgW$Q+*yxEI2f<q}4yA^ju%8xxUO@OMNTvoC_m-plw$O72Vf*5Kv*6QD
zR+e@ojDz60ykYPm$3gJci-UT95YG(@KSeJlfA%+ui|0Y2cw6XMVL1+dzk*LcSy}2&
z7ze>~dBfmCj)UN>7l;0afv}(Aw|>G;K{EO8^`f|VhC7P4g?`@`wlALR7JT~Ymu{2&
zJcM0+=#L2E#a$`k*MGk_!1)a5j*}57t-6N<AJQSd@jwF(Hw^RLu>I8KVOrxj%y+`}
z(?8YP7mbHoyOr13BFCwLaNJA{4jR0T<J3mjPFBP3<Ty<(!hB1^@8$SK5xyPa2Z87Y
zrY{3D`mUD3dIfxqUT%nw^C94;)bl~&r$c;=USU4Y2j4zZO*0=>+*u3<E2ytHl;Vl{
zGCZp9E{;d_8qx4aHMpz6$2lI=YfQt(`NsT=&Ig=tlK&xnQ~#sB0Y90(0Y90(0Y90(
zK0mp>$~+wPWw=6LhAZ?{@=;$Uj`}Kb)R*B3eU*IFm*Mm+yxvh?C4ZOaf1_*xD2Mr-
z%Z+cE5}xJ!&*M+e!+Z$%GX9=#=W3%bC-I?&%{SaJB>O{#k7K2T7dbzQgNE;NKI$de
zF*W=G=VOp0epbUD;d~sO#BbB^hrKk5zaq{IkB$rD%y31V1-~lA^F5!hh%@73axVGl
z7(K#{gkzkQIL29tW1JbTh_jN9ab`HaN&RA+m3)jd!xeE>@+~jTR=O3j?=JheBF>Dj
zh%>{HAB{7^qj4U99>JH!g>h5koR9z1K;rMx@J-IgC#xm?h=!l#{PqZcSi^7Q{8-#H
zeAC;Cp}}9lM(o?mK8{tApDyRe*1Lv3#`*E}qP9QE`LTJfv7hC9{6Z=DIjP}~bAD`|
zYxt(;-}8?9)3_Hd^*hY@@%X6uyEq^JorYv*U|G%2a(-8Yf2)Q+#`zl~{BaF`ob%)H
zQTt<hd)fSn>+9pu`7_M<_~TV+yuHh6ewOp&@l^B2I6odAHGiD*<KtEHCpka1-ZlQ4
z-YuH=_;@rvS<a7-SIzI_{J8&We$m^?_FKhzSMI|joFDhcw-fb$7w5<4gPK3g`SJCo
z=1*|`XCm{bu&m}!az6gA3TeKLY4`<irzSo=9v$xh=f~%>nveSfvmcL-nm@t$@$stp
z4(G@A|7A75;O*7;<Kxls+MFNvN6p9e!tBSccgt#iueX(5XWJrr6_<TH?vIZv^i^K3
zJGuS1KWh69=f~H(j=xjmzmG@#@AQ0Jp_kHsj6cKU{`<JFpT+}LJlq**p{+2e;b%s{
zGnh82I@7~{m4XHRV&AK!cKmPq)G~d^bvqI9@pzrTr>fie`&GD(9<#D!_}}fRGjgfo
zsXqq%JcK7jxt^Vw*nX|j`C3p<bMeO6lHq^n{P_Jc-x~HJPGG;ncZl@s-)sB${W9$u
z#(BSuIq{mWE#RbvK3_lKnknk|`YU6F?*>{kjBsoMO<d8MeOB>Zbe&uim$hxYCDKH0
zQ^m$f`emGkCbA7kq>0=nX}_{2+{Ss+qzvIw#ginl3Ae!+AfG1EL~c_>3rYHwHIZ#3
z(?o8Qv|m{hZsWXZLcIz~6%UidCTKyAs`6<fP2@K8EJ?qzCbErWn#gUE_A6_`ZJakv
zf*lR8B=wPUL#45sFf5TKAFp3o6WK;GP2@IJbU4XekTu~p&YLE#6wjnSve-A?WE(U#
zjjG7CHn_&iZIbpYYa-i7rit7pX}_{2+{Ss+q#VQ#1DVuEq=t%YBbg?0n<`pMGGAp)
yWE;sek=rEgSJs5vIB%L%KLJf-8_6`0+a!&Rtch$RnI>|Zr2Wd8a2ub5CjSM>GL3!!

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/main_mul_kernel.bin b/examples/elementwise_arith/air_project/main_mul_kernel.bin
new file mode 100644
index 0000000000000000000000000000000000000000..48ac55f27234b0be60abf7239927bde8dd3fe95a
GIT binary patch
literal 14460
zcmeHNO>87r5w7-h>{)PjCy9v$uTgq`gjSIcV{9ZNS)!dak_ASZ5Du0&V7PGMU~<X<
z2}fB8gb>;b$RHtn;XdM!!=6S;4xG>^r<_(!fO}6)IjmUfy;raM)q7Rm$11+8_Q{^^
z{_3lG)%8{X&OlKv_6vY1{{M0tpbxkbjGkYwZuvFKSi>!E8$7&WZ43Bm{zKdF0vlXl
z>kGCnvzu^^8)X3#fIT?BvSjBsA797GKmBg6f@U2p|9~ZUe*lwN0Ttlig8_^_LC;Sf
zKR7HtzH$@by`=|-@bUL*fEW7jzw@_ZI@qwjCKkYO_{OVlZGBX}xcJ}G@BjX#)A2uF
z`uFbAPkys;`W1lFKZFGvzFPN(f2^+n?C+khp8o0=U)wt}^X=82F5;{JIK$81Uj9*G
z=WyHi-o6@a&vu7D{oP`5)8vnKH;jF<5N*F)t%G6t!RIz8^LO!Z{u(xEKFofx_!91A
z{|xu?9>5*rHS7cJGy2z9WdPzoI7ah=ne+57A6#2FZvNhz{QC3LpFLVyUnuJy?H~FO
ze+V!z5zQEW>%&eY82;&lr6JrQ>xTuv=a18VGJXsAFZGjT^Zg{*d_PGxs5T932d2X2
z8$8^wxXB~UMYq8Petp4a=5^y6&wG3FEcdqS8Kbq<Pm;~|lVtP#B-x<4YhXLl-rlvi
z+}pYh9_qHfU^DZ&@y7Gsex`fd^@Y(o?QPxW`$_Wi{Uq6-ddt9em>vtS6N86uT3qgJ
z-3Ax<^#xnk+dp{T+s|}wyPh#kmG-u7^Zg|G`F@gYK=<2H|9V^KXV&d#Zum98`>%Ek
z6ueUKk%HF>K34D}1wU5s69vDo;AaXxQShmP&lLPz!J7#8&qDrQQr4}z&-6Xi3e;@0
zt6Vo3uN3}B!D|H{EBKLuA1nBYg5Ov0GX<Y0_*B7X3VyEOO@w#4x%(tpx9%P))vdd(
zaouFRYGiz*;I)E}75qrSj}`nx!S5^hnSxIge5&9x1wU8tCc-=2+&zG-TX%nu>eijJ
zxo$FEH8MU@@LIvg3Vx*E#|nO;;P(~$Ou;7#K2`9Uf}bmRqu>KrGm@CM3+y_DcX1Xs
zHl~ZcyA%0_YDdI3p{4j1H@1*Jrt^6b;dOMLH-2V}6n>@f2MQk){+P~FX+HAxV5IP?
z=zQn0#}_jS9~AzWj=waYi_d2<zlx6IE*}>s=7Yi?Q(wjWTIsiu!mkwmK;c8gH}l&u
zd(*u>EASO+G|s<WE_z<o^6H2M@UI`2S3R$8dA5Za_dkCv=?xvg3O$n;LfFs#`6sSV
zN&Fc1_o*?y96a-c@%_E4=bNP+v)#-(cwSA8-GbM_^PW_P-8Mj&=YZZv_&hzSRB-<+
z8uOQf_Zne*|9tBCW@*Q4BeM>kSCeD6;C1l4C)GhdKlt~UK2JYON$(<K+`n&%@#Wy%
zC9i{<*YnNNj@gaOI(S}9j@^RS!SkL}hZ}8xFi-zoh0oKIO8UMb#{E0x7+(&)p9tgo
z_spJemTs(;BeP<!KKRmI81<vwd!Anb@hz|_umN@-*YA2h^a6hI+ZGNTLw+5`UwA#F
zwd*0j3gh?wtj71wTZ^mWb#_F!2@uw;RB%x6Wx`F2Fiv0LuMlo-E<*l5;ja<?)rdcg
z_+24Z!Rl)PMc>D}p<W$d(W?&l*bg0lLGA~}?*)8CuaJ-Z;Nn}=lzw<r-@|g)f%^JY
zR!`K|;<3IvgvWYqEBrkLuNC}&@K~>Xg^&Ga^Njle`;Fxv=)3TL>f7=2^zHb0`gZ(0
zeH}kvU-3R1^|iR5uf+v@MLy~);;65PqrMgw^cDH2uf@$rc)X*&B7diGPn0hJoP_+9
zdHbZv@ILXsgI}G8eCYTb|IEetX7SoJ#)s>+-|&pV;@>G8jz)%$h#%vi@N43uUMx<j
z@YjitMPmHE!rvx7?j7SVEBvjdXX`K2+2ZkjVVx~5)Y<c=te&4azEEe&$D4DOXH{JH
zaTtzu7ICbzh+~~CF4S4%W1TIIA8cN%v&hFfTU@BK$R9L4d!*x6>@0pgceqez%NOcw
zapX^<I$J!h^KH29`D|XSn~W3xT9oIG!Y_$`HR5k8{66ttjQCp$f0_76-4uS=+``fT
z;LkhQ{to93M-#(q;wQ(u!rv!;dc4T-_lTeLxe~ume0-o}c^)eK1L7xruJFsoz4K1<
zY2SsjdAEq4)<@>=5FdYX!{XeY%ltm^@uxeC|3ihpPyAOR{(-_jAbwgOIiIq*Yx^hF
z*Wq#hY!M%SOvCbDo6Gz@@zZ+B{C(o5^^y4p#83BE<{uJ2Io_50%jPYmJ`RuT(<gqq
zzcPP?_-Xz!f7INv=Ud@;7th1n#82~aapL@Uh@bX@%-<q@dVI<JG4a0~_0RfT<{uLO
z<%qwp@YkD7r9KXi_xCpO(|(rucz&?))B4E#G4a#=mH8F%lk@*v=C3z*m3$l?@9&WK
zX+AO^j~5$1x!%oX{#tX(UT2phy+(6~r};Qs&{w=(uTcCnA31(S{PcL&_?t@p4v+I+
zX&f%-CFYOyw|JVr!+rc7uGrDwnZXR>g+&d2V**WKwUKOPQ`mVhFZ*20{OG%V6WRQ8
z+a@9oPi^j;D*L(fRcJHocHqxnM`pz&!^adi-9F&*UEHud&VdsBFB&T!b#`-)*lx`V
z_sse9^V;0ncu^8Kug@K#dEL9Vr=Qnq*9NC~+n)HyX$#oY@S4-lr&f9!9e*ciaA{z)
zq6ll#X;OP{yPV>upE_TY+WNL1S(<R4DLUuOYdNhZJO*TG!hLe*<(g28OVi{8!ZF2-
zoZ5sw*a3VqOB3!h#Rxg`a!q)QJWaSy?z~(Rig9V0n5?=n#nqhJbRK4H$~Ut#;XY<7
zXI`!ekCCSd_sN}?YeF$DO_T171~_taG~8gcXA_HMY4UXQa!q)QJWaUI6cf(r1+EFj
zxHL^_R?plVZSC6+9>dh8HAUCj?i$a1a_8lm@ECcTaG%_Hxh53j(lj~g>W78Q%@L`=
z;4$(v;XYH0mea3X6CNW^6Yi5cFV}=(T$(1+FF+F>BTp0VlUp0E36GJd3HQmJmuo^X
Iz6eeJ2joH%+5i9m

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/main_square_kernel.bin b/examples/elementwise_arith/air_project/main_square_kernel.bin
new file mode 100644
index 0000000000000000000000000000000000000000..8ba56366c72a88bc1322e4d03447a2fe49afc7c7
GIT binary patch
literal 11048
zcmeHNzi%8x6n=ZRu{U6n4LYKYW0Z9gp$J@@6C(vF5_`l*78v0qO(Y5o4Go1!MaM;0
z0U?B38YCn)^e2iGsa8sAbV`-Vx&;1!b*dB*=FQBT+xhj(#)^wb8OgV^-}~mxoA<t(
zo7*!)@k9~<e2RbP-T_DeJ4S2kett&xBx45ev@PA9m9`eXia*nq?O}sGY)QaYM|K)Y
zyon>&0$7C6;Ym5aczFvafAmp1hiVQU0;F&rnV&^DKn}N47z`uqz{A};*P{oAPXl~E
zdFMJj__zQ#l-&OIkEl%N<yeqyuDd=p@$cp*zZ~0K`Rmv}y~!_roZoyy+4O66yhs3+
zqS1-X@4k6+v3Q7)E#{9Va25dc`1Oq`F~5oj@7He}G46-G?8~2T!7SO8Q(+&p#Q6Fi
z$KO$R*wd|^TOck!OID8qY~F&y09gXZ*RjNUB=HN(BFOv*|7(c0L?oNP-#yw|tNv^c
zet5F^)zitjR$R1cJy2+VCh|)yd+%Od1jv59JDI^bvfjgtJlXa2B8GB$3E8w>LN=|J
zkPY$+0+tO!vGkd4&r4jy;m5+;V2^Q0z$WHZ?fgsa%0#@V+EtnmOW^bpvT419Y+5fN
z8|0S-ESs<0p2Wl2<!!Ld+me7y%&Xc5FSRQZ@uF&1)m<#X*Di0<dI^owdI{Mezb0VW
zgoey>LArfK;$iLbHrQia60p_Uz4}tSGUG3*c9kZ?5`69QHm#S?IIWkE4d}cZ(x<kW
z<3wM!YF76E&qw2yGI-A5T?Q{0e1*YR8GMbw`wYIp;9CqnVDOT`hYUVq@XEpUcM*No
z3)iha2WkzC3slXIyLR2ec+T*<3|=t!3WKjQ_!@)v8GM7mw-|iD;3b0(8GOXxm4nxH
zQ|Fv;-Rd)tU$^>m%dT4(&#N%rW$=Q*R~USi!PgkP&)^#jzQy1J1}_<W$lxOeuN=It
zoBE6o*RB5i_v=>g*>>H+cwU9^E`t{gzQW+E48F$TeFooP@GS-(FnGz}Lk1r)c*WqU
zJdMe3|JLV}I;E+t(5ef}h!TpY1ukFW!e;Wjv_D4<o;&-t7$;iF@PXlXX&>?ZhxhB0
z;e)dusb~PYq<7zcc>hQlJ~;22+P~7X??0^bl<Mg6!KtsX%Qn2C9`;1CWEz#%(fjd)
z=H+8vVR6LG-=D`vG}xiDwYWFytk3aoTru%oT%X4@uH%4VuVK8JSLo2jS3BO`7`Nk9
zxSn5=V*1|e;`+Of#>G{(;e5k*HLsw7^~H{t(Jeho((~)lF@x*p44PlZJKHc`&HKN2
zXLTr!9u%(U*QA)9=eoFlrl)anRc*S}6&<nRz$@a#YsJ&vy5{FVd<jf@TmkfUi<_Dc
zZG#{EG=|NACciNKx862s<G#tyP5<rRSpRzP!o+lVA0=EkFm)U=I57AW;lhXMCt>&<
z!bMOfKV|qcgr9KunZvIWF%7c55@71P+cWD`^O<@T1|RiM^IKs(Sp2rZXX<70Q4h+$
z)F-2dr^Py^LpH3h?ppcm67JUb65(#WE;9T@1}_+VnQ*sWOAH_NCgXSYfO?a-7k{Nr
zh=&0BrhYm*<CCL>C&d2{pfveV^KJaG^7C$V>ZrwslQJGul;!_iq;ND^c$fGt4h+8_
zKCajD6Epld;$z+{e!}oC5+7G*@uwJmTD9fAfj_la{9;Gp?z%9~k{`&k#F1ZG`FyPS
zZl2G=NzJ$B#k_@a;^TqB;$LL=G4Z{1G5m!1uQ~n~82%LTk2?HI3_q^UU}^yHXFV(a
zY)9c}vhafVo(>uQ67l`_OW6M+@$sieYu=pkzf62jw+#O}@xA;o{J2ujR{VIz&s?lH
z7l`lYC(OS@eEccX@^f}4%wHzHzuqvvLwtX|VSZd)mU{5(rEpgd3&h7CNv-&2cEbE+
z;`{N3`5of>@rU_wb&ZK%;comL;`{N0`B`;F?gOtn>z&(Cxa&VBzW07-_#NWE?)YC~
z_%p=!_8*3yRTr4}74FBca6kSqKdUY?{uQqMx3MEzg_##koHwTI-%{c#lD=?mc_uM`
zXVK59#c1(o+myD$Hl>@eP1W4AiN0+34;Ed~5y9B4#9JAsxZcDU$-}GJVjRC$>)fyH
z;&W<zH;mNxh5gSfV;lRUAjbK>uW}6d27^=PQxYtRwviV5NKs*+w<t1WtaDQ!sct29
zpFKB4?LhPz<;EUU;;<(3+PNVg2bP;YMpd^GyG>%OM~GhEzHO8ndrXNQn#^nGhI|}Y
zZbVddE3w-o#(IS4mG*6;+}LBpU6Xn3+>nn0%T4`G!|>!Po5U!%MzeQ-M!DJFymoHL
z$ARUhu+~-Lf}cG%wvUGU&A#Tfb3^kUSZ@0JiqX&vc5NTca$}EaS`#}r<m13{Q$B~>
T*gl%&#vZf3+>npwk(>Vj59vP!

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/main_sub_kernel.bin b/examples/elementwise_arith/air_project/main_sub_kernel.bin
new file mode 100644
index 0000000000000000000000000000000000000000..32bcee0ccb77c74fa42e783e1359dfb1a7806953
GIT binary patch
literal 14396
zcmeHN&ub*t5w4zTOXFm`J9d;X*;R0_cL|HpVPq+wfWR%U0wO3dj&oSZ!JAy{Lk_|x
zALX!+5JK2XKnaBC639{Hki(uD>~k3W2Q0Fa+*>~Rutcf%UcK&D?^S7^7wfX@gPHFB
z>Z^Ly^;Q3zfudX*6#xx>wzmLAfGfcm`t|CjU$cx=-14@`qZ`&Xz;E*jZPN>EaDi<k
z*!s*~g9UDs1^gXg7fxSXw)2~>@8jg3|9DtIy9S>DOyCXNv!53g!0GuvCvbv(xLyn%
zPGI&K#`yK}!@I@%do{odqlX{<t*9sK=;_9E^wxLW+S;UiY3YAQKl#JUNArKZ{GXlW
zU;K9c=-U8CfAr<rd$WAy==|V&yS0hBTYbI+W`2zLs}IIzewPn!-~ZrBu)Wxs{_^)r
z#cO1D(;558!1xE-uT*Rs<6)jQS2MRE#&$P<Q*6+DnD0{Y4S?y^G49uWfZN7vx&^eC
z$gi=gVC#jEA0D82VCFdb%g0v-2kk$GC%^gf=)K3wYlE^L(*B_j#h;pb)0qD7lU@{<
z{^{f8DcmOOCk4Ql=V|{KSFL{}o9`dV=KDvoLA7CEJ1`a2uaigX7B_jMx#%{yz;7hj
z%)D;A@w}Ixsa|$HVzkxzN3!|;k!-$yBpXzB3~Wc*%R3gAds(-^Bi%L<Y-V0J-g@54
z&r~nFJ}}y*y{y}O|44qme<T}J?-<w)(^KJn#N^T27MFWjx4{K|Bf-}9@=u@l@-x-T
zu18E0rM;}%eE&#(zJDYe(EYO1uih5=nRPqb8-5M&eyST21+Nr*rr@=L&lUWhf*&aO
zp@Kh9@M8r(QSe5=7Ycr=;BAEaXBmHQDC^eWH~Jpx1Zvm2Rj!+iR|<co;I)F!75tup
zA1L^tf<I93V+B7^@J7KG3Vy2KZG`u_x%(hlxBea})vdpdaouFRYGr(;;I)F!75tup
zA1L^tf<I93V+B7^@J7KG3Vy2KZG`u_xof_xTYo>0>eipLxo$FEwK6_a@LIv=3Vu()
z4;1`R!5=92v4Wo{c%$G81wU2rR>3E*Y9ujl3)poE@7gSGY)lt>Hz)E9)sBd7LQC;2
zZfqfcPUrI?!t3ZfZ~V-dDf~*|PZT~V{5hSc(tPCW!A#*-(fQ70k1tvjJ}CS-9e-&)
z7oX2!eia?ZeLgNu%m;-(r@o5$wbE}hg<mQBiNc47Z|1jS@wR&%R^ThpXq>)TE_q(n
z@#=^L@b7oaE1p+(Jln#I``^Eo^u`Tfg`PnSA?#=Wyc5@_Bz}zh`_vd82hTWRe1Gri
z`DSUyVkff>o>!A&zu<N7yeHLRrwb6~IidFrK2J|772H32#{6;c-Xe_epGQ64EbUmV
zXV$^<YI5usybhlCq&mpw2mfBu=jmrD>D^+C`}avPJ`UbR@;bPAJ>M+tSlr00gXh)c
z*e`e;Jnu<$xX}d&^Yp(p_&hzSq~8l-+`l7^@p16`LKxq_H}-t9bmLk%Gb{G$gD=sA
zQ9s_f@A(xF-vZYJHo(q#{gLOxFyI%z>)_Ba<kw;R!RsNdTMzkF7=QR@HNJP=T)HM+
zXJ>?)0Abxq1qTHm6K-OJaYhP%g>Z9o5%MPrf0gj>M*L~S?+bAatiBdd^gZ7R_3HVG
zUUk67e(3oFxgQvR81NOnLO%9`i*Hp^`r&bXAIo6}>g!ioJyBna$NFv)9_zKG@OKrw
zR`7koW4-niKK7f<GwuiMH<o{(@8B!y+w=4E?fH57_WV439Y0@R@je{&wYZ?K#RYvu
zKI$vtsIQ2lz7`ks75S*I#m!fEyraG%f4g;0lrI1rhWwSY?n#s3BjW!EfBFvj(DOO|
zg^Tlp;?=8+57%wK;TeO)zg;*SjSQa=KgL1f*ThG?Se#PfuMr=M#P}nHzeRl9JH{U?
z{LOY~>o3&V;_-fAoh>fZ+4CD#&o3NbsI%qc%{j}nDz5uD497Z)IM!LjvCbA3>MZiH
z&KAcnHZRs$<YS#JF4S4%Puih9((xyA7Qa4oxKL-y7wT+r<Tp{BEgskT7F_pyHZRsq
z#)*G5%5z)cm&Csk@wXKIi1;r>{7r>FCVo;kg<rNeu`~eqcMZ0`(=&&oiQzTzljB|C
z?-4&eUgY?@#83KMi9aGfK2Wke?<)L#;wOEs@XOY{^G@^W-i5PyH;JFtN9J!6|HUZ(
zTW2zVM11`74V(983V)CI--`JA3V)yYX?^5;%Jz=!pHyFm$NjTOeEf3^%YXGu=8uS<
z)>G#15kIYu%-<(|y1z32F7cD&UCF;}-%;x0@VGuB;-~v7^H+$U<}dSS?M-{W6^?iD
zJiJBxG#?iy&VQTuX+OyPP2#7=m&~6N|C>?&tewgHyTr#oh_Qa#Q}}D`hEgAg$NPJW
z_-Q}Od^|tc_-TD){+#&f{>uD{_{sVIOy;k(ca(e_9`Emz_-Q^eACDIsKe^tW$^6y!
zroGONBfVy44o~xOxS+3iy<VaCX+CoNiumdAuJJdN{2d<WztTEf&`Zo8>u>Qie~0_{
zLtL?=#WRB$#tVxY{-*@m!fGSg%C@lcU|#mQn)%Uh`zEsaaN7nV4o_|FoGSad^Hpdw
z>vrIuR%d3#B*RC8n{FR)`4Mhd9_K)5{w#aJSoyTKn|s7=Yf-po&ZnQ(=GMiFa=>|g
z?hwuE-nBjbyjHs|IL+Jj#HUVMz@~;*oqj&G(%b0xJ4Ay^1EUp1Sessx+I!n&gWrDY
zd`)WW+kIte!hITa&Y9P8I!$;C$kK%S<j%`Ap%|B@$svTJ!Ht~Sgg)2-d^1ZE?$cm|
zoO!t>JVu@-+$VQlt_j7sG)+ua{b+DCr#8KZS!?)amL}ZCY~{?$HQ_PxG~qtE^Kwln
z#-(Y}pV0tEZjOc<jP7h=u`ErVZeFelkCCSd_h~TUoL=CXP>f5{q-OQZ&C%Ar`{FT7
zZ8}qQt?jSz+$VQlt_hEkrwRATotJAuF)mG$!@ho4$lM%}8VnvIPZRFbV6>cm<(lvq
td75ya+<Cbs6ywq~X}$(cc#J$vxKD0vxF$SCo+jKUcV4au#rQfj`9BCz25tZV

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0.pdi b/examples/elementwise_arith/air_project/mul_kernel_0.pdi
new file mode 100644
index 0000000000000000000000000000000000000000..fe255396317cd13639d937b59f0b9075ebf0e6a4
GIT binary patch
literal 7856
zcmeHMPiP!v6n`^2Nv67OH#L?qNpYG42_=V(8zF^&v)u?$BW_KVfQP0+Ed_<xgZ5^`
zfJH<CqNEiu7cZWc9(stENP7?w$<d1;t$Oe_+fxtK_<P@<`QA*D+UTwHgU$Zl@4fGR
z@83*zXMZK4$=X2orT5-^^{4N?y-XC42W|*2ejme&*Jleq_GqO4K0n!P#+8_=k~En2
z{g-v|*!NfXJh}hO`Ln0bzkTN1S?93d!94nM`1q08E~3});&-{W%sK_KlgO86NW1@o
zSN`1kD}aN><jXjL=p<q|ceS<Ejl+DeL=+EZxv^Z?ILc=F1vx6nRFFd<cY8Vm_TAyV
zYr-C&z@rvAN}Ky!0iHs|ZoN>7DIJ042kN406>6<}6eA>Btx)wQyxiPco%Md%H$e1Z
z*Xk_&@?rwd;kAq3cx#nWwO1b!MJEnxG*S)rcmBQn(x=_a&7Zq}uXmlfG`jpa(eme6
znlOwGCt>tOvX5x0zS+C{{yRq|=lHzI<Q5unlL#lgb)o#ar_Rvw;)Mf3Uav=QeA4L+
zaQx8g#rl?CkPpQp#8!6Y&t^8aU;D%P3DwMbh`!T%1W|`8$a0Zrn0G}XnVhjd;cJA9
z{pvi_KA*Y#)%An^e0sIi`smK`xw~B>evp(*e%NE|Bccj><P{yi;hIFz<?CG$4IAwZ
zkLb>po*O$Syocw;&WV-noLJe;iIpj?Wew4DH>zhY^Ou)tBQJ+S&d=M^59jTHoHyNn
zJ3o=>zm@HLSlP~pl__o%=B-|sx4cXnc{vnve%?+zoVN#Z-gN)%{6wbzR<`qDWjh~M
zrg)|>Zzl`$mX~QGFNZ?T&)X{x=k0-<H{E|bKauIbmF;|3+0F;cJRkP6oDW&O@IRS9
z+cD-<@_^E#8LkK%3tSaA5x6PvoWSz}F9^IO@QT1Kf!73H7kE?P)L@-oEq4v^XNY`$
zqar_t<J94*z=^<3f#(FC7kEM7C4pB2ZV9|5@Vda80;dM+{2GP)wCws<7x_6Hrw&&I
zP6TcWJSXtHzzYH|3A`e3OW-wu*9G1bI5k-3H&e(@%Pzl@B0q=Y)ZwbYiNH;P=LDV?
zctPMLfmZ}>3A`roy1<(Prvg`~-{<dI>RJP~d4cCo=azRlQhy+LhRUXinfDA%#I;Zr
zd@T5i;7RaJ%$D<$zXmKHi&?A)dlGyTJ+$_T=xtT-vEVC$C&4#SE9WQQQ_IJ)p2D6C
z-_7?iv+VcH4lmTqI(&L$1ZzdK`E1ZB_gVW`*(c7>px;gh2X?Ygw13qnx#^FugTvhO
z9^A8e@vK)2ws{4R8LR7{pesY>tXRkQbzW?{J-_-M^JDvj-R0Kq`6>VR$Paofou9@Y
zWX?Un=Oe~Czr3z&KRc`ZoNjx5qkGJc?Gv1uU%Th0{NE!#=&^Ku8oT`TwWRaQ>&o`K
zsLIdjw&!<rkNL5EqGFPNcF#}wzej%1W9j@f_8{y2>3fdOFRv@xXFMuDr*rw?;e=Yf
zfx6L=l?`p)lDqZAZXDRb4>Pc0f<0uSUjDOf)whxlq&=04z4x)&RoqatPpp6cSkY9;
zMf=$LFJ0NyKi6uk6LDa1t*Qp|231GMj8z>Ax-w)uYf-wepzDEb?*Wy9t{?IfW?v+9
zJJg@&3u~A9Z`F(S)b_j7Gbz}kU)tW^xnJAtOU8bedKT@`FZCFUXqSH7O%_f4IZ0c8
zg>C&OI(y01cpR9Q)lEXSy-5nXX~?$5Qw3cB+4hEnZio8&d#t~<zyJDc`}?oIw!i=S
z<3`RE``7xbf5K+|6}B}-{e4mYoDTI@&$xLX>Yvl0{=TSxPKWyYOqgpbr<+Ji)Iqes
zbJ5ercOHX7@XrygF;AM;_;!h&@t!=Wc<SRTMeQB-`m()M@YW9rJ^_zBm3<)i5%4%g
z@uA?y!6PEYmjyqTmV6O2=cD(67cn!xrsBL^@<q%(KHw<(*z3#oBIXPu<_sfduDP;D
z%o&fExu*JD!6Rm0#GKhz(o%rOz{);pmv~C_UhpDj=GRQjfsJ{H`Z9aO<}mn!#?OS{
z1Mmk7KQ8zX{C>la3BC+Ii%sxBI>^7Ps5qncAVM?ogU|Xd_$lzNFV21vd{%Q|AA-jt
zc;$aq@YCS4nhQQiYr;=0i)ReHi^uU3;PDfM@;}sed<Y&tX(;}b;HSVpX838rPlI>y
zI6pz!5cxJlJPrIFqy6`{9UsEp#p(Dd@Gc(5PlI>oI(`;>)_37QNM}SmGg&+#cz3Sj
zd%!#Yj<2SJzUX^4!{gwcpTUfu05AHU@nhgc-!r}m{!vq(k+$<Q3;vMdrvyKe)<itD
z%+C;bS7*m#9z;BjZ-RH{Iz9%U&41hRBWXkUX=LX{;GG}Gqc47T-L)OxpAH6PlUKDp
zm|edSc#&_$_keePoP7-5^<DBc;lC#Q)ZBI5c0A%2{u|0({clJy4G5IL7}=L{ka|HC
dYSb$C%nZADj`SY$yOfjl?*>lR&vjY0>mMR!jRXJy

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_elfs.bin
new file mode 100644
index 0000000000000000000000000000000000000000..9f902fee90c41225fe0fdf079a639fbbc0bb5606
GIT binary patch
literal 2656
zcmeH_KT8}z7>9qex8ClBnCuCWK@!-UBv@SGX=S&kSBMglm_|sG*w~~n79m}V2wH?R
zlCTznjs23sz*PY)ti;mlP{{|dv<Nxh*_m<QbN3S*H?Xkty#HpH#|1FgdI>ndHhs^2
zC*7(5JmFG=RyYpYrpvOR^Him$pJlRup@8{}RfX<6BtDr5HMjx^(877eGYokC7`R+0
zg}ACy<QN_f0wkvbAsr9x0FxKg=%U+R7H5?j@YQQC<7_bkhWvxi`{GZ~NNaUNDQ8FZ
zUwzUYF8_~Te6Pl@PO5*K-utaa{0NAD++LXy<!I!~pHT&vZ=R3EUp_u*y;JjA(Pf#Y
z0yN>n`u(>eouT&|>kruev?*7<l|@aRpW=F{c;ejIkA!uoEW3JVE80`9`_cRe(`Fu>
zzAWz1sr@~=<pwaJs-zEdXR40?dRvp<UZb$1X2!dR51qBEU!`R8I$r(f)g3n~nfp<V
z$(KN&I;tgSj<QFB+&T0lCQS550N35jBkPQ+yXy>R))~&MGn^ruzI`OyFX%K<C!L-d
zd4_^KpRvrxsezAu>!bF9J0*<+oLN^mv#xN4aIV0|W`U1+hEAUOjAcH~4t(reAGHJA
zDQO(w%(}vvb%is8Zwh>TUEpJ$p_6AmW0{Xj10Va=N9_Q2N*V_^v#xMvT~X`??RlZC

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_enable.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7cc1818bce4d6ce1226fc5fda519967a4842b99e
GIT binary patch
literal 104
zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>))
B1+xGE

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_init.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d4549ba181167c6e6d7475f23335e34fea2c3376
GIT binary patch
literal 6032
zcmb7{&rak<5XQ@%hINw3u1Uy>EJ*QcI7AT#M?)kINHH2BaX`vM<iIUF0*^qXJ?)8e
zF5wYI$_vaRaLt7vMY;Fvc_s37+iiD!;}I|tf4=&CUGA#tw!nzU<oIWiROFvEk-z_%
zJ!~W*FA@WSPcd@He{sy=EE%{TBKEES5p+(vAYGEKNKZ*$kiH~+Mf#fb4e1%_Iq6%{
z_oQv3yPk}D`c%hUV?tlVeouYUx*%SXu1HTwUy!~eeMS13^bP45={f0J()Xloqq{y$
zdiqqybf1o>Pg)mN>ymUudP@3&^d;#l($}PKNY6;mN#BybCv6+u_35mqPjyW9=@;sg
z)`iu&Bwdl7lD;5)N&1TPHR&7DGtzU?x1{e$Thh7gr0z}C-Y@95Kj1=K_e*vRasT2A
z`KGw%NtC~$_drR!Af6K!;#1rYonOBXO5z3Gubj9LpW<BU{LA+y<^`S0oYo8RDdrKc
zuV_9c@q&0xTpGTS_%ZrfttFZNDYrPwi8P`Px$pNsn)N<^;r&%-Y<~OqH}kU3U)A~j
z{NqLc{b^o@^RRhW-=}a-^G3%#Fo-!FJksMCBIYU#o-Fco^&UO5Ugxjkt^40s*X#2?
zQZGFCgnj0BwN4NVJu!$m9X!(G86u|Z?ecW>;_N?bzRq9a$@;UpUZ4Mwdf~w*)RN!T
zTGw0O+2D~L&k!+PZ<nX5_w6(5b^gllFaEZ=UZ4Mwdf~w*)RN!TIzg<PZ~YDo9_jH6
z5!3Z{dDfCo{WC<~82>Vspqp^>qxyLCZUYYv{=)g)M&yk)HtOi1`aSl`pz$Z~{VHC>
z{wm~8zhkS{>@Pz8!5=I0ho0@PV=wrGo|TPu3xmfX=5+8#k7tP3R^-WgJe!Ecz0Z3*
zJBWQ9<=Kinq2IpGzOohbc{t*JyYq{${(e{F$2@C)+CR@F|Df@&=r{W@PpQw>ig|vl
z-i6*{KG=BId(6k{cs_@S#l1i1@f;%-&#>z8oFW$Y{-no)v!Lz|&O*DNXg+CuAD%p)
z+W++V)c&W>r}jU6KIt>&<BoKDH!vTsX+9~<XUT*4q%@x;59X87e3m?zPwIlvU0w1V
z+JWF-=&NwXHW&VX<5p+jKZ(o(mzsC_!$5vWzIoMh*>-QrcKx@>wrfeNOYm3=@d_NZ
zw|+x>4;)3bJR^PpjtW{nB))G4Db?$G()v+vO7%KEZ|nOoNU7cwf84d}i)35nL%m9)
zUZqj5n{~S$^(u~f-3(ifdKE{#Db=fZZU-ih)>n&xd&hVAz^Pux=TW^Tu6I|q)q2#U
zHTbJ2{~>V${xaeR#53^EBfd|32(IcOZtO_bqxD;h5z6W6t-#g!Bz^*}&xc-r46f!)
z>of4z(euuUpMtA-6E}8D`NwMi``~*2njeCH5<P!+p?L=WRm6WKegghw#7~Kzg6sY3
z{KihGe-qmOL|#PscNUswSg-5T`~+O@U-MINy+6&*!PWVs=No%Q`#)3r&%pKmG~Wc*
z&)2-Pqm<64x~~V|I{!%VLvT8uitmHd`BZ!g{#n%jy@k$y4*o^NPl)f?G3|e>^6!G{
z`D%{yOZ(S+3a<C3c>%8O|3dRUJE8m&wZAQJonLdDk5s*%7Mkzakr_t)DHkL4{@Vhl
z{wcl*uJh~l1-L$+%*XWnG36iY_v1oy)KAZ!`1S4I2o|Ro1qz~3EajN7i78>jEaZQL
C^4EL-

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.elf
new file mode 100755
index 0000000000000000000000000000000000000000..c7c58092079ae00dffa136a058d3c0d77b6c8d5b
GIT binary patch
literal 1672
zcma)6O^6$17=9)jldSlYRxE=*FsvY}tl3Eut?6lYNu}5<tOsS$!|eR+?y$)u&P>*=
z9yVZ6LGh-<gW$!Bry$tFo`hEDNs6algzlnv@zCCSkZzwhGn1s^!H0bFywCgdz2DDd
z4lcCbP!t6jawttyozC7Pz#O>_4LK~Lh&&{Fz>(o59Wt1TGRIqg#%>my<-)8;?Yx2!
z(8TTe;w1XwbHMSNa~kd}F>-@Fua+@l2MzokmZ2Z9+tJD4hH`6u0r<FhxPe=j46dMb
zb^m+i=W=yin@p&xt7j5xNzXrCxIbL`^3-ts*HeG26s^yy!>57aSK}=uNl?!iCG~4#
z9=N!2`<dav`_DHwrC-xH<_3-cc4&u>y{C+OB>GDD#EibVqS{{+lm%Ho;|U}_%Kb~9
z)|Mby86NH}-iPXw`Pkn;E$zqE7nDZ<we%f#c?G;8LM<WPGwTiT&XDFG_887d&*8V%
zp3L>`{5UuI^xkm$pW;$3Z_K6pV@;Z`CRZwxLTy~1UJ}%At`$`*%7qix72w_p(DH*_
zFFdccT1~CmsOS|PaN1U|t?hYX<adMAt1a%taW6W*yu9tlJAJ$2bUVu%e%lZBm)q^V
z&MTJhX?D$V^@dxkd6rw#8?V)0cRlBvTX$<!yJpv&YQu9I^{vKB$r@&#T+Re}+&y`U
zYJK8S@GR}ojDCT(HYrjty%RaTyC?Er*qn*4N<N1{-)ShgL>u|!c*-awy^N`TTjGPM
zepfW1aY(%_f^WJ&>OUfHa_$~k;-8Q;4uR;;$(jhFOg$uPA_&nxkTns6=%2`%2txEP
zWK9Gi`ZuyBf)M??82RKME9*HHZA|rFq6brrO6={&kZbIC0gdQ#C$?<zP85fUHDSxy
zH6#B+FXH>sdS|WaTW!yY*Sc=M?V-|IUDeHM%4f-D>q6k_aKgA+aU43f`hjEG-nJi9
z;dDdK)XlT7`&;8jRV*E*8O2r@!;E@~W$sz+zGpVqL^``LJC+}qe&ELt3#Joy!zil~
zziEY`bvg4SVJ~noE=q%A?ao#vLi}9EtMaJJqc*-yLyqnLKw*-M<b-+DwB;L{p`04>
z#)w-_^?DBYAhm_Ec_g2q32smf7?f|e%)u`N@16f~4*A|^xX8e{OpvoBEYP0KJ7C`5
U|B?4{S~EyvyKE>uvw1Rp0HPHLbpQYW

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script
new file mode 100644
index 0000000..fc4f0cf
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script
@@ -0,0 +1,72 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+/* No tile with memory exists to the south. */
+. = 0x40000;
+. += 0x10000;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+. = 0x60400;
+buf5 = .;
+. += 0x400;
+. = 0x64000;
+buf4 = .;
+. += 0x400;
+. = 0x68000;
+buf3 = .;
+. += 0x400;
+. = 0x70400;
+buf2 = .;
+. += 0x400;
+. = 0x74000;
+buf1 = .;
+. += 0x400;
+. = 0x78000;
+buf0 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_2);
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll
new file mode 100644
index 0000000..19c8134
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_2() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %17, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %17
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf2, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = getelementptr float, ptr @buf1, i32 %3
+  %9 = load <16 x float>, ptr %8, align 4
+  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
+  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
+  %15 = getelementptr float, ptr @buf0, i32 %3
+  store <16 x float> %14, ptr %15, align 4
+  %16 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+17:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.o
new file mode 100644
index 0000000000000000000000000000000000000000..43b8281695feaa671e97fd88c5e79951f78aeaed
GIT binary patch
literal 1000
zcmaJ<O>5L(5Psh7b~lwOX~7x<Vd+WNHe}U{7s;+$t(X-Nltm9qb{j>ET}-o3PZjJz
zM5GrbM=#!c>S0g)1=3@W75oAE2dvNJtLYvbGMRa1=ACEWNw!v;J4z{JqM#KJa4-dw
zn7k1eTt*cqwGwJ$pYJ^#yEHX<#yTvi`FT9<s>7K@;C*%6#o>J$nAY|-zpH({QN!;g
z)`%EJYa#KLtn%5)->CC>E_(Q5?pL$werQDJf#^$-)L5`J<EuRbtTvA>Mq6(#wQ)p3
zzAIz}$lwRF&s5Bj;O*dC0UtDt{-=sslo^;$MDn{+6L`MHDf3A=>nUzIc<Ox8kNh^S
zr+l2gqR#N7_KmmPPPmAW-Z>5A=0ffPW&Anr5nFkzgh%vu3fLE4Wq}gzrLN7`Jipw{
zEJdqRpHAQx6IfuKdlV6~T#a4!0<LO_FSn@6PeJ)et(RjLCOH-Vmsrfu+YDi@4@1wq
z`{42NM(7S+!Ev5++Am$-8-$(V+Q|1{ZiGST_G#a~7TX^^zXmh#d^hf34u*l(vwDm<
zt(I68G*ZwIU=j43mG!lgHKil`&s<?Ly+T~p<=msffo!E>MbnkQF4TxCI~BQ2ovf2@
xkV#f|ivd}eohEehuF{wOGLPhDsj~LMal9*nLasrdEpm~%9U>{8rb<8I{s-KramN4v

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll
new file mode 100644
index 0000000..0eee48f
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll
@@ -0,0 +1,72 @@
+; ModuleID = 'air_project/mul_kernel_0_core_0_2.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf0 = external local_unnamed_addr global [256 x float]
+@buf1 = external local_unnamed_addr global [256 x float]
+@buf2 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #0
+
+; Function Attrs: nounwind memory(inaccessiblemem: write)
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
+
+; Function Attrs: noreturn nounwind
+define void @core_0_2() local_unnamed_addr #2 {
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %17, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %15, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf2, i20 %4
+  %6 = load <16 x float>, ptr %5, align 64
+  %7 = getelementptr float, ptr @buf1, i20 %4
+  %8 = load <16 x float>, ptr %7, align 64
+  %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
+  %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8)
+  %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60)
+  %14 = getelementptr float, ptr @buf0, i20 %4
+  store <16 x float> %13, ptr %14, align 64
+  %15 = add nuw nsw i32 %3, 16
+  %16 = icmp ult i32 %3, 240
+  br i1 %16, label %2, label %17, !llvm.loop !1
+
+17:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 53, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind memory(inaccessiblemem: write) }
+attributes #2 = { noreturn nounwind }
+attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll
new file mode 100644
index 0000000..7de74b2
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_2() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %17, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %17
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf2, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = getelementptr float, ptr @buf1, i32 %3
+  %9 = load <16 x float>, ptr %8
+  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
+  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
+  %15 = getelementptr float, ptr @buf0, i32 %3
+  store <16 x float> %14, ptr %15
+  %16 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+17:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.elf
new file mode 100755
index 0000000000000000000000000000000000000000..f1be4eefad7b89482558fe9d6e292826c0748801
GIT binary patch
literal 1736
zcma)7O^6$17=9)jldSlYRxE=*FsvYJS+g@u>^42kE~$vk!g^2^J<LvKW_O28CUG)Z
zw|Z#6qJrX0i3h=pmtG8lJ?u$ng`T8%>ZQ<K6fZq=FH%rkpEom;q~gJc{oeO|-k<ON
z{xW;{Lj6@mQIH~o!kAU)96kcfP*<QKgL&kUg=F_RQrx0L22)Ywc;mOoO=HtKH%$^d
zt6&J!aCbI8j=uODaQOO+hWiVQ++@#dMGV<N1ON7m&=1({;CO#sxjj1ve3;)~$L)(I
zS5UaN_pS0{(HPan6RPUUsn}Z3vybQg9;|+TVzBn}iC>rV_9w>RX<+chXiISt)Kg|b
z{nDHTUR%EV%;567&(}7jU(Gz^1`YsrXoioyql|jQ{7U%56u-5sHa^QKbFzNMV@Q0M
z`4>N_EkLqTJltEn57o!>vA>CO(vPdpDUSkb;T!Jq3V20?T0pXA)|=p+A<5tGFr1N|
zgRieYnd#jBerEXbgTdBc`Gri@oJsb_nj~M1uT&<5TD>v3B&c6q&#Rc13ny+Uz=LC;
z?gu+wcwVd5Yno9l=_MU-ns%_I?RsI?ZwHB2o8OM2PWSxc;+7w6_ZlUq-CA7tn|`pj
z*lg~$UbcNtYm^;Vue#;3XS-#+dbV=T^_(TQ;+Bm@xlwV9s^?ZKo7I=%HB3t`XN)Y4
zj-Mi18+#NyOM5uQU!bjxixf=mL{9JMSbhr|Q}Gqa=P>Cz2?ZBvBO4!25xKaRIpJ?g
zyg%V@3nw%Vsc#73OBYD}2h<y!yF)GUkEk^cf$&eMH6cWid_b)UA%tJ0)`Sqkf1uWc
z5W;_=)`Sqkf1%cd5W??>k>$G~WIcysm=pfH@cx9~6P}HK;gWwO+??<~g!d;LrO4Zh
zAlKgV0vg>*t;lXrZ+D|Gw#M3acC4=dzSrgZs=u{b^X;bRM5}GL*Yr@TudL{nk?1qj
zR{cWY>TtryC^-%to4vrX8s3&47;xI5XX#cMjo#)fA}S)vu&i!mhY_r9C$_9zyV>)s
z+Nw}z2Ug4W1IrKm2x7r<qITF#tHf{FVQ61UJ#p9zT#Sm6;E21^l`#=N*KtOKAwm~%
zPQ;Rkv!mBw$|?RYD9n-(pE!@6wtS;g6ca;UoVfLbuVsMu5?fR{kK{8n#!ZqTgYwOn
vIpifs4!;+W{}(Ct7+6XPb-IK(+S7U8G4Id+$UB|X^pn_CHk6*}JgNQ%b><P(

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ld.script
new file mode 100644
index 0000000..6120a88
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ld.script
@@ -0,0 +1,78 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+. = 0x40400;
+buf2 = .;
+. += 0x400;
+. = 0x44000;
+buf1 = .;
+. += 0x400;
+. = 0x48000;
+buf0 = .;
+. += 0x400;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+. = 0x60400;
+buf8 = .;
+. += 0x400;
+. = 0x64000;
+buf7 = .;
+. += 0x400;
+. = 0x68000;
+buf6 = .;
+. += 0x400;
+. = 0x70400;
+buf5 = .;
+. += 0x400;
+. = 0x74000;
+buf4 = .;
+. += 0x400;
+. = 0x78000;
+buf3 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_3);
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll
new file mode 100644
index 0000000..79b2ca7
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_3() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %17, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %17
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf5, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = getelementptr float, ptr @buf4, i32 %3
+  %9 = load <16 x float>, ptr %8, align 4
+  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
+  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
+  %15 = getelementptr float, ptr @buf3, i32 %3
+  store <16 x float> %14, ptr %15, align 4
+  %16 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+17:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.o
new file mode 100644
index 0000000000000000000000000000000000000000..4343d406be95fbbfffcd92a0f42bf02db06255c2
GIT binary patch
literal 1000
zcmaJ<O=}Zj5Pse!O}0{HTd*#I5PH(sE*o19UStzfEw~Xulz>9W#}>r3!R-d>se(O-
zi1ecD(Tn$<ddR82Kzi)4f<Hk2fc2UE>NW?5%scbU%sbD#v)Ng5?klB`ih`Cyz|j;?
zU~)AoxQ;T;Y9-XfA<qLH`!p5#jdfU3^Ye7vR>w1o!29yJjpK(lFs&VIe^-Zkt%Bc6
ztPwGc#zO2ZS;dQ`zhUe1T=@9M+^>4M`=J(I2Es3SQe(l^jIZ_#uwFm88t%Ng*2D=3
zd6viukiqk3pR0%?#=HKd96qWWy-y{zC^Il0i{$sGCh&ZPQ)ZKL)^pr)@!b8mANg&p
zBz&B{q%QEL_KmOHjk$=Byg3PE=1S%PMf^D(5L;QSfEV=la@Z4JrGWyTB(BZaJipwX
zG)1dZ?@r)*6IfuCdlV6~T#j7!0&Z%tFR!S|pMvs{S}jH{OmZszUm`IBcRPT&ISgF$
z!K0_kTS2$~3Xb!v)qL6Y+<wp+4o02}b1U$JZjbiuYq7o2#%(!P<ye8~yIwcyW%h@@
z+p#)~IgN(Zxs})&ZO2*L9GtBw8R38C29wDXqOvyU9_22iD-|o6tORzhMqJscNQ*jI
yC*L5Iv~HaNS(lw8bn;3}`pZ0$o2AOybJy`M4|2I0eYVI%>h_4Fe3~l#g!><@G;zKF

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll
new file mode 100644
index 0000000..ce97114
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll
@@ -0,0 +1,72 @@
+; ModuleID = 'air_project/mul_kernel_0_core_0_3.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf3 = external local_unnamed_addr global [256 x float]
+@buf4 = external local_unnamed_addr global [256 x float]
+@buf5 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #0
+
+; Function Attrs: nounwind memory(inaccessiblemem: write)
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
+
+; Function Attrs: noreturn nounwind
+define void @core_0_3() local_unnamed_addr #2 {
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %17, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %15, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf5, i20 %4
+  %6 = load <16 x float>, ptr %5, align 64
+  %7 = getelementptr float, ptr @buf4, i20 %4
+  %8 = load <16 x float>, ptr %7, align 64
+  %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
+  %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8)
+  %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60)
+  %14 = getelementptr float, ptr @buf3, i20 %4
+  store <16 x float> %13, ptr %14, align 64
+  %15 = add nuw nsw i32 %3, 16
+  %16 = icmp ult i32 %3, 240
+  br i1 %16, label %2, label %17, !llvm.loop !1
+
+17:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 53, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind memory(inaccessiblemem: write) }
+attributes #2 = { noreturn nounwind }
+attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll
new file mode 100644
index 0000000..c86e34d
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_3() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %17, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %17
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf5, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = getelementptr float, ptr @buf4, i32 %3
+  %9 = load <16 x float>, ptr %8
+  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
+  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
+  %15 = getelementptr float, ptr @buf3, i32 %3
+  store <16 x float> %14, ptr %15
+  %16 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+17:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.elf
new file mode 100755
index 0000000000000000000000000000000000000000..2158287344d4e58ea0bb86865df912b73324cb82
GIT binary patch
literal 1740
zcma)7O=u%!7=9*=Nh<ziS6l{vV5lH<sp*V~+s&S)B`e~lP!CGc!*nt;ZHFe4IGNP0
z9yZ{jg5phy2f>RM5e%Yx=t<ZW_9V-m_9ASHix&^Oo_bL0^JZp}ta$LD-}k=H`}4i;
z_mk<R^Yxb$ML~)T3S(BG@%smW8OjPYWH66BvXJOLM~WLX$Y3an9IxJq+|)NMbJHZz
zvkHbl4Yz0WWAC$10f(>5Xt=vT&vn+kQpAuIH1Kb~2>pQ74vzQNm0PoOz<c@qb=<mO
zvV+2vy|0uXi^eE7UQkt6PQ}`So_#p?_h9vt6N9y%PyD)^w?8xnj{}2GM_Y=6pq?@d
z>SyLG@bdEQCkB_^db+kD^=jrJH*f&3LNk2mO=VOg=9j}qrudCzwefLInUntMk0JhH
z=3o4zwg8Draer;G-j^S*$NDD9Nj-L-Qyv7=!k66TW$=m!wSZ*L%s0V1LlVE=p*tfr
z2VY!$G}F2J-OTWVqrujn`Gri@oJsb_oFrb2uT&O=TD>;8B&eTX&8wJ~3n#8Az|k>K
z_k$fTJg3#`HO;7&^pXxZO*`1qcD=Ccw}Zs0&2L9hr+aR3am$akdySIQZY{3+O+VOM
zY&LgW&)dGIHOh{wSKV^iv)!^@JySXBdd`wtamz-d+^9H4)pM(r&FXV;57RT-86%5(
z$B&V%jV%hEq&}SD&rsLKNeU)+BByunSbhr|Q~ni+=P+qI2?ZCZBO4!25xKaQIpME~
zzdzw`2q!cS$!`eZOBYD~yObMTyF)4d_bD|Ff$)zgH6cWid_buQA%uTTsR<#3|3Ik;
zA%y=#sR<#3-=x%p5W;^ECChh1NI!?7m=pe+@cxAVE<79m!X^F>;pT)N3GYuhN|Co0
zL9V^!1ys5hTan$M-0ntUtc|7Z>{wm@9k0vxReyc8=G#rriB{Wguj!#wUs=&DBatsq
zTJ`gRtHTK+qvSX=Z1w`jYIs|IV8CgIo~2tAG<ut7Wm=Ny3@ob~*<l2$+le)6*KYPa
ztF|iC*@4xv{lM}AKZ0nmoTwdk(=4%Db{N_hQ%mgj0vDsC#5v;bbY@J%&OTn0Y1CU;
zG{YFpy3Ezldobl3|0fj2$%s##$4_0p(<zFHBCk%&dcxN-z}tx~B3(z~=^EoY$&gO@
x?#mkT8YG6_49NeC6nk_mrGzqVVUGHA+_#MT>p$X7CprDZ_YDh5&2*hq{sTMI5=sC7

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script
new file mode 100644
index 0000000..ddda3c2
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script
@@ -0,0 +1,78 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+. = 0x40400;
+buf5 = .;
+. += 0x400;
+. = 0x44000;
+buf4 = .;
+. += 0x400;
+. = 0x48000;
+buf3 = .;
+. += 0x400;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+. = 0x60400;
+buf11 = .;
+. += 0x400;
+. = 0x64000;
+buf10 = .;
+. += 0x400;
+. = 0x68000;
+buf9 = .;
+. += 0x400;
+. = 0x70400;
+buf8 = .;
+. += 0x400;
+. = 0x74000;
+buf7 = .;
+. += 0x400;
+. = 0x78000;
+buf6 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_4);
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll
new file mode 100644
index 0000000..2552e6c
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_4() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %17, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %17
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf8, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = getelementptr float, ptr @buf7, i32 %3
+  %9 = load <16 x float>, ptr %8, align 4
+  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
+  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
+  %15 = getelementptr float, ptr @buf6, i32 %3
+  store <16 x float> %14, ptr %15, align 4
+  %16 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+17:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.o
new file mode 100644
index 0000000000000000000000000000000000000000..2fea81b77544b9d8acd307af0a8527ae8aff2af3
GIT binary patch
literal 1000
zcmaJ<O=}Zj5Pse!O}0`cEm#*p2t8?RmyH%bF0zTK7Tky+N<blOk`=_Y!R-d>se(O-
zi1ecD(Tn$<ddR82Kzi)4f<Hk2fc2UE>NW?5%scbU%sbD#v)Nj;?<u8_ih`Cyz`+zy
zU~(lYxQsGRY9-XdKF_@hc4?~e8&_aT&Cla;M;*>A0`JS?4h|n!z_hlv`CaW->Q($+
zVvUHdHy2`W$t<2N{SDin=fX!n=6*HG&WCz<9tgkWN%aLw)4y6Xz*^(zVz~9@QVT~U
z<XIvsKnBmBeWoIg7;pRMa`>R3_db=>qRha2ERx@yn!xi_PMJ;0Sx<4x#Z&j=e&n}s
zJ>lc@C3S{3wQqdocFaYD<jqMSGZ!)sDB{oYfY{7p1w5y}lf$0)Dh(9yIB_k;=K1Ar
zrzzSB_09yoJAnn(xJMB&%jL*rFW{;c`|^sK{3$3OskLI{!XT&O|0NP5a5n=O>%+h`
z?mv98yb(D4SFr7;?bb`jbNfMiI2d^@jE%q#oF47l*J6947q{fNDaQ>MzUw(rFQY&7
z-LBbPfnhhBX7^fR>$F{ab$xKMreuWwnJY{tPl(DooO_hJkgim$XtEMmxf*e0ry_0Y
zWSxA2RMNUN24r1UlBkeZV$xsck=!g*)}FhLcX^P@)#<ZECQ`RUB<0go=_lO(0J<@8
A$N&HU

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.opt.ll
new file mode 100644
index 0000000..e15e691
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.opt.ll
@@ -0,0 +1,72 @@
+; ModuleID = 'air_project/mul_kernel_0_core_0_4.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf6 = external local_unnamed_addr global [256 x float]
+@buf7 = external local_unnamed_addr global [256 x float]
+@buf8 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #0
+
+; Function Attrs: nounwind memory(inaccessiblemem: write)
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
+
+; Function Attrs: noreturn nounwind
+define void @core_0_4() local_unnamed_addr #2 {
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %17, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %15, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf8, i20 %4
+  %6 = load <16 x float>, ptr %5, align 64
+  %7 = getelementptr float, ptr @buf7, i20 %4
+  %8 = load <16 x float>, ptr %7, align 64
+  %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
+  %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8)
+  %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60)
+  %14 = getelementptr float, ptr @buf6, i20 %4
+  store <16 x float> %13, ptr %14, align 64
+  %15 = add nuw nsw i32 %3, 16
+  %16 = icmp ult i32 %3, 240
+  br i1 %16, label %2, label %17, !llvm.loop !1
+
+17:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 53, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind memory(inaccessiblemem: write) }
+attributes #2 = { noreturn nounwind }
+attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll
new file mode 100644
index 0000000..bfe891f
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_4() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %17, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %17
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf8, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = getelementptr float, ptr @buf7, i32 %3
+  %9 = load <16 x float>, ptr %8
+  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
+  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
+  %15 = getelementptr float, ptr @buf6, i32 %3
+  store <16 x float> %14, ptr %15
+  %16 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+17:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.elf
new file mode 100755
index 0000000000000000000000000000000000000000..680e4695811b0567bb930a499c2383fd1f5d7bf8
GIT binary patch
literal 1676
zcma)6O^6$17=9)jldSlYR$2ysU|2y`S+g@HcAK7NmsE<)!uFyI7G@_iv%AA4lXNm!
zw}NcIf<34=B?^KUZyth&3VRY-uqWxkLvMCVJ$NbYtq1A$c{4LfDjs~8?|a|p{rTSS
zn|J2hKU;rKQ52-epfF|?8lz``8OjPYWH66BvXJOLM~YiC$Y3an92fqI+|)NMbJHZz
zvkHbl4R>bqWA7WU0(ak^(ePk_o|~+Bt%xBjXy9{Dgnr0sherqN%I(=X;IsU}I&Pmg
zxr4&By&sidi^eE7UQkt6PR81To_#*|c(D4_vBBE!$L=lX?W@M%1Tgq|bfh>4>PfSp
zeq+u8=a%ohJlMbVT5Ut>)y%s*z#+g2&G5O8lu?bCUkP8B;<uL7#uqtdPWDfK4Dk;$
z|Ki8B1xQqir)!J#sr-07);Cd3>T&lu<yk;2{J>LQ0b4|<1te!?z6tgWN&G>F?u^tN
ze1HALOy|MRGsDjx4YvNtFJ!XjOmaTvB=Kt8Qdtyg^~R)0P`|yNS1~UQCvGUfqa&d1
z2RmMPMyuCrno%w3B^_{@cCe-GdSTaZ2Z>di-;SbA_srtrmLF~R8YQRQT3q*=ez3RL
zZ0@$+wtY`)lpR;Ey5+KGyJfw4s&d-(oF%v7mW@WaQE`l_=T<A5)i>ikOwZiT7+E|#
zdWmdpY*FwE_2CqMow_zoQZVU>yuF7<@-J*m`Bx;K!=&vv6r87yZ2WqP$l-!;bHYCq
z-k<P`!U>H-^5Y+7;{TNP4X)iu{8uS84uQmfNvR1Tisau>YC;I%KT&Ez2;sj_YC;I%
zzfo#J2;qNFYC;I%e~OajcSFc|?ulYf_}{|&6OK~k?M09_w!DB!_i`(;8<gAKD2%nS
zw4EKR>wn^P`MuOXSgrYX({rNLw%coZDAiY1bjwKOJCs)aY~bo}!pJB&4h@^Vz_A+M
zmLC{!+M#FZ*1Kr*HVtDm>oQkiS>4DEBUs%|tXaEuv*%g0RiVxftd{KumLK>LM1$o-
z?Xa6>iQTfp(7v2nVz(E#7$qgn5qGCEV<L9$<FrgmGMySV)RY(Z|DZ5TMtX1dsmnc^
zqL?W1)reV7_*w?|IMGF<>qtCZW85U^(<%41tRdZ$82<B+J3qxaI+jvGnQmc@`gGhr
W<L>`Q+*?UbKk>C#P->>@r1A+>rU?lE

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script
new file mode 100644
index 0000000..51c13db
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script
@@ -0,0 +1,72 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+. = 0x40400;
+buf8 = .;
+. += 0x400;
+. = 0x44000;
+buf7 = .;
+. += 0x400;
+. = 0x48000;
+buf6 = .;
+. += 0x400;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+/* No tile with memory exists to the north. */
+. = 0x60000;
+. += 0x10000;
+. = 0x70400;
+buf11 = .;
+. += 0x400;
+. = 0x74000;
+buf10 = .;
+. += 0x400;
+. = 0x78000;
+buf9 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_5);
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll
new file mode 100644
index 0000000..4ed7251
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_5() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %17, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %17
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf11, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = getelementptr float, ptr @buf10, i32 %3
+  %9 = load <16 x float>, ptr %8, align 4
+  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
+  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
+  %15 = getelementptr float, ptr @buf9, i32 %3
+  store <16 x float> %14, ptr %15, align 4
+  %16 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+17:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.o
new file mode 100644
index 0000000000000000000000000000000000000000..e70224c9ad756c9bb1acf0ca65fa018ff13a4066
GIT binary patch
literal 1000
zcmZ`%O=}Zj5Pse!O}0{HTd*#I5PH(sF1xlKdXY^`wc<uZP=X#plCCJmhHkf5PZjJz
zM5GsGk6yg@)I(1F1=3@W75oAE2dvNRSGNI&ot=4R=ACEW+1*}t?<l2^ih`Cyz~LBB
zU~(}kxQa4PYbDgc0pI&N_Gl{d40Tvi^YdibQb!ZB!29yBg`@ipFs|)yeOCv%UBT};
z)`%EJeJ1witm667->~_4Dt!23>Q}AY`Cx|^f$&S7)R=KJ<Et|PtkjM#huiB{8aO5)
z-zBmFWa#^o&s4+_;~oD(4j<Nx?x&KPl^K|iMe=)NBY3*PDYHpA>lto2c;<ZEkNgf6
z5<X5}Qs;P5`^H=D#9Ty3?wkZNb1CzHBL18_CAP9y0gveK=I{&gRT?PZUgA28P4mlj
z(-f^vT_3?qBUoUSdlV5fUyfY%0<LSZFSn@5PeJ)etrjB}COH-Vmq^UO+X`T=4Fb=+
z`{42XM$qZKg6lqMHePo6UN2}4RyX?|%#FYgI$heguf=vZpD)4my?!U^VD<*S*S6X>
zVY>CY)xMEfTMBKPsO>JVty&R0T~so}|IIa~k}E_-EzbQWSDvm^tZ1?lIJp{eWv3#y
zsiSrB4N^(#ZZRP1a*~8j-WB@NU*?hABvsa)JC1jGkjvTh*&-9E+a;3najNta?tf@J
Bay|e6

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll
new file mode 100644
index 0000000..80307e8
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll
@@ -0,0 +1,72 @@
+; ModuleID = 'air_project/mul_kernel_0_core_0_5.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf9 = external local_unnamed_addr global [256 x float]
+@buf10 = external local_unnamed_addr global [256 x float]
+@buf11 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #0
+
+; Function Attrs: nounwind memory(inaccessiblemem: write)
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
+
+; Function Attrs: noreturn nounwind
+define void @core_0_5() local_unnamed_addr #2 {
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %17, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %15, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf11, i20 %4
+  %6 = load <16 x float>, ptr %5, align 64
+  %7 = getelementptr float, ptr @buf10, i20 %4
+  %8 = load <16 x float>, ptr %7, align 64
+  %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
+  %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8)
+  %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60)
+  %14 = getelementptr float, ptr @buf9, i20 %4
+  store <16 x float> %13, ptr %14, align 64
+  %15 = add nuw nsw i32 %3, 16
+  %16 = icmp ult i32 %3, 240
+  br i1 %16, label %2, label %17, !llvm.loop !1
+
+17:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 53, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind memory(inaccessiblemem: write) }
+attributes #2 = { noreturn nounwind }
+attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll
new file mode 100644
index 0000000..5a9b5b8
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_5() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %17, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %17
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf11, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = getelementptr float, ptr @buf10, i32 %3
+  %9 = load <16 x float>, ptr %8
+  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
+  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
+  %15 = getelementptr float, ptr @buf9, i32 %3
+  store <16 x float> %14, ptr %15
+  %16 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+17:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_design.bif b/examples/elementwise_arith/air_project/mul_kernel_0_design.bif
new file mode 100644
index 0000000..86ba205
--- /dev/null
+++ b/examples/elementwise_arith/air_project/mul_kernel_0_design.bif
@@ -0,0 +1,10 @@
+all:
+{
+  id_code = 0x14ca8093
+  extended_id_code = 0x01
+  image
+  {
+    name=aie_image, id=0x1c000000
+    { type=cdo file=air_project/mul_kernel_0_aie_cdo_elfs.bin file=air_project/mul_kernel_0_aie_cdo_init.bin file=air_project/mul_kernel_0_aie_cdo_enable.bin }
+  }
+}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_mul_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/mul_kernel_0_mul_kernel_0_sequence.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f2eb383b7eb903e8ad8809b7d44fb0a4d660bfb6
GIT binary patch
literal 3248
zcmcJQ-Aw~A5QNu<g?K=M2OfApq6JF0GAPG%L<ue-N=1yF_0OCH8j;via_gU++3a_@
zvk>neg4w5;eLtEZyo^(@j}krmd|WcZ&8OQJf_lyu+9e|Yukmso`>#0IXQ!}T>04{-
zdWRz5+iJIiZ@khs7OO7qzIV<49O}M3Vd35Et*zr4+n!k}H6?s5ZHHP2b8iNd9f!uq
z2%ELeWFG=0l9+H#Mw;SV&vYiNQGA=m?@TyH^<^fmv5ZNwd1Or5@6ws1SwdvOIT;Fy
zFEe2cjS1(dzRbimmN6++jgb*5y%`xL$~=|+C!AAeiZ3%^4UGxssJ_g^HI^|MPc*|&
zYb<A{(Vr%DO=9vtUuMD@8WYaRC|J1{n2BpFW3p2})f#K>`xk4J-t?`*yLP<eIY;$n
zCaj?`;T+YMnYhL>CTa8!jZ|yI(P^-T#)NY+)Rg;`nXrb&gmYA1X5t#lnB*HUVGWH5
W=cv75Caj?`;T+YMnYhL+nEU|x-i@&U

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/npu.asm_air_output.mlir b/examples/elementwise_arith/air_project/npu.asm_air_output.mlir
new file mode 100644
index 0000000..a66ce9e
--- /dev/null
+++ b/examples/elementwise_arith/air_project/npu.asm_air_output.mlir
@@ -0,0 +1,300 @@
+#loop_annotation = #llvm.loop_annotation<mustProgress = true>
+module {
+  aie.device(npu2) @square_kernel_0 {
+    %shim_noc_tile_0_0 = aie.tile(0, 0)
+    %shim_noc_tile_1_0 = aie.tile(1, 0)
+    %mem_tile_0_1 = aie.tile(0, 1)
+    %mem_tile_1_1 = aie.tile(1, 1)
+    %tile_0_2 = aie.tile(0, 2)
+    %tile_0_3 = aie.tile(0, 3)
+    %tile_0_4 = aie.tile(0, 4)
+    %tile_0_5 = aie.tile(0, 5)
+    %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32}
+    %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32}
+    %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32}
+    %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32}
+    %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32}
+    %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32}
+    %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32}
+    %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32}
+    %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32}
+    %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32}
+    %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32}
+    %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32}
+    %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32}
+    %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32}
+    %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32}
+    %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32}
+    %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32}
+    %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32}
+    %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32}
+    %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32}
+    %buf9 = aie.buffer(%mem_tile_0_1) {sym_name = "buf9"} : memref<1024xi16, 1 : i32> 
+    %buf8 = aie.buffer(%mem_tile_1_1) {sym_name = "buf8"} : memref<1024xi16, 1> 
+    %buf7 = aie.buffer(%tile_0_5) {sym_name = "buf7"} : memref<256xi16, 2> 
+    %buf6 = aie.buffer(%tile_0_5) {sym_name = "buf6"} : memref<256xi16, 2> 
+    %buf5 = aie.buffer(%tile_0_4) {sym_name = "buf5"} : memref<256xi16, 2> 
+    %buf4 = aie.buffer(%tile_0_4) {sym_name = "buf4"} : memref<256xi16, 2> 
+    %buf3 = aie.buffer(%tile_0_3) {sym_name = "buf3"} : memref<256xi16, 2> 
+    %buf2 = aie.buffer(%tile_0_3) {sym_name = "buf2"} : memref<256xi16, 2> 
+    %buf1 = aie.buffer(%tile_0_2) {sym_name = "buf1"} : memref<256xi16, 2> 
+    %buf0 = aie.buffer(%tile_0_2) {sym_name = "buf0"} : memref<256xi16, 2> 
+    %mem_0_5 = aie.mem(%tile_0_5) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_12, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_5_11, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_5 = aie.core(%tile_0_5) {
+      %0 = ub.poison : i16
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1)
+      scf.for %arg0 = %c0 to %c256 step %c32 {
+        %subview = memref.subview %buf7[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_14 = memref.subview %buf6[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %2 = arith.muli %1, %1 : vector<32xi16>
+        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+      } {loop_annotation = #loop_annotation}
+      aie.use_lock(%lock_0_5, Release, 1)
+      aie.use_lock(%lock_0_5_13, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_4 = aie.mem(%tile_0_4) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_9, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_4_8, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_4 = aie.core(%tile_0_4) {
+      %0 = ub.poison : i16
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1)
+      scf.for %arg0 = %c0 to %c256 step %c32 {
+        %subview = memref.subview %buf5[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_14 = memref.subview %buf4[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %2 = arith.muli %1, %1 : vector<32xi16>
+        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+      } {loop_annotation = #loop_annotation}
+      aie.use_lock(%lock_0_4, Release, 1)
+      aie.use_lock(%lock_0_4_10, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_3 = aie.mem(%tile_0_3) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_6, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_3_5, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_3 = aie.core(%tile_0_3) {
+      %0 = ub.poison : i16
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1)
+      scf.for %arg0 = %c0 to %c256 step %c32 {
+        %subview = memref.subview %buf3[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_14 = memref.subview %buf2[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %2 = arith.muli %1, %1 : vector<32xi16>
+        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+      } {loop_annotation = #loop_annotation}
+      aie.use_lock(%lock_0_3, Release, 1)
+      aie.use_lock(%lock_0_3_7, Release, 1)
+      cf.br ^bb1
+    }
+    %mem_0_2 = aie.mem(%tile_0_2) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_3, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb3
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_2_2, Release, 1)
+      aie.next_bd ^bb4
+    }
+    %core_0_2 = aie.core(%tile_0_2) {
+      %0 = ub.poison : i16
+      %c256 = arith.constant 256 : index
+      %c32 = arith.constant 32 : index
+      %c0 = arith.constant 0 : index
+      cf.br ^bb1
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1)
+      aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1)
+      scf.for %arg0 = %c0 to %c256 step %c32 {
+        %subview = memref.subview %buf1[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %subview_14 = memref.subview %buf0[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+        %2 = arith.muli %1, %1 : vector<32xi16>
+        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+      } {loop_annotation = #loop_annotation}
+      aie.use_lock(%lock_0_2, Release, 1)
+      aie.use_lock(%lock_0_2_4, Release, 1)
+      cf.br ^bb1
+    }
+    aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0)
+    aie.flow(%mem_tile_1_1, DMA : 0, %shim_noc_tile_1_0, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0)
+    aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0)
+    aie.flow(%tile_0_2, DMA : 0, %mem_tile_1_1, DMA : 0)
+    aie.flow(%tile_0_3, DMA : 0, %mem_tile_1_1, DMA : 1)
+    aie.flow(%tile_0_4, DMA : 0, %mem_tile_1_1, DMA : 2)
+    aie.flow(%tile_0_5, DMA : 0, %mem_tile_1_1, DMA : 3)
+    %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1, Release, 4)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_1_1_1, Release, 1)
+      aie.next_bd ^bb10
+    }
+    %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) {
+      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
+    ^bb1:  // 2 preds: ^bb0, ^bb1
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb1
+    ^bb2:  // pred: ^bb9
+      aie.end
+    ^bb3:  // pred: ^bb0
+      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
+    ^bb4:  // 2 preds: ^bb3, ^bb4
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb4
+    ^bb5:  // pred: ^bb3
+      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
+    ^bb6:  // 2 preds: ^bb5, ^bb6
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb6
+    ^bb7:  // pred: ^bb5
+      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
+    ^bb8:  // 2 preds: ^bb7, ^bb8
+      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1, Release, 1)
+      aie.next_bd ^bb8
+    ^bb9:  // pred: ^bb7
+      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
+    ^bb10:  // 2 preds: ^bb9, ^bb10
+      aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4)
+      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32}
+      aie.use_lock(%lock_0_1_0, Release, 4)
+      aie.next_bd ^bb10
+    }
+    aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0)
+    aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0)
+    aie.runtime_sequence @square_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
+      %0 = aiex.dma_configure_task_for @air_channel_0 {
+        aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
+        aie.end
+      }
+      aiex.dma_start_task(%0)
+      %1 = aiex.dma_configure_task_for @air_channel_3 {
+        aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
+        aie.end
+      } {issue_token = true}
+      aiex.dma_start_task(%1)
+      aiex.dma_free_task(%0)
+      aiex.dma_await_task(%1)
+    }
+  } {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>}
+  aie.device(npu2) {
+    aie.runtime_sequence @square_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
+      aiex.configure @square_kernel_0 {
+        aiex.run @square_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32)
+      }
+    }
+  }
+}
diff --git a/examples/elementwise_arith/air_project/placed.asm_air_output.mlir b/examples/elementwise_arith/air_project/placed.asm_air_output.mlir
new file mode 100644
index 0000000..aa82d2e
--- /dev/null
+++ b/examples/elementwise_arith/air_project/placed.asm_air_output.mlir
@@ -0,0 +1,86 @@
+module {
+  air.channel @channel_0 []
+  air.channel @channel_1 [4, 1]
+  air.channel @channel_2 [4, 1]
+  air.channel @channel_3 []
+  func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
+    %c1 = arith.constant 1 : index
+    %0 = air.launch async (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> attributes {id = 1 : i32} {
+      %c1024 = arith.constant 1024 : index
+      %c1_0 = arith.constant 1 : index
+      %1 = arith.muli %arg8, %c1024 : index
+      %2 = air.channel.put async  @channel_0[] (%arg14[%1] [%c1024] [%c1_0]) {id = 1 : i32} : (memref<*xi16>)
+      %3 = air.channel.get async  @channel_3[] (%arg15[%1] [%c1024] [%c1_0]) {id = 2 : i32} : (memref<*xi16>)
+      %4 = air.segment @square_kernel_0 async  attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 8 : i64, y_loc = 2 : i64, y_size = 6 : i64} {
+        %c4 = arith.constant 4 : index
+        %c768 = arith.constant 768 : index
+        %c3 = arith.constant 3 : index
+        %c512 = arith.constant 512 : index
+        %c2 = arith.constant 2 : index
+        %c256 = arith.constant 256 : index
+        %c0 = arith.constant 0 : index
+        %c1_1 = arith.constant 1 : index
+        %async_token, %results = air.execute -> (memref<1024xi16, 1 : i32>) {
+          %alloc = memref.alloc() : memref<1024xi16, 1 : i32>
+          air.execute_terminator %alloc : memref<1024xi16, 1 : i32>
+        }
+        %5 = air.channel.get async [%async_token]  @channel_0[] (%results[] [] []) {id = 3 : i32} : (memref<1024xi16, 1 : i32>)
+        %async_token_2, %results_3 = air.execute -> (memref<1024xi16, 1>) {
+          %alloc = memref.alloc() : memref<1024xi16, 1>
+          air.execute_terminator %alloc : memref<1024xi16, 1>
+        }
+        %6 = air.channel.put async [%5]  @channel_1[%c0, %c0] (%results[%c0] [%c256] [%c1_1]) {id = 4 : i32} : (memref<1024xi16, 1 : i32>)
+        %7 = air.channel.put async [%5]  @channel_1[%c1_1, %c0] (%results[%c256] [%c256] [%c1_1]) {id = 5 : i32} : (memref<1024xi16, 1 : i32>)
+        %8 = air.channel.put async [%5]  @channel_1[%c2, %c0] (%results[%c512] [%c256] [%c1_1]) {id = 6 : i32} : (memref<1024xi16, 1 : i32>)
+        %9 = air.channel.put async [%5]  @channel_1[%c3, %c0] (%results[%c768] [%c256] [%c1_1]) {id = 7 : i32} : (memref<1024xi16, 1 : i32>)
+        %10 = air.channel.get async [%async_token_2]  @channel_2[%c0, %c0] (%results_3[%c0] [%c256] [%c1_1]) {id = 8 : i32} : (memref<1024xi16, 1>)
+        %11 = air.channel.get async [%async_token_2]  @channel_2[%c1_1, %c0] (%results_3[%c256] [%c256] [%c1_1]) {id = 9 : i32} : (memref<1024xi16, 1>)
+        %12 = air.channel.get async [%async_token_2]  @channel_2[%c2, %c0] (%results_3[%c512] [%c256] [%c1_1]) {id = 10 : i32} : (memref<1024xi16, 1>)
+        %13 = air.channel.get async [%async_token_2]  @channel_2[%c3, %c0] (%results_3[%c768] [%c256] [%c1_1]) {id = 11 : i32} : (memref<1024xi16, 1>)
+        %14 = air.herd @herd_0 async [%5, %async_token_2]  tile (%arg16, %arg17) in (%arg18=%c1_1, %arg19=%c4) attributes {id = 3 : i32, x_loc = 0 : i64, y_loc = 2 : i64} {
+          %c32 = arith.constant 32 : index
+          %c256_5 = arith.constant 256 : index
+          %c0_6 = arith.constant 0 : index
+          %16 = ub.poison : i16
+          %async_token_7, %results_8 = air.execute -> (memref<256xi16, 2>) {
+            %alloc = memref.alloc() : memref<256xi16, 2>
+            air.execute_terminator %alloc : memref<256xi16, 2>
+          }
+          %17 = air.channel.get async [%async_token_7]  @channel_1[%arg17, %c0_6] (%results_8[] [] []) {id = 12 : i32} : (memref<256xi16, 2>)
+          %async_token_9, %results_10 = air.execute -> (memref<256xi16, 2>) {
+            %alloc = memref.alloc() : memref<256xi16, 2>
+            air.execute_terminator %alloc : memref<256xi16, 2>
+          }
+          %18 = air.wait_all async [%17, %async_token_9] 
+          %19 = scf.for %arg20 = %c0_6 to %c256_5 step %c32 iter_args(%arg21 = %18) -> (!air.async.token) {
+            %subview = memref.subview %results_8[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+            %subview_13 = memref.subview %results_10[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
+            %async_token_14, %results_15 = air.execute [%arg21] -> (vector<32xi16>) {
+              %23 = vector.transfer_read %subview[%c0_6], %16 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
+              air.execute_terminator %23 : vector<32xi16>
+            }
+            %21 = arith.muli %results_15, %results_15 : vector<32xi16>
+            %async_token_16 = air.execute [%arg21] {
+              vector.transfer_write %21, %subview_13[%c0_6] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
+            }
+            %22 = air.wait_all async [%async_token_14, %async_token_16] 
+            scf.yield %22 : !air.async.token
+          }
+          %20 = air.channel.put async [%async_token_9]  @channel_2[%arg17, %c0_6] (%results_10[] [] []) {id = 13 : i32} : (memref<256xi16, 2>)
+          %async_token_11 = air.execute [%17] {
+            memref.dealloc %results_8 : memref<256xi16, 2>
+          }
+          %async_token_12 = air.execute [%20] {
+            memref.dealloc %results_10 : memref<256xi16, 2>
+          }
+        }
+        %15 = air.channel.put async [%14]  @channel_3[] (%results_3[] [] []) {id = 14 : i32} : (memref<1024xi16, 1>)
+        %async_token_4 = air.execute [%15] {
+          memref.dealloc %results_3 : memref<1024xi16, 1>
+        }
+        air.wait_all [%6, %7, %8, %9, %10, %11, %12, %13, %async_token_4]  {air.segment_end}
+      }
+    }
+    return
+  }
+}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0.pdi b/examples/elementwise_arith/air_project/square_kernel_0.pdi
new file mode 100644
index 0000000000000000000000000000000000000000..1a6b4e2869f47c37579486ca1fbb299d49a2e6c4
GIT binary patch
literal 6272
zcmeHLO=w(I6h3d}C7Eh$-^f^A+Z11tBB5mQ(uvSQz&FkmqDGt=g@TKwLM;WA*p1y8
zG1MX=0XNf%kc}HxgF+W|GtzF9BD3tqFs-iTrMT##I-YZX-u>RQSyWerUYMEh`|i2t
zo_o%jzxyi@wdzyVOCP@T=FK0zy-ehj$J!x&$m={mWPG{yb4?@t{8Fpkj)I6{6b+Vj
z|7Bb_{Nokb57(bQcjnZ&_fDTZ;~Y*nm`7jD96NY(gy<)J$SaPa8E>#SN>su<(C+`>
z<9`hQ^4USN^&0O$bc18~^ZV|w%A1wEGEp?0olSVKH7REL!8|N*D)0*Ms;5sNzDK<G
zP1sZ9d$h?FrNOQ-qY*vF8N2gxIihryZW9G`r03CFUc>>>)quKvj{)5tUOVaC+BHS=
z$=J1%bnBIbJy)(?_{Qr6b80QAF&nRskN(|x^^0n!{Y&-t#@OjgbDd}OoP7@WCKaMZ
zZ?LcP(FX@x$!(6Z72V;)^ZxAd-SgwJzmA9b`uW|!`;GAJ&o9!Xnb%HbzFU&@j`R9!
znlZA&?JY{tKCG<lCF)$HT|{Ap_O27n*p<dUk@tYgkN96@wIq>rzP`Svw3>cb?tXf&
zbN2q&Y{^f`<~)>G`%v<0MtJOoD*}a=uaAW^V`gt~M(z!DU)Vm7WzrY64=lHRV7cuB
z%PFcC`f9GwS9wlbd0qjY@24Y=`-vXKe$w@|{Slh_T5juaxvjtD6g3O|)F|{*p3_#I
zSAggH>BYzW^hoxTuCMKn(A3v*TYt-K{Vk_xxzJB13jLJlw3X)-;Q4-f{c%4%lKrIX
zYx^TK^|jpA-*Q`j;ZlG5TMB<W$!{y+GRhvKbTY#M;0SOGI04)SyaIR?@EYI^z?*=(
zfO~-ZfCqq6gZ269x%Y^C$LG&C2j_D*N*#^?CxF|4R{*a9UIV-VcoT3Ja1U@F@BnaX
zus&b2a6Zjl{TgsShojWt7;pl(4R{6cD&RH18-O<fcLDbR_W=(8rv~fuEf>zGxjWwp
zIG@8&>TnD=0o(?>0(ceh8sH7Un}EB3dw~0Z2Y^$+0ZquSWc^-aY|q7d33S!6@_vlD
zw>@JYLC*)E6X-GbinDh;9e_^ITQTR}*}EPIKqshG%x7`-F6RO0WX@Za&l597JpN-N
zqt9zRl>*Jp^WX)4^g;HKvQKihLVi2t@0R@3_%l$3^iE)~jn8K;SoIamRatcLG#)Y@
zu}@^n+;H*mJP#d@#vXHt)#vYj!8#r{myd^Q_E7l|`-Hvbhl_{jdFXgFcJb(5(eb#s
zd_0p68IRZ}YEOJ}@$ftk9goHybDbaE>pC7cr{j4-ts~mci4<tLk<CkTzp;+?k=V-y
zv_EfXSYw!cjP_JE_TFb|RdG?#KC%8whl{3)7wseKzkFp!|B{ROQI3O$<RUg$7N|Ti
z7p(GBm>Xx#|E1Mjr7%}xE>2bM3v(09_nEm6=InjRh-dT7c~p^G&Wm6q!$vWm+J1+8
z7VWuS+P?JQdi~43Z0vW)XVISPrLHN6cBt3=WZmRnoyz22VVnPj(fwp|e3Y>aJJ(`v
zYm*e_mYCZdFBax}=C(Fem|IB8#8+Z#5?5`T@52nPuzsAVCpu|e<GW>g-g|bB(&>P_
zQ`FiS??ARTX5IQB&=b}<er4~2KFd1qqVx*rM_K2Hls*o6kd{k)Wux?DyDUf3Ygvbw
zMenIN@0Lq8=0kKKv**|xW_`Eua};!+^=vQDE37|d?B_urXMK;+7eM#ZX}RN6afaK|
zS$-1MvmAoH$hxbGvv0A^-*(DR1olg;XSoIaB<oo`p!;bZ{M55}=2>^~IDLV2zMfP2
z9@=*L66<bXr`K3_`#RlEn{cis?As*1m{a~Iww=Dj_Rhc4Ypgr}PWRJg@V}h-ud(j@
zI6X|KOHZ18XSb)beu!Al>JEC1^{0*fBIpyWXZ-_unAX969sJk9zth9C3HD88uRe=Y
pj((hv!{1NB{nRVPOgYBf|1Y+ZbL!7;a|h|TzC&_c-vJ26@eg<U>b?K~

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_elfs.bin
new file mode 100644
index 0000000000000000000000000000000000000000..29b57b909b220d0abba36bf749886d055504d85a
GIT binary patch
literal 2528
zcmeH_ze~eF6vw}pR+B=}cCehF5GzQB4z^Bisa6n0{IwKZ)Wt=l*2T@iq7LF52f@X?
zI4E=~*jYL{6)XM$jt*kIYwpOy<R2*XK}z1|?%loSeWCysi|c>^oQP-Xxz<fGz+_Re
zBEp`ANPD4KqCF+I_a4o72tx?72um{W<RNyXZb&1_a1Rvm9y1jKW+s4-m57b6EIL2}
z`3A!dvqf`wN}$$a;su><y~N&PY2e(fm+-dg07KT(=>uyda=lvgB0F1+5C6JrH%WKn
zHThXE_pWm8n75{XdCIYXW%fSqo*hpXosMX<XnzcgTL5w5VI?l__xcfewK78TR)Ozd
zUtmhDdxy&Pnj!1`T-4XSOuXsv@0R5Mes1li0QUkhfLlmafsC$H`yD{cPW4kS3u?$t
z+`E@iqx|(4sa<}%2R~-kh&mDVf8gl-Tz+2_e7@-q0sQvG<jAPiruZV?UEdS!ge?8<
z1ZCO@%Cr-d!JZ58E*Iimkf9x97Gb_``GIffSKquDQ)jrEF=g5X%CrlV!CnmUtq|f{
zkf9x97Gb_`3j^Q!);Dj))ETa3Oqq6pGVKCou(v~e+YIq7$j}Zli!k4}t$}ZS>zg-Y
Q>I_#ircAp)nRY?4KU-3TT>t<8

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_enable.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7cc1818bce4d6ce1226fc5fda519967a4842b99e
GIT binary patch
literal 104
zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>))
B1+xGE

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_init.bin
new file mode 100644
index 0000000000000000000000000000000000000000..ace360fc11f90c98660ab2cdf87c15ccc1146121
GIT binary patch
literal 4300
zcmb7`y>1gh6ov2pBp3*=K`u;UiAAPBC_;-Iq!f`D%Rx$EiJ&xe<Pq`+MxvrhO_fJ*
zL+?lM6GWDHfpO|c;mq#NdhW@l$V!Rz-E(H<&Y4-usfdh^zKX;mA5xLue`mMbk;tP6
z;GTTor1FnzPVPjB`w+2j$`8*OaZX$i8{!k<GvagNDe(pICGm`SPP`yq65E>BnG9QU
zs%kD6pNrU!l#}M1^n%zBpAerBpA%1sFNiORXT)>j1@V&D*1XPX+>%pO(>aYOC(XIl
zTo4=L6XG-CbK)uS1@R^EjCfAGAYKyNn%6m<w&YaRbWYzWC(XIlTo4=L6XG-CbK)uS
z1@R^EjCfAGAYKw%;!OJDzNwn`3moPHJtp0Koi+Xf?{O4xPVad}x{zMre(Ul2ea=W1
zdOr%ByB@Fabw;|-y(y4~9<R?kBVFqJwxcoz7lmv@o$^x_7#o#wR$TvIeoi;r{CVj&
z%Io0YkMy}ekM32~kH6Ph&0)_eYL2}}0?#&tb$_D=_UHV@wWJ>1A3S*c+Sygb5o&%9
zDQb?rM*@%T4`=_N{G8ulX8q{?;KAe9&aNuf{Z)6+dnEAa{@y&WKj$~)`r$v_A3S*c
z+Sygb5o+uFs&~wLB=A)IJt^l9*@>&$=eaJ|ALeHCEA;2i?-r6BZLDc@Ykr1)Nos%e
zyBx)X*l+y&@q4zG%>LZZpZvLZexJ$yCVIi+Gbw6TxuNFRd!)s)g<3E0bXq)J)ZAiz
zo~*^wN9|SM=>?vR$e#ytE_db`1A@ogjyUhie^1`*$9Y!%xcxlu_><ayPu}duc}jJ^
z?m5pJ^D8)?a%Vm>oKMN&`5dAaW`EG)IYKQwL(}4!pcZC-+~P@53(pXD;ch=Rkqwcl
z-$Qp*{x)y81O1E0-0M>5Yran8Q}p^pqsy+~q?_^YqFuLU!v*w^g>(ZQdvE+H>3h(z
zr$+CPegGXiX!I@8Gn>R&pqo|VB5M5DZ%q4j`n=iSbrOgBeJi_aJocj*`txA^1JYCI
zsvpuj(4Ph4_etM^{vyy1Nl)!SHUqu48mRmYbah^&A4AvoLytd#j$Z;zemRYwKv(BU
z`giE6f6`Msr1^)c|9$AXf2|)ve;oApcBS<RblsoUyU=xiT2Jkm@)=WqV|f&;-(P8c
zg7JENt#_g8^|hYbQ(FI2t>1;N=hu4A4&tXle|xKey1zMebw5e(LVp>IKPJ5oUEP1u
pdv-|c4{7}&t*`Z-9n<)6Io^K<i^cA@5D9+J`&w$FxIhW}lK+|*O)3BY

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.elf
new file mode 100755
index 0000000000000000000000000000000000000000..ddd5bbc63d792e8ccbdd7d5efb1a64507a675d14
GIT binary patch
literal 1600
zcma)6PiWg_6#vQbVh2j+ZY+4LgRdQoY-ElUn|1D`YH-U~9q0}lfihHCQW6nc&LlZY
zcWK8$wnOQq7xz%;rO>0<p@*CT?RBHaTn0_p?(MCI((Lyv$u?t${ov`n-~0D{`kwUN
zywtp`X&N%*P#m!uopWyiQ`9vm<S>T<@{sIXlnetpM3{;a<?Fv=H;YZ`=}D5>c?}PM
z2A<CpM$wN?14my?DZH4+p|o7w*Kkc!fWqSv2G2AaI6U54(GF(L0zVh_R&a2|<PM6D
zw;yWzC1Y5NA?Ed+6Vq?{%Xd!oul{lBuZsoySEK)463d?DS+fXysXc$cfAjkf8|ES3
zrlF3wVXn`P?eK)GpX`TxCp<a9pIt1sZtvnu8c&)^e2|m+F=zc=!f0g}uPvT`=a-Au
zXMz4MW`J@LvpXC~R+;9T;7o?+BmFn&&B;#s_n*9-+kElc)Zmv_{k7MH`CQ(dO5X=&
zmS0XrHf&k0KOGMN<$F&GWz5MyiKje~SI0op4>r8;ylOTZ%BWZLiViq!J6Ka&UKsh^
zAoZ%b^*G*)&Mz#i`SE(MRdKqVg%!W;2iptn_EzUJ+xJwf>bQE{tyVqTt?Kp9YhSpY
zbIz@~RijmH)f}Vlx%Jv={gY%5lTU7Eggjmzze{#?<k9c}?FSS5Bif}=k%sa4%jvy5
zmT%phh%b>V5&GVNhAXs@PmZUA>0~W)%)gO1p(v97oe=&l0?FT^HrY$YBk`Z86-6L?
zms$xSO5}H`l@LPsLuw_25I&$*LI~me)QTby{yVi2LI^()BcJ?%vY$VNn`4ej>}|)8
zF?GCvMs%YS+b!z#C=L^Aq-|%ziu@nFh~Go=+vSFDw>>9b?z+9Uhe~s4Nw<ttFH&30
zOM$Dy31g$;ICQM`0>^53YkpwB>4u)ATZ?G*Rt<URu&gMy!x&bynON4A-R^l-V_B%P
z0jp#Cf#nB&46$H2aW{;zD)C!(7}_^7PZIV57sH}7IOOhRWkSTyeN^SKI2@WOhxPxU
zF-;^n+2o#-yD>v4HRQ_>x1Mr62mFxQqOv}c&(H{uNUk#|cdhi`pU?U7ZlK6LouSCU
ixr|U}TR2O5GVcQOj{YO>)3oMR8au-UWzDQlrvCwN-uX%Z

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script
new file mode 100644
index 0000000..13a60c2
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script
@@ -0,0 +1,66 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x74400, LENGTH = 0xBC00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+/* No tile with memory exists to the south. */
+. = 0x40000;
+. += 0x10000;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+. = 0x60400;
+buf3 = .;
+. += 0x400;
+. = 0x64000;
+buf2 = .;
+. += 0x400;
+. = 0x70400;
+buf1 = .;
+. += 0x400;
+. = 0x74000;
+buf0 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_2);
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll
new file mode 100644
index 0000000..d193819
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [1024 x float]
+@buf9 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_2() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %13, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %13
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf1, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
+  %11 = getelementptr float, ptr @buf0, i32 %3
+  store <16 x float> %10, ptr %11, align 4
+  %12 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+13:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.o
new file mode 100644
index 0000000000000000000000000000000000000000..57437bb20b770876576b8753e9fb67450fce9b78
GIT binary patch
literal 932
zcmZ`$O-mb56g|(cj1VLWO;czgx~if>M%)Neq9kphlj6oAbrC0VT1r#Xjw9G^5@?Vv
zLf7*Lx_9LwtNw!0rOOEZ0Q(2D=f0V7hJY9DyZ5|v-hKDo*Qwgil~RaJ!AK#XKL(7G
zT+@Q5$RN?m)+LZbiOHTS0!5tXa7M)Z@^-MRE~XcN@0r0aE?$*^N#p$Fm-?Nvinv~;
zOl-N-%TC<{wV$)W_Mh3S&5ZNI3Lb=9ycY6h19+*1kAlvZClw6Y1lK8jKP$GptlZC<
z?1#AL&7|<(%|fGpius5Siwb`>Ci(>P{2)Yk-2utnLXGMR#(o|MPBBfTfqXAKfA9J*
z#Z2Je?OS3QUuff~G3iD~CjXxKbqZIayvPa6k>9!F<RgH_2o|`=If)6slF=!Ah9&t}
z+Bf>rgieuTUBy4wX8P`l5A(3?yXK46J1a-N)BFhAep9P_a9VEDueA@3TQ1Ba-}9XY
z<JV6*dTz_nhs|c&bL*x0DolH8OJd{rofQE~K#8DkR}T-8DMYv58y3k#t!vF=_WL=d
z<`c9@6px;dvffP=nTOaj`lwFk8(XYf=Z1`_91%lmHkeC&=|{LMT}GTbj{P)9aTfD<
R)J(Om%P8_my3~`q{{x==X=?xg

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.opt.ll
new file mode 100644
index 0000000..f007116
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.opt.ll
@@ -0,0 +1,65 @@
+; ModuleID = 'air_project/square_kernel_0_core_0_2.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf0 = external local_unnamed_addr global [256 x float]
+@buf1 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #0
+
+; Function Attrs: nounwind memory(inaccessiblemem: write)
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
+
+; Function Attrs: noreturn nounwind
+define void @core_0_2() local_unnamed_addr #2 {
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %13, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %11, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf1, i20 %4
+  %6 = load <16 x float>, ptr %5, align 64
+  %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
+  %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60)
+  %10 = getelementptr float, ptr @buf0, i20 %4
+  store <16 x float> %9, ptr %10, align 64
+  %11 = add nuw nsw i32 %3, 16
+  %12 = icmp ult i32 %3, 240
+  br i1 %12, label %2, label %13, !llvm.loop !1
+
+13:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind memory(inaccessiblemem: write) }
+attributes #2 = { noreturn nounwind }
+attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll
new file mode 100644
index 0000000..055e011
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [1024 x float]
+@buf9 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_2() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %13, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %13
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf1, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
+  %11 = getelementptr float, ptr @buf0, i32 %3
+  store <16 x float> %10, ptr %11
+  %12 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+13:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.elf
new file mode 100755
index 0000000000000000000000000000000000000000..a8d3607d6115f3e937af1d24e81c3abb59403ece
GIT binary patch
literal 1640
zcma)6J!~UI6#mvRu_K+p2O=^8q~VZIK*8CK4TrDVi+l(XM+yZf5<<&*cayAQufy&-
z<Y+F4C{Q388dD%$a}uRRaYd>~cP;43G)R{SL_=}cU4aPS+aKGa;*(~+_r3RK=IzYv
z+`7?zSJO1)D4;x64LaAK0%ph!C={@O5{i)Q1GXF^I;1fb6}Aulh}}Fk&9l=Yvx^#T
z11&t9D~+SCy#$=UKcnzy5oa>x@`;9zH3cZ#tzh&(qk*%F;|=X}?h5c#>39RDt0pTb
z-`&5Xom7luED|j0hnHsm8LoYIdAR=T<=?ND>~D<W^Jy&atMg_Vcu#xy!tmB-FSpDy
zeoafAvtX{z8N1;nSwF3Z`Y?QYN<X+>?R<NLSF(6Isl-PG89&h+f1D~>IVNk1=fC;Y
z@~88_@Cb82wT$^g@DQfJd=osAWPfD-T?PwM$?(T}&lGka{X8@J`pIzX@6uwSXwGEc
z!!cR?)%3`c8LQ3vlS4rDhkK<e7UV#Q`x@}%0%-feju&22?RHBU&6-}*0jFyRTWZe>
zBflSHUbV0t$Gg$BrKK%D-X3&nPQSOb;dlLDf2rHu>%D3Fp6b*cS8uxYx@Ws}z4><I
z9oKWNx(&B(bn2akV>CUt+1PBpp4Kqkvz#%Cczp33)%w_@;YHfFr}V3|E8{^LCi^eD
z_xM8Yx;YhJp;pr9dlnj2X``5KPX)8-TINK5C~-ouN&Y7y__qioe~)Z(E;$~Fe@Rws
z0?|igC4#6>e@|8-2+?=QN(3Q#L{=gQ(I;fZCJ_A#S&1M-pNdiBJt3r?-^4H{`doA|
z{X-@GXA+-i)M9Txh8$zh3ur_)d$HXiZ%1*MT4T1I9V_y`@FL!q_D5?i-|l)&yw-OI
zT@SVP%8G6o884Hq_Km>R;e@eKa~wK02Z3XCye&U4;PgY!(ycl=gUzH{hGj*u9mcSt
z-PE%7?C!v`T5BSm9aufv4=g|MV~7RIiTh!cj}pIShoOBl_oQJjaFGnkf{AvgN2Wsj
zEXI&dm+md;8p-jRvTgqd8k3}@JDk3!@`mQ9WQN=zaqF4Z3&7`@EtJnA`3#Nm6U708
z@+Qk1a?+B+|2=unNeLDhxS9)cUcwdH^6uxbnD<vM$a&rynSGGOR=J?8na6Yf7yJDG
AD*ylh

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ld.script b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ld.script
new file mode 100644
index 0000000..befdff2
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ld.script
@@ -0,0 +1,69 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x74400, LENGTH = 0xBC00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+. = 0x40400;
+buf1 = .;
+. += 0x400;
+. = 0x44000;
+buf0 = .;
+. += 0x400;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+. = 0x60400;
+buf5 = .;
+. += 0x400;
+. = 0x64000;
+buf4 = .;
+. += 0x400;
+. = 0x70400;
+buf3 = .;
+. += 0x400;
+. = 0x74000;
+buf2 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_3);
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll
new file mode 100644
index 0000000..9d2e115
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [1024 x float]
+@buf9 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_3() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %13, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %13
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf3, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
+  %11 = getelementptr float, ptr @buf2, i32 %3
+  store <16 x float> %10, ptr %11, align 4
+  %12 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+13:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.o
new file mode 100644
index 0000000000000000000000000000000000000000..6b3d34570ca51b7ee784f20a1a0a4907b4e35d5c
GIT binary patch
literal 932
zcmZ`$O-mb56g|(cj1VLWO(V3Bc2z}(q;^vfiB{W6C#48}fQxYwr$tPSjw5I{3JubQ
zbUlBddsi;9>MxKkT}JQ+*gsHv?wc8B2zcSXd(S)P-FM%8#|ze5r4(XQFj5FO>j8R6
zzS4ph$RN?m)-{kriOISu0!3Wpa8AVh`mVjEt_Gh2-!ttsTz#kj{l>-NFZDZDF5-5A
zGO>wLD?9KnsQw%dK3xvq%x0V)<=|1s#cLs7F@X1~Ga4N4J)K8~O>iC1_p@Td%gX(%
z$$p4i-cSnvnJv`LPVp?_!=l2U_lQ2hJl_t{eRn`|w@}^sg0YbYf>R7qX&~PU&p)_6
zPBDG>duJ2N_(JQ&DwC54$>hH<zfIxAC@*pXBjjJ-bMg_uQWq9D#yN=zKbg@fe1>uP
zSlT!Gc%M#@V_n5R*Jk?ep$~JX>AU8;kIR$$zEj_ZWvx`_x15Gs_p8m#gN6%p-}ii{
z#(4BqV(sAb6im-;IJ$yaZ+fm>vZsW23DcUJE7@@@6Wi9p&So-&==KN07@4Sbt$D(J
zcT;LUL7PPJ==rGV-DHt@h`phA>tw#M#kv`8$e1b-(V=FVxzv|_gv-)p#Hr)hNP`qt
TW*(24sn(q^ioBmL_2lmVgu7{C

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll
new file mode 100644
index 0000000..f2c89be
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll
@@ -0,0 +1,65 @@
+; ModuleID = 'air_project/square_kernel_0_core_0_3.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf2 = external local_unnamed_addr global [256 x float]
+@buf3 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #0
+
+; Function Attrs: nounwind memory(inaccessiblemem: write)
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
+
+; Function Attrs: noreturn nounwind
+define void @core_0_3() local_unnamed_addr #2 {
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %13, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %11, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf3, i20 %4
+  %6 = load <16 x float>, ptr %5, align 64
+  %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
+  %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60)
+  %10 = getelementptr float, ptr @buf2, i20 %4
+  store <16 x float> %9, ptr %10, align 64
+  %11 = add nuw nsw i32 %3, 16
+  %12 = icmp ult i32 %3, 240
+  br i1 %12, label %2, label %13, !llvm.loop !1
+
+13:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind memory(inaccessiblemem: write) }
+attributes #2 = { noreturn nounwind }
+attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll
new file mode 100644
index 0000000..ed78c15
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [1024 x float]
+@buf9 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_3() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %13, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %13
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf3, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
+  %11 = getelementptr float, ptr @buf2, i32 %3
+  store <16 x float> %10, ptr %11
+  %12 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+13:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.elf
new file mode 100755
index 0000000000000000000000000000000000000000..b06bbc2edc2ab1b34e0e4e5e00fa2e23d127e11e
GIT binary patch
literal 1640
zcma)6O>7%Q6#mvRu_IOBf{2U?(oiH6P;l0<Y3iP~k&6&<q;LR5s?f6D-6X5n>#(~{
zQ!Wi63LFp@F6MyLOC^pLhaPf@)Lub7=7Q8jATB+$w;mATd;4QsocN@f?|tvRnRz=i
zJ9lohKF~A`8FDC&RGrSX7l0{p9SS+jp@2Ljd!H@CkPc}~MTza>KO-lLP4e`lNbS6a
zyFe3<X9}a}+phy>A5AGdoyVz6xp=JMrltUe`y~t?X*6(pezdNg%v=V(E*!1nWW`_w
z#ru2rwBu4W9*Y?B`oV?ie+H}HUmUFccJYtv1?$`D;MF9S_2pTk2z;nLer<5)i#M9a
zDZi$v&R8(l=Zu};f~=p^LwyjuIH4b1FSoxt#G7e6nN;G#oQxl9jz3BitqkL}#q;0%
za`E$7U~q^Tpj^c40eA=#V7>vKNxVNY{|<vWsbuiegO_qUPk)&je)D{=`FCMHmp7);
z@8OuV{&I3;@r>oh!|@@Y{NsZ{8FO-=#6t~ueh#!ef7=bNsaC70s*Q?X(E+<-`I~Cj
z4MMNyr(QL;6-7JYwS|REFWTz2D|WBDu<ms{e{Z4F+3mh(d9G^LY)5Z6wVG==HNA1A
ze${d9C8zGxs_k04ZdV(w)2MGW-breh>{-qTc|1FRg=%f&(Qt|O-3k2`?b2wFhVlN(
z?mat~yKYRxm#LLB`d)^H723!r+f%}HvX(K{pGcfgY?A+(2>vYs$=@X#oJ)>J;$M*!
zn?Up-S&1M@)IX4w2txEdvJydv9+H&^Li90Nu?a-~N>(BW(I;Z$c~1zb=XWuTu|5->
zPySGe{}so_8kNZ1iy+6?b$uG)?QUeX$y;F*B-V&6d)o}XFWr#$rS<7*)3Z9R9j*49
ze#b?nwY;pG)sz>>X6uIU=&*yRTCr_9Hu}D8w%tw7ufpyHuBn@KwEG+HOIL?!hLIIS
zFvFe1GIy;`-!+@7BJFLMUCZ-L&-WsT1=EgtL70sazi9=5bvyGUVb^yM4@!fvb|yzA
zLj0_wCf%ZROVVA5kJpfG`#;c_BrVzD<UN%)G(#yh<OYdbPqm%{Zl$(RHjm^pG{Vmm
z`wYsPEOW?7OAi0{_&q1Zm}6im6XdLf%e3X)&tNj|NhZiy-n*&2pT<_Wpsbn2GyWG1
C;r}=Q

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ld.script
new file mode 100644
index 0000000..5970233
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ld.script
@@ -0,0 +1,69 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x74400, LENGTH = 0xBC00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+. = 0x40400;
+buf3 = .;
+. += 0x400;
+. = 0x44000;
+buf2 = .;
+. += 0x400;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+. = 0x60400;
+buf7 = .;
+. += 0x400;
+. = 0x64000;
+buf6 = .;
+. += 0x400;
+. = 0x70400;
+buf5 = .;
+. += 0x400;
+. = 0x74000;
+buf4 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_4);
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll
new file mode 100644
index 0000000..cfa104c
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [1024 x float]
+@buf9 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_4() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %13, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %13
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf5, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
+  %11 = getelementptr float, ptr @buf4, i32 %3
+  store <16 x float> %10, ptr %11, align 4
+  %12 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+13:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.o
new file mode 100644
index 0000000000000000000000000000000000000000..6afdc2e5495148f08a97bed4dbec3859aee95823
GIT binary patch
literal 932
zcmZ`$O-mb56g|(cj1WpJ7*nv2c2z}(MC?KkiC?YIiF8vHx)>*MTEx_};|S_Tp+Q`T
z>-huSyK<3Le?jTeWrY3!{R7%_-^@5ezzg@?d)_(kzWeSwUbfyTr4XBfkwU<E570|;
zS_|GFgG4J^H$V<0CZAOiDB>!IOCshs5A6+gJ@^{<nQ3p}`a=ciH?9tTso%MB5w{bR
ziA|JR*@63@dNCAy{4?}#KI5E~gW-^i*FwHx04u8VA~^o>atR$a!F532&x#E%EBCV|
z`yp<5&r<mBe4%!JidPXI78U-oNAwBi`F4mNy91KDh3eK9jLkd{oMMnl1Nl~X{>gPD
z#q{Cs!6ugRh1QF8CMOY+$$w>jo5HJ6UgQLxlb?Cy<RgH&E-Y}2a}pDNGNV)Y4CC^#
zv?Ka>pH7iuUBy4wX8P`d4|BKayXO1VPm_DTQ{RDQtyPz{orYWYtIe(bh6{7g_k5?u
zc=%0XZU4(#S*BrnZo|=K%zD#v?UKzsYhj^ePerlJXj{v>Tgeon+n)?$WTMu!W|aMY
zPpSC?Z4$+!=cA%`lSSqswnFdL$$Vprb&K4PF;ya>L(MF6sW1Hqm!->yQ^&EH1}U!0
TJRUVutvg{9c|Tq1$=&|}aJy+@

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll
new file mode 100644
index 0000000..a653490
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll
@@ -0,0 +1,65 @@
+; ModuleID = 'air_project/square_kernel_0_core_0_4.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf4 = external local_unnamed_addr global [256 x float]
+@buf5 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #0
+
+; Function Attrs: nounwind memory(inaccessiblemem: write)
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
+
+; Function Attrs: noreturn nounwind
+define void @core_0_4() local_unnamed_addr #2 {
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %13, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %11, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf5, i20 %4
+  %6 = load <16 x float>, ptr %5, align 64
+  %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
+  %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60)
+  %10 = getelementptr float, ptr @buf4, i20 %4
+  store <16 x float> %9, ptr %10, align 64
+  %11 = add nuw nsw i32 %3, 16
+  %12 = icmp ult i32 %3, 240
+  br i1 %12, label %2, label %13, !llvm.loop !1
+
+13:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind memory(inaccessiblemem: write) }
+attributes #2 = { noreturn nounwind }
+attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll
new file mode 100644
index 0000000..520a891
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [1024 x float]
+@buf9 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_4() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %13, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %13
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf5, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
+  %11 = getelementptr float, ptr @buf4, i32 %3
+  store <16 x float> %10, ptr %11
+  %12 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+13:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.elf
new file mode 100755
index 0000000000000000000000000000000000000000..9b231c6aa9ecc51f66997deeb1a18cef3b26ae8b
GIT binary patch
literal 1600
zcma)6L2MgE6n*QM*pVu5sfdgKX<8%{P;l0@X>c!WBNrj+NaX;EgwS@qyGd5D*I~U5
zDVGKj1rCS{7jr=Br4mPrLk~GcYOkOkb3tk%xVN_+5aIpZ9oy=OC(XY9{{J)ccmD3&
zx!(HFFbrtqP#mi|oeM7jGvqoHa#%nCc}VsihsKBw8B9fq<CDK*PsgTtc3Nb1-oRa;
ziN|w=arCX%fzyv?6rL>NM3%gKWZ-i{0Sfm^7(Fs*;N<LZ%{ZRB1bkaKT*L7Vixm{_
z@7^<xO4Vd85-ghg=Vt#MuKsv_c=M0*e_bm$-&cpPq%r+0&s#;{BjfR_!#iKT(X>wZ
zHcfTPg1J9e>_q2e|Fj<J{piIh{peb`{lft+X7O}UiH~wJf1)}6FjcfPCVPwL`TTP6
z);usgz#LF6Vt$`3(KEAr3+zd9KC=Ecy*a65_{)Qra@$XSn;CugY`F1HVKJAtX0rEz
zrSr>a%aSF_jfayap#1ZLLKzFvP~ss^<k=a}3c@Wvx~f{Orm8k7X2k^DjuURE9Y5*^
zy)g5th0QqL?q6M6+6dy!LA&Dix=U+8CkS_!I-Q;FyH4P%cFpz7hF7ckj#o1qSL*M3
zzPs$zy;`+hYuDXs!}l8X^~T$24bxASGe#ay&n{5i9D5ABM*HrRev@`-JjuZ1{N?nX
zp2@dvO~qHJl??h`hJhQjkx!4OgxPd2Yob4uIH5Qs{|gcPTLhB7L$<h<v`6CKkQIkO
z^Z{9kAWGCfk(CHS^gXf?L5Loal?X!g5m|8vME_1!A_&pPV&wUq5K_;dVptQ6O6>2(
zkT!MwkVgM@H+I_O&3+uE)|ef4%kBqX`+a^7t<P4Qfz$Ebc(vyZIzB3`l@-&jW_+1!
zx2}ht2{($X71yO>eGs~K+usPnD%@V=+ot^<+Jp6Fd0c^Q_hToDVfVLF%ieK11K)10
zigdSNcby=#gD{987Hl`}MSVR>{I(NC&TZ{U!+z)?nUn<+?M=^2h4@)VQc*o=nk9$z
z|6njphQ90eXv^KGQOXSYGQ@3W+ROo8WwwxBNAejO<5!AX49Z<AYe)wrhyQVMPm>Z9
k8CceWtV_5=Tkdm>514nV1zG34li7P&>>@XmJ#}33e}uC6SpWb4

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script
new file mode 100644
index 0000000..818260c
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script
@@ -0,0 +1,66 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x74400, LENGTH = 0xBC00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+. = 0x40400;
+buf5 = .;
+. += 0x400;
+. = 0x44000;
+buf4 = .;
+. += 0x400;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+/* No tile with memory exists to the north. */
+. = 0x60000;
+. += 0x10000;
+. = 0x70400;
+buf7 = .;
+. += 0x400;
+. = 0x74000;
+buf6 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_5);
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll
new file mode 100644
index 0000000..3e15d3e
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [1024 x float]
+@buf9 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_5() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %13, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %13
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf7, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
+  %11 = getelementptr float, ptr @buf6, i32 %3
+  store <16 x float> %10, ptr %11, align 4
+  %12 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+13:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.o
new file mode 100644
index 0000000000000000000000000000000000000000..cb309eb89ecceee56448345389d9d7a76543e35e
GIT binary patch
literal 932
zcmZ`$O-mb56g|(cj1Wo`OjBqfx>C_0DQ!h4C0cC-C(=z@!NoX9rlmABI*wqwQD~4Z
z#P$3E_pV%I)n8Ctx{S~tVE;h%+&44M5b(l%_nvpoyYIgH4wkJqN-4yqV5AUm+z0fN
ze5M7H$RN?m))kOLiOGg40!5tXa7M)Z>ZY@<E{7+8pP9}&E>|nSpmDx`s($CnMO@EN
zCN@)QXNUd=)svCn-Jg-a3mNBEIT#DMcrD~B2C$;KkAj1pCrjwE39du>epYOHS-GDz
z*$;8sdyv8x3x(S85ym4vEGqn2pXd|J^PLdgbq6GO3)QPH7@K(@IKnWM2J-Fj{JrbL
z6f=N-H#V`1FSLGqWO5iGnfzzw*D1Uf<wZ_ll>Gc1Cm#X4>cIl1I43dTXEQp5&oC_?
zOZ!2e9?&UrtgHCv+DzZw_hD|eeAj%t_I`HPcj{kZSs$uPU!0~}_p7bOUekrS>wCUa
zWBmS2Vr}o!oGdS3dT!IvWz2fZbM2D-9HzCnShAl+vCL>&%Ug|P3eoNNhAA>p>ss@e
z{cfk!e1bNK;?eU_(Ywha^AKC1_v&Q6vBkRA+>kL<BBD#pOXgBv`VlTmml3CqV>1m>
VT$y=1YNlFu$SCqby3~`qZvl4{X=eZc

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll
new file mode 100644
index 0000000..bccc4ff
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll
@@ -0,0 +1,65 @@
+; ModuleID = 'air_project/square_kernel_0_core_0_5.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf6 = external local_unnamed_addr global [256 x float]
+@buf7 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #0
+
+; Function Attrs: nounwind memory(inaccessiblemem: write)
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
+
+; Function Attrs: noreturn nounwind
+define void @core_0_5() local_unnamed_addr #2 {
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %13, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %11, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf7, i20 %4
+  %6 = load <16 x float>, ptr %5, align 64
+  %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
+  %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60)
+  %10 = getelementptr float, ptr @buf6, i20 %4
+  store <16 x float> %9, ptr %10, align 64
+  %11 = add nuw nsw i32 %3, 16
+  %12 = icmp ult i32 %3, 240
+  br i1 %12, label %2, label %13, !llvm.loop !1
+
+13:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind memory(inaccessiblemem: write) }
+attributes #2 = { noreturn nounwind }
+attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
+attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll
new file mode 100644
index 0000000..d8f77fa
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll
@@ -0,0 +1,84 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [1024 x float]
+@buf9 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_5() {
+  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
+  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
+  br label %1
+
+1:                                                ; preds = %13, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %13
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf7, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
+  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
+  %11 = getelementptr float, ptr @buf6, i32 %3
+  store <16 x float> %10, ptr %11
+  %12 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+13:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
+
+; Unknown intrinsic
+declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_design.bif b/examples/elementwise_arith/air_project/square_kernel_0_design.bif
new file mode 100644
index 0000000..6e94022
--- /dev/null
+++ b/examples/elementwise_arith/air_project/square_kernel_0_design.bif
@@ -0,0 +1,10 @@
+all:
+{
+  id_code = 0x14ca8093
+  extended_id_code = 0x01
+  image
+  {
+    name=aie_image, id=0x1c000000
+    { type=cdo file=air_project/square_kernel_0_aie_cdo_elfs.bin file=air_project/square_kernel_0_aie_cdo_init.bin file=air_project/square_kernel_0_aie_cdo_enable.bin }
+  }
+}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin
new file mode 100644
index 0000000000000000000000000000000000000000..97e175b81722f66b4d8fb9099cc8f7fbe11452c6
GIT binary patch
literal 2288
zcmcJP!41MN3`Lz%A#QNszyXOB7@?zu%{)TIB#gjM>*D?kas#o1VyE%<&jTu{-yf35
zAR@1W2+}#mB=?e?toB8bc2;%|B-1(DSe73B-{SoA=NBCORAcIXoI82@=$VP#V&<7<
z??-IhM6Sy|)_!>=l8dK|&wPETrL^|wYz7?iP^8=03@%Wz5F74Mv_<AJ8`EgUCc~A7
zB8_<Tj*?|ALu|N5p%9tNY)qpWo8rntkw!dvN6D1S5F73(Rb(!+F^y(y{GSGU(=8(&
z%h^@%8z97{ZZ5Mijb>~Hx|ia@+u5*2@O@J?m)Y3dW^8hm$JK8JDQm>oa8L9*VK%1G
UjE(+*4Qs^Ma8Df@)A$FQHy}r3Z2$lO

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0.pdi b/examples/elementwise_arith/air_project/sub_kernel_0.pdi
new file mode 100644
index 0000000000000000000000000000000000000000..cad10284470b3236f231f480e9d2396b90ba8f55
GIT binary patch
literal 7792
zcmeHM&ubiY6n`^2Nw&IeH#L^ANpac)2_=V3H$n=5&UT}$5^-xP1UxhqYAq;)UfN@n
zfJH<CBBT{D7ePENJ$Q(hmG&ee`wtjWe;mBc_S8c)e%|-TeBVs6wb5JY7dHEOpZC7+
z`+m)4cJ>z{nynrmyngkaH-Gy6+Z#jydEmzI;`b@Mczw3~<B&%B?<=#-W>k)-B1waJ
z-+x)xPW*6_&y)KvT)uez^7|JqU33md9L%CG#!nq>3=qAH7r)E3ZPq!E{Y1VzL)!fx
z-1&3+uK*6}v#;X>q8ehjdaJcPh{JrZKopIpxv^Z{JjQ1F1v$*gl#`1>9`tku?7Ppq
zV!}R7fk)q<qqMcp72pL_?AKQd5hW9Jm#9o9+aA5;MMPWM-<7G=@d()SH_BAG3x6LD
zY%F;1Hez@#Ze07?i_4SPsaIG$b6BH^a<ISu@3q%H9b9YvJotNU;KKFEwP%UeKDS9W
zP6vn9wwI31#_V-A`W1!8`SAGOl@gz?<AJ<#<$#boweamv`n}^uua9GW%jf+<J``nS
z_(Pm3Rz8~z^Ll-L+pC)M5O2Tt6j3<6j=WZg#(7sbO(sj&$9zqwdcpI?64XAQvG(Qd
zgZ@%-tI+!B{@SGn0~3A_7fgQGWBi9)Z?nQvcU)5_ym5OVq;aFY;}PB8)^lR#g17OU
z*txK>oeL}5xv(-tRoeo!zq~oAXDs!Xl_{q$3OPGpC!fsMV=-U4|8`zN({C%=d9bpb
z2P;!l&&^jYH(yzqHnVb3$l3Wi^JKmri}}+1w(}C2ep}hjgO%+(Sec^v+<cwQ%~w{Y
z&8%D$a(2Gncrss)#eC_0+j$91zpZTN!OC_XSmt@K_hRzii|@@?=2h^3l4B_@3mget
z5jYmODe$7eO9C$oyejazz%7B>0(S)75;!qf=U2^KGki~w&2LiV=WvubToE`HxGC_W
zz)J!z3%n}uy1*@g+X8n4-V!)5Sm#&I<)>xWznaL;;V5yqB5*8lQ{Y8`mjqrGcvawa
zfm;H%1?~vEC2(S}&Tl@KpO#&IXGMMvM~TA~fn$N20xt@@B=EAps{*eJ+!DAga7W-R
zffIqtG~)AjC3UR<+q}T@-L~ajjx-Voo}sd7V&*-AV{t811Rn{$EO-)p6SL*~WUm3s
zM`9Ms!kz@*L=UZfEP7iJd?fg?;7RaJ)XMqE_SEu`tf#Oi!w>T3m|6DwVxL#kOb0$a
zGJ>_jH^S3Fzuaf-BV`{uLxX-h9~{`tKGyzKpX8=Lz6uU=&xi1t&5LKfY_QEMfXrB3
z2RU5{GH1m){#@t9wmb8y?J+;LkJ(*j_0CWEe@K4NW9j@f_8@ca0q%zw>-@61lD#)p
z`8nOr{3iF9AKS+`HM@G}r~E%8Kj^V^ej2;{^tGh(%j!z@yQRv{>2~IKY>)Y|eXL@V
ze|qPq{68c==&^Ku8hem+|MX{$&M&Jg*=IB=Kc{o~;bDVXM*{UhM^-kpd5a&^Rt9lk
zA3vPHiU{_QiH`7}9jmq-e<1CtVC=n*)vo-8ynSr_`zP|IO3vFy)_>t<Pybx2seZ(P
z#kHy!%o|i4Av0EW%;`#y@$5wDiaFg7WcwLV&gn)VKWFxZLbpr(dA_iE)PK8{ucx-}
zQO`JMkA7);fA@avurC<<9`(%IqhIQ=6j6_UJ&0FK{W(cne}!%RXZnwjt?@K4FRPn{
zY<m;ubaRkxjT<>#0NM72gl?Dm`+Kavwtw{cYx_s9zqWt$`s0I~EB3GTSO0{~{3~p0
zjQabc{uv$Wubxr!KGZ*>L;ZbG|BMdx_n9!)R7N+G6sV7Andf4ti|5e>7s0<o)MlPE
zukpPCz34rEQ1LX(SBly@?hU7VE8wjk5_}9Ec`Exr@Dt#1isFlcp9YVJ6kihjR8sIo
z%$$$j3tq&`d|SnNui%TAeca$E`^Xzk_af#LBjyw%X0ExiN6aaYn7O9<T)`t|U&Nf+
zmy<$($GXZs?iP4T^j`2HX6D-_=D@~0M#HH+VsjY$LE~pe@B#P(hMyLE5&V9`PYJ#R
zK8;QAK{Cp}i>Nrm?kGYt@q<tMF8Bs`*B56$3qGy6urGqgBX#9}LGW|n)0zuDNUFk5
zHH~Kqyo<;2GvN0b|6^Up7s2C433cu{!8gD^WB572&w+RGI6pyB7x~sjJaxiDedT|o
z>-Zw<U7U_@fOqjYeh$1l*YOMB)4mJ;K{7AmnNQ;>f_LXSeh9qt@AyhG>WjXoGdvC6
z`58_58StX-DL(~X^gZR9;GZ`2ndmw{3*hksjjCHi@DoW@#8XZEjDdG`c0A@m#N+rT
zcz3SjBk<|`cO5^G)P<jVdTt2b`Efk@;-}YL*YP9CXizeFRl1|;^&5g0`KJ63c<0C2
zN8nxGC0`Z(tHMv!UDsX5BYxq(uI$zSh6K}qKnaYIeJKZt7gV4|tuoKdu#4wN?=inK
QIa&WM;AH(Amu0*D0W-OP2mk;8

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_elfs.bin
new file mode 100644
index 0000000000000000000000000000000000000000..cbef72b4fafd5ab9b209a73b1144c433a2ac7d6e
GIT binary patch
literal 2592
zcmeH_K}#D!6vzLwS+lFyb|WZrP}rtWJmk<Q1$*ko<PZe0wHKkM_R=1D2s!nvLVFNz
zh3%oxOQFZmgF;S%o+Y0k#NY?W(SsP@WM<sg?0$lffrZTPy?JjY@8<%j)^-2~IH2$4
zX=_*zfG1pv&<dY{wxeZP(0!`XN1tUfgCT?Yj1`2gJR_Q>LlLgPU!aEGtY;YTd>QC(
z<w6WfxB>#Koe8`bA<!G11ZZ^x=(lj%0bX2D$$hWACqCCBV8(BM{Uf5FOegh{{^pYY
zRSMik`MY@gxDda&EL>N-58uo2A`t(u53av<7vf=Kr535GwXmOu>Q5EFH0M=+eIME%
zHXpNnry@Td<weoNbvqSrIqE#yFNFW?sUF>PbXV0|C632+|1+^}`q6#!Vh)fSzv*3v
zz^XbWH(*{#`4K>~H2LiYg&ox+{&oJuX$=15THkKsPq$vlaig4hADuDzQvKg%$yXQ2
zP#}MvdlIWAdLe+DVd9N-L>(P>gfr_1XVwwU5U$&?Jk$%i<&lHR#iU1?AtTRcEb(t`
z>fglpr#;{fNi%cKtQ(wJH#kFB&G4_1;a{4en`S;^iGQ0@|0c#iJ#+4mG&ASSy1|)s
qgENHhGW>g+;a{4en`S;^iGRCO|0c#iJ#+4mG&ASSy1|)sL$P~=DwMGR

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_enable.bin
new file mode 100644
index 0000000000000000000000000000000000000000..7cc1818bce4d6ce1226fc5fda519967a4842b99e
GIT binary patch
literal 104
zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>))
B1+xGE

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_init.bin
new file mode 100644
index 0000000000000000000000000000000000000000..d4549ba181167c6e6d7475f23335e34fea2c3376
GIT binary patch
literal 6032
zcmb7{&rak<5XQ@%hINw3u1Uy>EJ*QcI7AT#M?)kINHH2BaX`vM<iIUF0*^qXJ?)8e
zF5wYI$_vaRaLt7vMY;Fvc_s37+iiD!;}I|tf4=&CUGA#tw!nzU<oIWiROFvEk-z_%
zJ!~W*FA@WSPcd@He{sy=EE%{TBKEES5p+(vAYGEKNKZ*$kiH~+Mf#fb4e1%_Iq6%{
z_oQv3yPk}D`c%hUV?tlVeouYUx*%SXu1HTwUy!~eeMS13^bP45={f0J()Xloqq{y$
zdiqqybf1o>Pg)mN>ymUudP@3&^d;#l($}PKNY6;mN#BybCv6+u_35mqPjyW9=@;sg
z)`iu&Bwdl7lD;5)N&1TPHR&7DGtzU?x1{e$Thh7gr0z}C-Y@95Kj1=K_e*vRasT2A
z`KGw%NtC~$_drR!Af6K!;#1rYonOBXO5z3Gubj9LpW<BU{LA+y<^`S0oYo8RDdrKc
zuV_9c@q&0xTpGTS_%ZrfttFZNDYrPwi8P`Px$pNsn)N<^;r&%-Y<~OqH}kU3U)A~j
z{NqLc{b^o@^RRhW-=}a-^G3%#Fo-!FJksMCBIYU#o-Fco^&UO5Ugxjkt^40s*X#2?
zQZGFCgnj0BwN4NVJu!$m9X!(G86u|Z?ecW>;_N?bzRq9a$@;UpUZ4Mwdf~w*)RN!T
zTGw0O+2D~L&k!+PZ<nX5_w6(5b^gllFaEZ=UZ4Mwdf~w*)RN!TIzg<PZ~YDo9_jH6
z5!3Z{dDfCo{WC<~82>Vspqp^>qxyLCZUYYv{=)g)M&yk)HtOi1`aSl`pz$Z~{VHC>
z{wm~8zhkS{>@Pz8!5=I0ho0@PV=wrGo|TPu3xmfX=5+8#k7tP3R^-WgJe!Ecz0Z3*
zJBWQ9<=Kinq2IpGzOohbc{t*JyYq{${(e{F$2@C)+CR@F|Df@&=r{W@PpQw>ig|vl
z-i6*{KG=BId(6k{cs_@S#l1i1@f;%-&#>z8oFW$Y{-no)v!Lz|&O*DNXg+CuAD%p)
z+W++V)c&W>r}jU6KIt>&<BoKDH!vTsX+9~<XUT*4q%@x;59X87e3m?zPwIlvU0w1V
z+JWF-=&NwXHW&VX<5p+jKZ(o(mzsC_!$5vWzIoMh*>-QrcKx@>wrfeNOYm3=@d_NZ
zw|+x>4;)3bJR^PpjtW{nB))G4Db?$G()v+vO7%KEZ|nOoNU7cwf84d}i)35nL%m9)
zUZqj5n{~S$^(u~f-3(ifdKE{#Db=fZZU-ih)>n&xd&hVAz^Pux=TW^Tu6I|q)q2#U
zHTbJ2{~>V${xaeR#53^EBfd|32(IcOZtO_bqxD;h5z6W6t-#g!Bz^*}&xc-r46f!)
z>of4z(euuUpMtA-6E}8D`NwMi``~*2njeCH5<P!+p?L=WRm6WKegghw#7~Kzg6sY3
z{KihGe-qmOL|#PscNUswSg-5T`~+O@U-MINy+6&*!PWVs=No%Q`#)3r&%pKmG~Wc*
z&)2-Pqm<64x~~V|I{!%VLvT8uitmHd`BZ!g{#n%jy@k$y4*o^NPl)f?G3|e>^6!G{
z`D%{yOZ(S+3a<C3c>%8O|3dRUJE8m&wZAQJonLdDk5s*%7Mkzakr_t)DHkL4{@Vhl
z{wcl*uJh~l1-L$+%*XWnG36iY_v1oy)KAZ!`1S4I2o|Ro1qz~3EajN7i78>jEaZQL
C^4EL-

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.elf
new file mode 100755
index 0000000000000000000000000000000000000000..fbf768003c7ae9be77deae967dc2a77b06391f68
GIT binary patch
literal 1656
zcma)6PiPcp6o0eX?xv;HHDX!N!iR!HiJO`1+GM?S5;hX*2GWBP>R~$nk__3MU1ny}
zSPw2mYC-9(ussyK2;wOe6v-)&crBs#LK0fMct~#^O4Hw)Kf9xN@CV<#_j~Vqe`enI
z&7Il$RY{U4L57C=tODaV9}o>eR!C5WMk!BO;@w@W1Y0oh!d5C^ef4)>CBC%GrAe%3
zC3;L$qrDIFeed}*L<e6E2|663eNH)n>^CJrw0HEZKpVRfL6<U{1(Nqs_xt?jqO>xn
zqJ!bh^+%FgP!JiRI6U)dq>UA_pXC1Q&Hr?wxA6Okzozs0k4o<pQSayQg|KPvi|k16
zsQp<@<*YSvfQC^Y85`cmTtDgu{H}K_#doKR#)F)6Hugv5*uRnC_!J)zZ*b&8pJA_D
zn>W7gkLqtp<+vXD%}E~-6{jBJz3vi50u-kxeoO3E2{Q`67cy_Q;m&Z4-mlxoGws9Q
zhBofK?k)eDAIoIbq4@m}6X!2RBjlu1tnLhkLd9RU^F<ov0RuY{(d#3ky5p|e-W5@=
z*Mw3n$t9V{Z0PQ?ShGFfX}PggjIIPh+rKhCzU%}m9iwEnn&XR3!*SQg8;!N*CEc-w
zQ8q2PYL&~jZk6Tg=anhTHYcr$RaT6$Q8AUOZB;8v)eBJ%=_lIhBTN4rodjFxTO>LS
z{b7oqgI?$dNj!%>IG)e`Ldrh_F0f?SMsQ%RL8ol=c?y(^YN-Q$o&DVb|C%`<usGge
zg4@L4_}h?NkH^LSA0P!5gZX_(!2~FP_aFrmfcX<h!31Fb6jCq&n7@D&OaSJ8Knf-R
z^FLY1MmLoEIbg01_)F&90jE-6uLs0qY}zgq|5h{54agNg@FJ}*b#qnoo$qWP=cRsQ
zzUJr++YIJgR;OW8sXj9!Yf3C9Ahr6eYsqAKfl@L}7?wJ&sTuaN<0@pfJX@2s2{JlM
z;X`3fCQb7L-3v(b+mWWN>5Y!9)#jO+tE4q`$JHFy35Ye)%%J7@Nfg^P-ShNYi6wH|
zu0>%`><qb;j*JM~(Z^+eRQOR2$EosXdml*H#4Gy3(f@Ue4Kd<t7oHe4<C=v~GeqCU
zI*TL^?}w|8=OA4;`5(=v4u1&D9d{gy|L+N=;FwGZWYWS}=;?he?0fZ&eHY`HZtS~>
Lf?PAnlgR%8Kh^|F

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script
new file mode 100644
index 0000000..fc4f0cf
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script
@@ -0,0 +1,72 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+/* No tile with memory exists to the south. */
+. = 0x40000;
+. += 0x10000;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+. = 0x60400;
+buf5 = .;
+. += 0x400;
+. = 0x64000;
+buf4 = .;
+. += 0x400;
+. = 0x68000;
+buf3 = .;
+. += 0x400;
+. = 0x70400;
+buf2 = .;
+. += 0x400;
+. = 0x74000;
+buf1 = .;
+. += 0x400;
+. = 0x78000;
+buf0 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_2);
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll
new file mode 100644
index 0000000..906e39c
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_2() {
+  br label %1
+
+1:                                                ; preds = %22, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %22
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf2, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = getelementptr float, ptr @buf1, i32 %3
+  %9 = load <16 x float>, ptr %8, align 4
+  %10 = bitcast <16 x float> %7 to <8 x i64>
+  %11 = bitcast <16 x float> %9 to <8 x i64>
+  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = bitcast <32 x i64> %12 to <64 x float>
+  %15 = bitcast <32 x i64> %13 to <64 x float>
+  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
+  %17 = bitcast <64 x float> %16 to <32 x i64>
+  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %19 = bitcast <8 x i64> %18 to <16 x float>
+  %20 = getelementptr float, ptr @buf0, i32 %3
+  store <16 x float> %19, ptr %20, align 4
+  %21 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+22:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.o
new file mode 100644
index 0000000000000000000000000000000000000000..c01f0bbb18a2b8aef10a542b3c15d504b366c5fe
GIT binary patch
literal 984
zcmaJ<O=}ZT6g@9ZlOR>nfSQGt;;NrR#&%IRn#NWmOhg2w)J2#i(<+@dWjcZCMv*RD
zh`14E>8AUxTx8W>ApV33{($xm7|(t4l1#Jk!kv50J9qBA@6NnmcOEIF&^Lu#f&r%~
zV2t9jPMAR!!(2J~4dhWGc%`O+v)+$9PRNK=s!K((KW4id>UFgYjOV(C$EuvS&lh+u
zR?Jmy#AqR(zMA<P)xJzd&wft+S<Ski?C3fWee1WZK1dg$-u~@1oUtsQ8Qq^1{a`}+
zvnuByKMJlT_^;Js<8wycF?!ri^plk2CwO7{5@$Wmsb`hmk~pfz{Z7@A>Bo69>I(0*
z@}2iOGR2E4Ft;=ZgK%?zfHW?8b#hr}Y-4y%@WC+o5<c;Hf<G}>7G0ozZ=lJC0X9eY
z_6Qd`%RP!o{$f@`++vu|#W-Y`lb?Y0jy{*x5L4`__<v}#Lhmqywbu?k>+#bUiwB|G
z+=1h~tgXFqeXkkT+O3Z7!8!<o&}}fj-IdtrY~6(wc)qJESj~3e)k}4rIhBe;CfJ79
zLP{icXML|VT$35$e_@uQxq?osa_&yz1zts)L~*kcl=bQGAv+R#NFUA9Kf6}rT?)KU
smvxot+Ss>BAoXP)(I)7!_Wlfn*iVv#W)ri-z$EW0nWT@?rJl_A4}x%RasU7T

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll
new file mode 100644
index 0000000..1f9925e
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll
@@ -0,0 +1,64 @@
+; ModuleID = 'air_project/sub_kernel_0_core_0_2.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf0 = external local_unnamed_addr global [256 x float]
+@buf1 = external local_unnamed_addr global [256 x float]
+@buf2 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #0
+
+; Function Attrs: noreturn nounwind
+define void @core_0_2() local_unnamed_addr #1 {
+  br label %1
+
+1:                                                ; preds = %19, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %17, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf2, i20 %4
+  %6 = load <8 x i64>, ptr %5, align 64
+  %7 = getelementptr float, ptr @buf1, i20 %4
+  %8 = load <8 x i64>, ptr %7, align 64
+  %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %11 = bitcast <32 x i64> %9 to <64 x float>
+  %12 = bitcast <32 x i64> %10 to <64 x float>
+  %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60)
+  %14 = bitcast <64 x float> %13 to <32 x i64>
+  %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %16 = getelementptr float, ptr @buf0, i20 %4
+  store <8 x i64> %15, ptr %16, align 64
+  %17 = add nuw nsw i32 %3, 16
+  %18 = icmp ult i32 %3, 240
+  br i1 %18, label %2, label %19, !llvm.loop !1
+
+19:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 53, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { noreturn nounwind }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll
new file mode 100644
index 0000000..d91a003
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_2() {
+  br label %1
+
+1:                                                ; preds = %22, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %22
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf2, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = getelementptr float, ptr @buf1, i32 %3
+  %9 = load <16 x float>, ptr %8
+  %10 = bitcast <16 x float> %7 to <8 x i64>
+  %11 = bitcast <16 x float> %9 to <8 x i64>
+  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = bitcast <32 x i64> %12 to <64 x float>
+  %15 = bitcast <32 x i64> %13 to <64 x float>
+  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
+  %17 = bitcast <64 x float> %16 to <32 x i64>
+  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %19 = bitcast <8 x i64> %18 to <16 x float>
+  %20 = getelementptr float, ptr @buf0, i32 %3
+  store <16 x float> %19, ptr %20
+  %21 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+22:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.elf
new file mode 100755
index 0000000000000000000000000000000000000000..e4e226b260243a140d2d00b735618366285dcca0
GIT binary patch
literal 1720
zcma)7O^6$17=9-klU4jlS1bV)J}Ss6Yj$Rnb~n8^T~aBUh4rA6_ApFlW;eqolQ5aA
zTRj-0sGxWgq9E=?5KoJs$etEjuTs1V?V@<;p}j~!aebcoNunno_`dgj-k<M%=li~y
zdG})T6;0D5Lr&%=tOn!$Ln3pKHBpk2MJY;N7(K$sa0CMfLZyW9`frh&`DSHdmZW-K
zldnY@a{J-p#QXd+BFC@JDY>^KcevyVR==t#k=rNVm*n7<CeXFqVM+8msQX^=@Uqsv
zWTJ!l!@aLGvt%GML3w`tbfPVl@{bk%9&dbdYP|W=sb4P??GKIdQzGL};|D^}rI+&y
z<CFfg4U?-j)G-=HeXQ6CA7%ZdAMoqp<1_r$g>viTf_5hLC*|lr$T5G0FQ|WUtcO11
zURj%g|Me&JkF;u95B(OjM?}hN`*^PFf+GRSYmz=C;!VLB#ruVo5BqTESY!OfjVE&b
zd*99-eDGkr{a0}*mpA9q=R;1qemOZpE-K~v&FP^?`Li3vvMlm|ft#AhgA<XaAMAMH
zdDUz-lu@tf6<x&X*ul2i^}>PQ3sS3EY)4UlaDI7t+mG7AR>kRcmoNJrKiFIDbauNh
z*}kV*RmauqZnf&!ZdI?JtF5`7v+CB|s?n;pYK~F&+<I-R{zB5j?2L9M$jkkcC&4x+
z7EPXpzCXjCgI<~>X*`Dsn9sYvIpbdkR~R~M3%D?^L6>~8JtZk5wah7hgZ|N!Um*t+
z2J>4ac$*03zXQp7JTCg*hg28@`A3k71So-zAr%Qg{tcue0m#3DR3rfTkC2K4ApaRs
zkpSesP|4%o0PN?OiaF)KlaHqSE_ptAqZxmn+??`1$VXEymB`zRgvZ|X0w{xP-N<f1
zwg*v|XcK8WJJ!H|+Z*71HQ(H5_;$y0qK%$A?08aXuCMEsk;)ZFt9dbSb#cPTs5lM`
zTf@MyTHdxF7~=Fo&(f``w1!(}In_8-#j*yG9Y$ge`iW-k+MS_iH8x0{9kIH$A6S0i
zM?y_3C+dZREQ@x_4nzA|W=Y&$;7Xj7I%Dq6W+sGo^s&Op;G}a}<Fv}@Tznf$?&AM~
zhA56?$FcR$k7!7dzP9m=qZzMh3^OP4R;p8E>o6X!34Q=MfRo>N-df%S#^8Gq{~sV?
nwBcCI2xQj68R-0vWcU(se`W+Si+eHEN2%{~6lBfJpUHm!Ajl1>

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script
new file mode 100644
index 0000000..6120a88
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script
@@ -0,0 +1,78 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+. = 0x40400;
+buf2 = .;
+. += 0x400;
+. = 0x44000;
+buf1 = .;
+. += 0x400;
+. = 0x48000;
+buf0 = .;
+. += 0x400;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+. = 0x60400;
+buf8 = .;
+. += 0x400;
+. = 0x64000;
+buf7 = .;
+. += 0x400;
+. = 0x68000;
+buf6 = .;
+. += 0x400;
+. = 0x70400;
+buf5 = .;
+. += 0x400;
+. = 0x74000;
+buf4 = .;
+. += 0x400;
+. = 0x78000;
+buf3 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_3);
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll
new file mode 100644
index 0000000..ba863ab
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_3() {
+  br label %1
+
+1:                                                ; preds = %22, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %22
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf5, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = getelementptr float, ptr @buf4, i32 %3
+  %9 = load <16 x float>, ptr %8, align 4
+  %10 = bitcast <16 x float> %7 to <8 x i64>
+  %11 = bitcast <16 x float> %9 to <8 x i64>
+  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = bitcast <32 x i64> %12 to <64 x float>
+  %15 = bitcast <32 x i64> %13 to <64 x float>
+  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
+  %17 = bitcast <64 x float> %16 to <32 x i64>
+  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %19 = bitcast <8 x i64> %18 to <16 x float>
+  %20 = getelementptr float, ptr @buf3, i32 %3
+  store <16 x float> %19, ptr %20, align 4
+  %21 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+22:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.o
new file mode 100644
index 0000000000000000000000000000000000000000..d5d447390e239adbd0677385ea8f16d9e12c9d10
GIT binary patch
literal 984
zcmaJ<&ubG=5T2K&NsuaOK%0X~@zkG7Vk@*4O=GJ$Y($Wjf<o9NTUuP3vfV)2iy}RE
z5b+}Hv6tR^@{m*i0`X6%;2%)`0n_h$`;u&P=)k=BzM0vXd9&~1n*Bm4B|TA+i7}D$
zgvbbji#lOS(lW@U>yJoQ3IK1_q{v10XI9R@fK{S{2(v$@JDX~|QW6=>bPi5cDO<dp
zgP)Sml<!4oE}Oia`X5%mjfby)jsIOqJ70?7U6JrduVv*~G8cCD9<0g*%HmAv{;25r
zW9*Nrm<RmWzZ2uXSMv3*DfP(2qjrp+C72)MxyfrB^)RQNmA#ob9FO{)s|C{!^QP1-
z+-vCs_c}Jk^GjlGX&iks(MKXl`PXfNbDfcl$UA^fhVf5$z?U)p#$YZwhxl?o#vdc{
zbcjD6;-u5qBZc|%X$|E*%xosY0n7{^0ptielhjZiqNl?9&|(GdK_J#{D{!rsuh!@H
z1E=v`Y<r`+y5o3mBdE5TZO;{JKkx&m4tw~4w%*=)!ef!gf>^%mIl7kBX!&lfP=n7d
zmkYJW#ui~~_S$Z9uqHFYO=cQFa|NAN!QAcG2e^tBTB=!zl(ctna7UCXbU05xb{&m&
uNmzk$T_va%^resWIS*rFP_Df<1EB21Ni0?bMv1;K?>iXNN1?388Lt3R{cdRh

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll
new file mode 100644
index 0000000..ddb3226
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll
@@ -0,0 +1,64 @@
+; ModuleID = 'air_project/sub_kernel_0_core_0_3.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf3 = external local_unnamed_addr global [256 x float]
+@buf4 = external local_unnamed_addr global [256 x float]
+@buf5 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #0
+
+; Function Attrs: noreturn nounwind
+define void @core_0_3() local_unnamed_addr #1 {
+  br label %1
+
+1:                                                ; preds = %19, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %17, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf5, i20 %4
+  %6 = load <8 x i64>, ptr %5, align 64
+  %7 = getelementptr float, ptr @buf4, i20 %4
+  %8 = load <8 x i64>, ptr %7, align 64
+  %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %11 = bitcast <32 x i64> %9 to <64 x float>
+  %12 = bitcast <32 x i64> %10 to <64 x float>
+  %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60)
+  %14 = bitcast <64 x float> %13 to <32 x i64>
+  %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %16 = getelementptr float, ptr @buf3, i20 %4
+  store <8 x i64> %15, ptr %16, align 64
+  %17 = add nuw nsw i32 %3, 16
+  %18 = icmp ult i32 %3, 240
+  br i1 %18, label %2, label %19, !llvm.loop !1
+
+19:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 53, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { noreturn nounwind }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll
new file mode 100644
index 0000000..8b8d6a6
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_3() {
+  br label %1
+
+1:                                                ; preds = %22, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %22
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf5, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = getelementptr float, ptr @buf4, i32 %3
+  %9 = load <16 x float>, ptr %8
+  %10 = bitcast <16 x float> %7 to <8 x i64>
+  %11 = bitcast <16 x float> %9 to <8 x i64>
+  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = bitcast <32 x i64> %12 to <64 x float>
+  %15 = bitcast <32 x i64> %13 to <64 x float>
+  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
+  %17 = bitcast <64 x float> %16 to <32 x i64>
+  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %19 = bitcast <8 x i64> %18 to <16 x float>
+  %20 = getelementptr float, ptr @buf3, i32 %3
+  store <16 x float> %19, ptr %20
+  %21 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+22:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.elf
new file mode 100755
index 0000000000000000000000000000000000000000..cfc7551935cf7c8534f551f4b481c91b57b4768e
GIT binary patch
literal 1724
zcma)7O=uKn7=C9n-Hp<pO+;Bx;Uk5TO5Du2tI2xl7}gTn4WtK4(8Dl)$qw0_U1nxu
z)PswNEhyfEEwt1_3#F$}Xel`bwAUiu3mOzJ9#T&^sPTE{XLt4F1K;<)&-?Se?|k1k
zGgr^nzLX@1Qe<ef&&n|V@&VBZWSInIXp(Xy6GwM3Qf$G%1EG}1_~vP7r@mpC9VUq`
zOY|F2l^*TO^}U}RA=>_GM9|I@J?14(VD$@<AbPa-SDrQ=NCaKVZ01RMjJnryn@duC
zUPA|?o9n+yT3$tFjKb*Lp;(*B%O7Q5_ZDuC^%nmfdvYdce6RKn67}vx4}_rkFXZvw
zUi)ZO<5jC-8x5mAR&4kma{ag;@az7^L;S&+f_W<|O(g!f9Q!vioIk|J#akTfq0gvS
zuFZk(`s4arQYop2ezVekqQdN5JlA!?kpP8RN}dw&8sUth{ldzdZMZXBqxaLjPcrSD
zKSnmb+wHBq%uQuvZ6tX<<Rt4C;v?imsZhB;I20=Ua4%P&Nj_lUzC^UUM^tmYRmVRq
zYPG6RD@CQK5LpewTM=uHAGj?qv5Luh7`B7c)6*+%SnrrctJR!davQF<KHX@nHIEyv
zBg~RzD;2v`atynqR8Ey=ZO59i%XUdMOJ><pD~??$FIPT~dl;V4P9HM;zjpv^v2T&+
zF!Z}a{8Q-pev-uJ&<E%9-CrE?&w&dJ1-5Zqn6IIe9B)savT-eKz|XP2JK*P;0|JBd
zO(uAo7@U6vlI!ttvHu37z+f=H2`QKWdGH>jU;;4z9a1m>nEweWm;lWGffP&t<_{qS
z6M*@DtjM@G0Qa-aiZ<ZSn0E*KIdeIFqdEQsb8WzPnRf@AilMU}5+8fh@t_2knxSDr
z)`QTGwZ1g0RXuPoI|1%j?c0T_Ycw1yTxi*yhC{{L+?=ATi97+R*UoyjLY5z@MazO=
zx#L;7>8!Y(N><BvbVV<d*;$_DX@;j$r0YRw_#x>*JJ$3yqtS8n>H<@1mGq|Jdb;bm
zA+bif6}J2!&0@Q5_`Y!|wZv}6vnfhSoDsK&Gh@Pb^l_4>sJ9Ypsv6A-&z0ytXuOZ_
z2?=pL;$6q~L*HUUg5=eWZXKKPsz%T<L>CjCMY;~h!_~*%ARBP<cb;!9-v*AsHzWE-
oKt!m+F_RL=w1o-i{4YuIGvZ#Q1Tu{~mgwEY_ah2&&D5XDH`iMZ9{>OV

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script
new file mode 100644
index 0000000..ddda3c2
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script
@@ -0,0 +1,78 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+. = 0x40400;
+buf5 = .;
+. += 0x400;
+. = 0x44000;
+buf4 = .;
+. += 0x400;
+. = 0x48000;
+buf3 = .;
+. += 0x400;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+. = 0x60400;
+buf11 = .;
+. += 0x400;
+. = 0x64000;
+buf10 = .;
+. += 0x400;
+. = 0x68000;
+buf9 = .;
+. += 0x400;
+. = 0x70400;
+buf8 = .;
+. += 0x400;
+. = 0x74000;
+buf7 = .;
+. += 0x400;
+. = 0x78000;
+buf6 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_4);
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll
new file mode 100644
index 0000000..54f47e7
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_4() {
+  br label %1
+
+1:                                                ; preds = %22, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %22
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf8, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = getelementptr float, ptr @buf7, i32 %3
+  %9 = load <16 x float>, ptr %8, align 4
+  %10 = bitcast <16 x float> %7 to <8 x i64>
+  %11 = bitcast <16 x float> %9 to <8 x i64>
+  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = bitcast <32 x i64> %12 to <64 x float>
+  %15 = bitcast <32 x i64> %13 to <64 x float>
+  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
+  %17 = bitcast <64 x float> %16 to <32 x i64>
+  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %19 = bitcast <8 x i64> %18 to <16 x float>
+  %20 = getelementptr float, ptr @buf6, i32 %3
+  store <16 x float> %19, ptr %20, align 4
+  %21 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+22:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.o
new file mode 100644
index 0000000000000000000000000000000000000000..01ddce691877b0d149c944b2a2bff263cbb20063
GIT binary patch
literal 984
zcmaJ<O=}ZT6g@9ZlOR>nfSQF$an;Wuu|?yirm?MAOr%hhf<l-iQ<RQPnNFa(sYn+t
zMBE6obmP7&7g_Ze6n{bme?a>OjOV_2Nv2u!!hQFgckkS<d53HEx>5>#QOLv?aFPJV
z=v>kP(@0|&D_4JjED8j#)D&>m`<2Bp8L>)q>5=G9>F$<#Q!NAIneM?)Rn8XA=Xo!Z
z&s1(jXg-^~p86NozD|Z4r;~qI($42%coPV}_e)lvBy(YJ@9rwjn3iWs*Jno0pOE^@
z%6`a?{2MX;dnMoal2Z4LKPo5w<AlV=cy8(vXFm3+dzIP}IO>n;ov1}qkNu|9HO^J~
z!MTo1@_Y$qN^>zVw+9GF;-c3gmvu%mhSvli4U<3N5ueBSropo49Q}_6KKU`g@(6!A
z!iCOok0KJkkk$})Xy!5z4rpfO5l}u*XOkM@9&0MT4=q;U9t5y<+ktC6d;Vf!KX95m
zu<e(%)who4HiKHb)$v?d`+*-g4chnb#5OwHB{?3+@er2pdXCOzHQT;hFVvS{*_BG6
z{=nEGZQWkmZ4K9CTDZ#0&}l}{VO93siG9FPw1^coD?wS`4iDLpNR2ugryslaM!OVP
trOLX>R4uG4camTF5#Iz=*52=d5PNYF^A(AiVjvRtjZDJFsgh56{10FHZgT(t

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll
new file mode 100644
index 0000000..de0f954
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll
@@ -0,0 +1,64 @@
+; ModuleID = 'air_project/sub_kernel_0_core_0_4.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf6 = external local_unnamed_addr global [256 x float]
+@buf7 = external local_unnamed_addr global [256 x float]
+@buf8 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #0
+
+; Function Attrs: noreturn nounwind
+define void @core_0_4() local_unnamed_addr #1 {
+  br label %1
+
+1:                                                ; preds = %19, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %17, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf8, i20 %4
+  %6 = load <8 x i64>, ptr %5, align 64
+  %7 = getelementptr float, ptr @buf7, i20 %4
+  %8 = load <8 x i64>, ptr %7, align 64
+  %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %11 = bitcast <32 x i64> %9 to <64 x float>
+  %12 = bitcast <32 x i64> %10 to <64 x float>
+  %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60)
+  %14 = bitcast <64 x float> %13 to <32 x i64>
+  %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %16 = getelementptr float, ptr @buf6, i20 %4
+  store <8 x i64> %15, ptr %16, align 64
+  %17 = add nuw nsw i32 %3, 16
+  %18 = icmp ult i32 %3, 240
+  br i1 %18, label %2, label %19, !llvm.loop !1
+
+19:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 53, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { noreturn nounwind }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll
new file mode 100644
index 0000000..56c3882
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_4() {
+  br label %1
+
+1:                                                ; preds = %22, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %22
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf8, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = getelementptr float, ptr @buf7, i32 %3
+  %9 = load <16 x float>, ptr %8
+  %10 = bitcast <16 x float> %7 to <8 x i64>
+  %11 = bitcast <16 x float> %9 to <8 x i64>
+  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = bitcast <32 x i64> %12 to <64 x float>
+  %15 = bitcast <32 x i64> %13 to <64 x float>
+  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
+  %17 = bitcast <64 x float> %16 to <32 x i64>
+  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %19 = bitcast <8 x i64> %18 to <16 x float>
+  %20 = getelementptr float, ptr @buf6, i32 %3
+  store <16 x float> %19, ptr %20
+  %21 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+22:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.elf
new file mode 100755
index 0000000000000000000000000000000000000000..4588246a4ddfdd6db12c4f239462db7892609dba
GIT binary patch
literal 1660
zcma)7O=uHQ5S~qAQpKNGQ35JF6r@T`cTH>)FRmquXbSbB1(jv9yKR>yo3Oj7tsn*w
z^q}5^C<tBz4<3pKm7WR;-o%4<f6#*$(VGXcezQMG^yCNc&3rTSX69vfH&>5WPDqkO
zF%mS`Wd#_`bwmS@1rn5?VM<eyIJ%7$V-*Ho2&D|xnJ>N>`}$?7pG106qQ^vK`mjFT
z_3qn6^y$=qpzkB}ky9Q*_Vbb;`mpvaL(6X^f-WUiG9-UQ-D~NUX=(1HiVg->79UA!
zM#0Gt*};h&p*E69Zc6>`Oy1ernR>JJ^RcvkQ|W9c>f8-3grJj0lS7@g=I*k}S<B)R
z8b*C&thpPxe%KHAb$4?ge|s!jy_J&oME<ZG`<D}ZzK;)yzc})t&!AVX&4K^=!}_aI
zKB|X)Q_=>aZ1ExPb)7I0AX}v9mWWpgGYWn$WL{~)o!}as2QRlIn%|!eEZ_Lqnf;X>
zNhH;Q=zcg8<<EvA<fN1>z3L5xviDx5voy>D23|=-Ki7yVwli<JM?|Gk7D_26=VT(I
zraQA@!E!yj;Y3z3Jm>pO@5t!rtnJUWsyU-kADy;qwzD`|t1Z+I>b51SdBc=TW<GD}
zW?n9h6^f>1jGF~BuT=BZf}xZwvs9QV?GJnC-_cGNN&2z24Q#4wk!UCMhkbl6^i21V
z#B=C^&-1fC)#sl87g#cELwI0LL#Jf;^JFMRXPB!!evWy&$1gAk1QwqUUyz>v3hZ2u
z$Ho4ekOGUr{5GUu0%XANLkcDU^Cysk3Bde0q+kLte+?;^0L<S(3MK&a_pBuG+yH!^
z&#b6D{*`&V$0_Gqi$3uf>y`t>TdMnd6>`q=-B9aF-I&)r`?BTXd8wS8EZcg`GW^Mg
z*{WHTt4vJDni9!FkXq%qW6ET>zLGNx7-m|Ip;fI}+fm49xRxeshpF0{QIuepc`uQs
zdA{!Yq<PIy(-!nv%hJk|OpSTc>bmV{wqyIm8fk{#aJ~2x+cn*F^`+Pnx-G|~;85fY
zxY>U)By2|?Mc&4F8w&=i@(=fakPyZz{!X@`ud*RR^y=Xo9LC{Q3!o;5E=4+vI1k6e
z)x`^tHk|yO<{Ugtj=?)0eB&Vk6yX?;31r;D9_ajE5Tk*(Z!v+4;|@f6JMvvbL9QA5
GWBCW<3<TZ)

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ld.script
new file mode 100644
index 0000000..51c13db
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ld.script
@@ -0,0 +1,72 @@
+
+MEMORY
+{
+   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
+   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
+}
+ENTRY(__start)
+SECTIONS
+{
+  . = 0x0;
+  .text : {
+     /* the __start symbol has to come at address zero. */
+     *crt0.o(.text*)
+     _ctors_start = .;
+     _init_array_start = .;
+     KEEP(SORT(*.init_array))
+     _ctors_end = .;
+     _init_array_end = .;
+     _dtors_start = .;
+     _dtors_end = .;
+     *(.text*)
+  } > program
+  .data : {
+     *(.data*)
+     *(.rodata*)
+  } > data
+  .comment : {
+     *(.comment*)
+  }
+  .symtab : {
+     *(.symtab)
+  }
+  .shstrtab : {
+     *(.shstrtab)
+  }
+  .strtab : {
+     *(.strtab)
+  }
+  .stack_sizes : {
+     *(.stack_sizes)
+  }
+
+. = 0x70000;
+_sp_start_value_DM_stack = .;
+. += 0x400; /* stack */
+. = 0x40400;
+buf8 = .;
+. += 0x400;
+. = 0x44000;
+buf7 = .;
+. += 0x400;
+. = 0x48000;
+buf6 = .;
+. += 0x400;
+/* No tile with memory exists to the west. */
+. = 0x50000;
+. += 0x10000;
+/* No tile with memory exists to the north. */
+. = 0x60000;
+. += 0x10000;
+. = 0x70400;
+buf11 = .;
+. += 0x400;
+. = 0x74000;
+buf10 = .;
+. += 0x400;
+. = 0x78000;
+buf9 = .;
+. += 0x400;
+  .bss : { *(.bss*) } > data
+}
+PROVIDE(main = core_0_5);
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll
new file mode 100644
index 0000000..8972a4d
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_5() {
+  br label %1
+
+1:                                                ; preds = %22, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %22
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf11, i32 %3
+  %7 = load <16 x float>, ptr %6, align 4
+  %8 = getelementptr float, ptr @buf10, i32 %3
+  %9 = load <16 x float>, ptr %8, align 4
+  %10 = bitcast <16 x float> %7 to <8 x i64>
+  %11 = bitcast <16 x float> %9 to <8 x i64>
+  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = bitcast <32 x i64> %12 to <64 x float>
+  %15 = bitcast <32 x i64> %13 to <64 x float>
+  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
+  %17 = bitcast <64 x float> %16 to <32 x i64>
+  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %19 = bitcast <8 x i64> %18 to <16 x float>
+  %20 = getelementptr float, ptr @buf9, i32 %3
+  store <16 x float> %19, ptr %20, align 4
+  %21 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+22:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.o
new file mode 100644
index 0000000000000000000000000000000000000000..ca78f75bdee0299a5c24ffa00da8f22d235605ba
GIT binary patch
literal 984
zcmZ`%O-~b16g{t$wlPRb6SWI9#1$Y+I({Uu(NaN5GBw5oVsOz+%NUH6;xI&XqcJR8
z5aPyUmTcIwa?usPfbbI_;RnDUu$=qmwH=yxlRNjEcTeuU@6P)^@4Qq>p<@b}7z2(H
zzyOm^v|tQr^t5tx24qnnc&A2z)Arvi4#|jBqQ#7;f2Lb+)mpg-3}#x}2dbF0FD7{|
zmd})KM`$veyqWqJR`y21*T*C0vuXF29o_=MKb@B4=gC~y-nci1Q<mj7rTepD!yl6V
ztjc-Fcl_ZPKbg(fc2nws$w%!Zf0z(`jORwLan|FUdRFNz!v1{J??^p1{Wxz*-Qc~Z
z_j#`!Q#?Nnb4zp34R^W-NaC`+LN4o!YyfWvei$ZyVvYDB#up8iMdz5Wb#wA#fX{t=
zrjH99=N`pGKb6)HcNrdLA{;PG$RnVAqfaC?#C`Tud>`7Zz}pUBtv3VDT6ncIwHdhe
zk8qsj%G?LH;njmmb9JlX!P*S`z^yTU{Uour^?nAH?=@Uq!KycXuUe=+h2@k=h3X>{
zZ4ufwS=E_eUoB|bTa+2%|6!ada|JCbbM7xOIj^EkqPSTJiu$zoksXP>p!e(aW7pYe
wmja*BWnD$OHufzONPU?{azk`kduIkhY{W?{XA`qT*F^W5Oyq-fsV6gD0S*Oj=>Px#

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll
new file mode 100644
index 0000000..d08aa8f
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll
@@ -0,0 +1,64 @@
+; ModuleID = 'air_project/sub_kernel_0_core_0_5.peanohack.ll'
+source_filename = "LLVMDialectModule"
+target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
+target triple = "aie2p"
+
+@buf9 = external local_unnamed_addr global [256 x float]
+@buf10 = external local_unnamed_addr global [256 x float]
+@buf11 = external local_unnamed_addr global [256 x float]
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.acquire(i32, i32) #0
+
+; Function Attrs: nounwind
+declare void @llvm.aie2p.release(i32, i32) #0
+
+; Function Attrs: noreturn nounwind
+define void @core_0_5() local_unnamed_addr #1 {
+  br label %1
+
+1:                                                ; preds = %19, %0
+  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %1, %2
+  %3 = phi i32 [ 0, %1 ], [ %17, %2 ]
+  %4 = trunc nuw i32 %3 to i20
+  %5 = getelementptr float, ptr @buf11, i20 %4
+  %6 = load <8 x i64>, ptr %5, align 64
+  %7 = getelementptr float, ptr @buf10, i20 %4
+  %8 = load <8 x i64>, ptr %7, align 64
+  %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %11 = bitcast <32 x i64> %9 to <64 x float>
+  %12 = bitcast <32 x i64> %10 to <64 x float>
+  %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60)
+  %14 = bitcast <64 x float> %13 to <32 x i64>
+  %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %16 = getelementptr float, ptr @buf9, i20 %4
+  store <8 x i64> %15, ptr %16, align 64
+  %17 = add nuw nsw i32 %3, 16
+  %18 = icmp ult i32 %3, 240
+  br i1 %18, label %2, label %19, !llvm.loop !1
+
+19:                                               ; preds = %2
+  tail call void @llvm.aie2p.release(i32 51, i32 1)
+  tail call void @llvm.aie2p.release(i32 53, i32 1)
+  tail call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
+declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { noreturn nounwind }
+attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll
new file mode 100644
index 0000000..69f695d
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll
@@ -0,0 +1,95 @@
+; ModuleID = 'LLVMDialectModule'
+source_filename = "LLVMDialectModule"
+target triple = "aie2p"
+
+@buf0 = external global [256 x float]
+@buf1 = external global [256 x float]
+@buf2 = external global [256 x float]
+@buf3 = external global [256 x float]
+@buf4 = external global [256 x float]
+@buf5 = external global [256 x float]
+@buf6 = external global [256 x float]
+@buf7 = external global [256 x float]
+@buf8 = external global [256 x float]
+@buf9 = external global [256 x float]
+@buf10 = external global [256 x float]
+@buf11 = external global [256 x float]
+@buf12 = external global [1024 x float]
+@buf13 = external global [1024 x float]
+@buf14 = external global [1024 x float]
+
+declare void @debug_i32(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.event(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.put.ms(i32, i32)
+
+; Unknown intrinsic
+declare { i32, i32 } @llvm.aie2p.get.ss()
+
+; Unknown intrinsic
+declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
+
+; Unknown intrinsic
+declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.acquire(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.release(i32, i32)
+
+; Unknown intrinsic
+declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
+
+define void @core_0_5() {
+  br label %1
+
+1:                                                ; preds = %22, %0
+  call void @llvm.aie2p.acquire(i32 49, i32 -1)
+  call void @llvm.aie2p.acquire(i32 50, i32 -1)
+  call void @llvm.aie2p.acquire(i32 52, i32 -1)
+  br label %2
+
+2:                                                ; preds = %5, %1
+  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
+  %4 = icmp slt i32 %3, 256
+  br i1 %4, label %5, label %22
+
+5:                                                ; preds = %2
+  %6 = getelementptr float, ptr @buf11, i32 %3
+  %7 = load <16 x float>, ptr %6
+  %8 = getelementptr float, ptr @buf10, i32 %3
+  %9 = load <16 x float>, ptr %8
+  %10 = bitcast <16 x float> %7 to <8 x i64>
+  %11 = bitcast <16 x float> %9 to <8 x i64>
+  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+  %14 = bitcast <32 x i64> %12 to <64 x float>
+  %15 = bitcast <32 x i64> %13 to <64 x float>
+  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
+  %17 = bitcast <64 x float> %16 to <32 x i64>
+  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %19 = bitcast <8 x i64> %18 to <16 x float>
+  %20 = getelementptr float, ptr @buf9, i32 %3
+  store <16 x float> %19, ptr %20
+  %21 = add i32 %3, 16
+  br label %2, !llvm.loop !1
+
+22:                                               ; preds = %2
+  call void @llvm.aie2p.release(i32 51, i32 1)
+  call void @llvm.aie2p.release(i32 53, i32 1)
+  call void @llvm.aie2p.release(i32 48, i32 1)
+  br label %1
+}
+
+; Unknown intrinsic
+declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_design.bif b/examples/elementwise_arith/air_project/sub_kernel_0_design.bif
new file mode 100644
index 0000000..bbeec41
--- /dev/null
+++ b/examples/elementwise_arith/air_project/sub_kernel_0_design.bif
@@ -0,0 +1,10 @@
+all:
+{
+  id_code = 0x14ca8093
+  extended_id_code = 0x01
+  image
+  {
+    name=aie_image, id=0x1c000000
+    { type=cdo file=air_project/sub_kernel_0_aie_cdo_elfs.bin file=air_project/sub_kernel_0_aie_cdo_init.bin file=air_project/sub_kernel_0_aie_cdo_enable.bin }
+  }
+}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_sub_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/sub_kernel_0_sub_kernel_0_sequence.bin
new file mode 100644
index 0000000000000000000000000000000000000000..f2eb383b7eb903e8ad8809b7d44fb0a4d660bfb6
GIT binary patch
literal 3248
zcmcJQ-Aw~A5QNu<g?K=M2OfApq6JF0GAPG%L<ue-N=1yF_0OCH8j;via_gU++3a_@
zvk>neg4w5;eLtEZyo^(@j}krmd|WcZ&8OQJf_lyu+9e|Yukmso`>#0IXQ!}T>04{-
zdWRz5+iJIiZ@khs7OO7qzIV<49O}M3Vd35Et*zr4+n!k}H6?s5ZHHP2b8iNd9f!uq
z2%ELeWFG=0l9+H#Mw;SV&vYiNQGA=m?@TyH^<^fmv5ZNwd1Or5@6ws1SwdvOIT;Fy
zFEe2cjS1(dzRbimmN6++jgb*5y%`xL$~=|+C!AAeiZ3%^4UGxssJ_g^HI^|MPc*|&
zYb<A{(Vr%DO=9vtUuMD@8WYaRC|J1{n2BpFW3p2})f#K>`xk4J-t?`*yLP<eIY;$n
zCaj?`;T+YMnYhL>CTa8!jZ|yI(P^-T#)NY+)Rg;`nXrb&gmYA1X5t#lnB*HUVGWH5
W=cv75Caj?`;T+YMnYhL+nEU|x-i@&U

literal 0
HcmV?d00001

diff --git a/examples/elementwise_arith/air_project/tt.mlir b/examples/elementwise_arith/air_project/tt.mlir
new file mode 100644
index 0000000..cfdc62d
--- /dev/null
+++ b/examples/elementwise_arith/air_project/tt.mlir
@@ -0,0 +1,35 @@
+#loc = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)
+#loc10 = loc("X"(#loc))
+#loc11 = loc("OUT"(#loc))
+module {
+  tt.func public @square_kernel(%X: !tt.ptr<i16> {tt.divisibility = 16 : i32} loc("X"(#loc)), %OUT: !tt.ptr<i16> {tt.divisibility = 16 : i32} loc("OUT"(#loc))) attributes {noinline = false} {
+    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
+    %pid = tt.get_program_id x : i32 loc(#loc12)
+    %offsets = arith.muli %pid, %c1024_i32 : i32 loc(#loc13)
+    %offsets_0 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc14)
+    %offsets_1 = tt.splat %offsets : i32 -> tensor<1024xi32> loc(#loc13)
+    %offsets_2 = arith.addi %offsets_1, %offsets_0 : tensor<1024xi32> loc(#loc13)
+    %x = tt.splat %X : !tt.ptr<i16> -> tensor<1024x!tt.ptr<i16>> loc(#loc15)
+    %x_3 = tt.addptr %x, %offsets_2 : tensor<1024x!tt.ptr<i16>>, tensor<1024xi32> loc(#loc15)
+    %x_4 = tt.load %x_3 : tensor<1024x!tt.ptr<i16>> loc(#loc16)
+    %0 = tt.splat %OUT : !tt.ptr<i16> -> tensor<1024x!tt.ptr<i16>> loc(#loc7)
+    %1 = tt.addptr %0, %offsets_2 : tensor<1024x!tt.ptr<i16>>, tensor<1024xi32> loc(#loc7)
+    %2 = arith.muli %x_4, %x_4 : tensor<1024xi16> loc(#loc8)
+    tt.store %1, %2 : tensor<1024x!tt.ptr<i16>> loc(#loc9)
+    tt.return loc(#loc)
+  } loc(#loc)
+} loc(#loc)
+#loc1 = loc(unknown)
+#loc2 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":87:11)
+#loc3 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:15)
+#loc4 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:34)
+#loc5 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:17)
+#loc6 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:9)
+#loc7 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:14)
+#loc8 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:32)
+#loc9 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:5)
+#loc12 = loc("pid"(#loc2))
+#loc13 = loc("offsets"(#loc3))
+#loc14 = loc("offsets"(#loc4))
+#loc15 = loc("x"(#loc5))
+#loc16 = loc("x"(#loc6))
diff --git a/examples/elementwise_arith/elementwise_arith.py b/examples/elementwise_arith/elementwise_arith.py
new file mode 100644
index 0000000..04d4844
--- /dev/null
+++ b/examples/elementwise_arith/elementwise_arith.py
@@ -0,0 +1,189 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+# SPDX-License-Identifier: MIT
+
+# Elementwise arithmetic benchmark: sub, mul, div, square.
+# Supports bf16 (default) and f32 (via bf16-emulation).
+# Not all ops support all dtypes:
+#   sub: bf16, f32
+#   mul: bf16, f32
+#   div: f32 only (hardware constraint: arith.divf is f32-only on AIE2P)
+#   square: bf16, f32 (implemented as x * x)
+
+import argparse
+import torch
+import triton
+import triton.language as tl
+import sys
+import os
+
+sys.path.append(os.path.abspath(".."))
+import benchmark
+
+DTYPE_CONFIG = {  # per-dtype settings, keyed by the --dtype CLI value
+    "bf16": {
+        "torch_dtype": torch.bfloat16,
+        "atol": 1e-2,  # atol/rtol are passed to torch.testing.assert_close
+        "rtol": 1e-2,
+        "bf16_emulation": False,
+    },
+    "f32": {
+        "torch_dtype": torch.float32,
+        "atol": 1e-1,
+        "rtol": 5e-2,
+        "bf16_emulation": True,  # __main__ then sets AMD_TRITON_NPU_BF16_EMULATION=1
+    },
+}
+
+# Which dtypes each op supports.
+# Integer types (i16) fail at aircc for subi/muli on AIE2P (only addi works).
+OP_DTYPES = {  # checked in __main__ before any kernel is launched
+    "sub": ["bf16", "f32"],
+    "mul": ["bf16", "f32"],
+    "div": ["f32"],  # arith.divf is f32-only on AIE2P; bf16 divf not supported
+    "square": ["bf16", "f32"],
+}
+
+
+# --- Triton kernels ---
+
+
+@triton.jit
+def sub_kernel(X, Y, OUT, n_elements: tl.constexpr, BLOCK_SIZE: tl.constexpr):
+    pid = tl.program_id(0)  # this program's index on grid axis 0
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)  # BLOCK_SIZE offsets
+    x = tl.load(X + offsets[:])  # no mask is passed; n_elements is not read
+    y = tl.load(Y + offsets[:])
+    tl.store(OUT + offsets[:], x - y)  # OUT[offsets] = X[offsets] - Y[offsets]
+
+
+@triton.jit
+def mul_kernel(X, Y, OUT, n_elements: tl.constexpr, BLOCK_SIZE: tl.constexpr):
+    pid = tl.program_id(0)  # this program's index on grid axis 0
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)  # BLOCK_SIZE offsets
+    x = tl.load(X + offsets[:])  # no mask is passed; n_elements is not read
+    y = tl.load(Y + offsets[:])
+    tl.store(OUT + offsets[:], x * y)  # OUT[offsets] = X[offsets] * Y[offsets]
+
+
+@triton.jit
+def div_kernel(X, Y, OUT, n_elements: tl.constexpr, BLOCK_SIZE: tl.constexpr):
+    pid = tl.program_id(0)  # this program's index on grid axis 0
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)  # BLOCK_SIZE offsets
+    x = tl.load(X + offsets[:])  # no mask is passed; n_elements is not read
+    y = tl.load(Y + offsets[:])
+    tl.store(OUT + offsets[:], x / y)  # OUT[offsets] = X[offsets] / Y[offsets]
+
+
+@triton.jit
+def square_kernel(X, OUT, n_elements: tl.constexpr, BLOCK_SIZE: tl.constexpr):
+    pid = tl.program_id(0)  # this program's index on grid axis 0
+    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)  # BLOCK_SIZE offsets
+    x = tl.load(X + offsets[:])  # no mask is passed; n_elements is not read
+    tl.store(OUT + offsets[:], x * x)  # OUT[offsets] = X[offsets] * X[offsets]
+
+
+# --- Kernel dispatch table ---
+
+KERNELS = {  # op name -> Triton kernel; the keys are also the --op choices
+    "sub": sub_kernel,
+    "mul": mul_kernel,
+    "div": div_kernel,
+    "square": square_kernel,  # the only unary kernel: launched as (X, OUT, ...)
+}
+
+# --- Torch reference functions ---
+
+TORCH_REF = {  # op name -> torch reference, each called as f(x, y)
+    "sub": lambda x, y: x - y,
+    "mul": lambda x, y: x * y,
+    "div": lambda x, y: x / y,
+    "square": lambda x, y: x * x,  # y is ignored (bench_op passes None)
+}
+
+
+def bench_op(op, N, provider, cfg):
+    """Run `op` on N random CPU values; provider "test" checks Triton vs torch."""
+    device = "cpu"
+    torch_dtype = cfg["torch_dtype"]
+    is_unary = op == "square"
+
+    x = torch.randn(N, device=device, dtype=torch_dtype)
+    y = None  # unary ops have no second operand
+    if op == "div":
+        # Avoid division by zero; use values in [0.5, 1.5)
+        y = 0.5 + torch.rand(N, device=device, dtype=torch_dtype)
+    elif not is_unary:
+        y = torch.randn(N, device=device, dtype=torch_dtype)
+    out = torch.empty(N, device=device, dtype=torch_dtype)
+
+    if provider in ("torch", "test"):
+        out_ref = TORCH_REF[op](x, y)
+
+    # Launch the kernel and dump its "ttsharedir" asm to tt.shared.mlir.
+    if provider in ("triton", "test"):
+        grid = lambda META: (triton.cdiv(N, META["BLOCK_SIZE"]),)
+        operands = (x, out) if is_unary else (x, y, out)
+        compiled_kernel = KERNELS[op][grid](*operands, N, BLOCK_SIZE=1024)
+        # The IR may arrive as bytes; str() on bytes would write its b'...' repr.
+        ir = compiled_kernel.asm["ttsharedir"]
+        with open("tt.shared.mlir", "w") as f:
+            f.write(ir.decode("utf-8") if isinstance(ir, bytes) else str(ir))
+        if provider == "test":
+            torch.testing.assert_close(out, out_ref, atol=cfg["atol"], rtol=cfg["rtol"])
+
+
+if __name__ == "__main__":
+    # Command line: which op to run and which element type to run it with.
+    cli = argparse.ArgumentParser(
+        description="Elementwise arithmetic benchmark for AMD NPU"
+    )
+    cli.add_argument(
+        "--op",
+        type=str,
+        choices=list(KERNELS),
+        required=True,
+        help="Operation to benchmark",
+    )
+    cli.add_argument(
+        "--dtype",
+        type=str,
+        choices=list(DTYPE_CONFIG),
+        default="bf16",
+        help="Element data type (default: bf16)",
+    )
+    cli.add_argument(
+        "--bf16-emulation",
+        dest="bf16_emulation",
+        action="store_true",
+        help="Use f32 data type with bf16 emulation on AIE cores",
+    )
+    opts = cli.parse_args()
+
+    # --bf16-emulation always selects f32, whatever --dtype was given.
+    dtype_name = "f32" if opts.bf16_emulation else opts.dtype
+
+    # Refuse op/dtype pairs that OP_DTYPES does not list.
+    allowed = OP_DTYPES[opts.op]
+    if dtype_name not in allowed:
+        supported = ", ".join(allowed)
+        print(f"Error: --op {opts.op} does not support --dtype {dtype_name}.")
+        print(f"Supported dtypes for {opts.op}: {supported}")
+        sys.exit(1)
+
+    cfg = DTYPE_CONFIG[dtype_name]
+    if cfg["bf16_emulation"]:
+        os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1"
+
+    # Pick the transform script from the op's arity, but leave a non-empty
+    # AIR_TRANSFORM_TILING_SCRIPT from the environment untouched.
+    if not os.environ.get("AIR_TRANSFORM_TILING_SCRIPT"):
+        here = os.path.dirname(os.path.abspath(__file__))
+        arity = "unary" if opts.op == "square" else "binary"
+        os.environ["AIR_TRANSFORM_TILING_SCRIPT"] = os.path.join(
+            here, f"transform_{arity}_aie2p.mlir"
+        )
+
+    benchmark.select_npu_backend()
+    # Check every power-of-two size from 2**10 through 2**15.
+    for exponent in range(10, 16):
+        bench_op(opts.op, 2**exponent, "test", cfg)
diff --git a/examples/elementwise_arith/transform_binary_aie2p.mlir b/examples/elementwise_arith/transform_binary_aie2p.mlir
new file mode 100644
index 0000000..2d76c54
--- /dev/null
+++ b/examples/elementwise_arith/transform_binary_aie2p.mlir
@@ -0,0 +1,40 @@
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+////////////////////////////////////////////////////////////////////////////////
+// Transform Script for Binary Elementwise Ops (AIE2P): sub, mul, div
+// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders.
+// Uses shared library sequences from transform_library.mlir (auto-injected).
+////////////////////////////////////////////////////////////////////////////////
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      %arg1: !transform.any_op {transform.readonly}) {
+
+    transform.include @canonicalize_with_fold_dims failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @fuse_elementwise_and_canonicalize failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @flatten_tile_forall failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @canonicalize_with_cse failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @canonicalize_with_cse failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @one_shot_bufferize failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @post_bufferize_cleanup failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    %vh = transform.include @air_herd_mapping_and_vectorize
+        failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
+    transform.include @cast_bf16_only_ops failures(propagate)
+        (%vh) : (!transform.any_op) -> ()
+
+    transform.yield
+  }
+}
diff --git a/examples/elementwise_arith/transform_unary_aie2p.mlir b/examples/elementwise_arith/transform_unary_aie2p.mlir
new file mode 100644
index 0000000..14bfd4c
--- /dev/null
+++ b/examples/elementwise_arith/transform_unary_aie2p.mlir
@@ -0,0 +1,40 @@
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+////////////////////////////////////////////////////////////////////////////////
+// Transform Script for Unary Elementwise Ops (AIE2P): square
+// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders.
+// Uses shared library sequences from transform_library.mlir (auto-injected).
+////////////////////////////////////////////////////////////////////////////////
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      %arg1: !transform.any_op {transform.readonly}) {
+
+    transform.include @canonicalize_with_fold_dims failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @fuse_elementwise_and_canonicalize failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @flatten_tile_forall failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @canonicalize_with_cse failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @canonicalize_with_cse failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @one_shot_bufferize failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @post_bufferize_cleanup failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    %vh = transform.include @air_herd_mapping_and_vectorize
+        failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
+    transform.include @cast_bf16_only_ops failures(propagate)
+        (%vh) : (!transform.any_op) -> ()
+
+    transform.yield
+  }
+}
diff --git a/examples/elementwise_arith/tt.shared.mlir b/examples/elementwise_arith/tt.shared.mlir
new file mode 100644
index 0000000..dc6929b
--- /dev/null
+++ b/examples/elementwise_arith/tt.shared.mlir
@@ -0,0 +1 @@
+b'#loc = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)\n#loc5 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:9)\n#map = affine_map<(d0) -> (d0)>\n#loc8 = loc("X"(#loc))\n#loc9 = loc("OUT"(#loc))\n#loc12 = loc("x"(#loc5))\nmodule {\n  func.func @square_kernel(%arg0: memref<*xf32> {tt.divisibility = 16 : i32} loc("X"(#loc)), %arg1: memref<*xf32> {tt.divisibility = 16 : i32} loc("OUT"(#loc)), %arg2: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg3: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg4: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg5: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg6: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg7: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)) {\n    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)\n    %0 = arith.muli %arg5, %c1024_i32 : i32 loc(#loc10)\n    %1 = arith.index_cast %0 : i32 to index loc(#loc3)\n    %reinterpret_cast = memref.reinterpret_cast %arg0 to offset: [%1], sizes: [1024], strides: [1] : memref<*xf32> to memref<1024xf32, strided<[1], offset: ?>> loc(#loc11)\n    %alloc = memref.alloc() : memref<1024xf32> loc(#loc12)\n    memref.copy %reinterpret_cast, %alloc : memref<1024xf32, strided<[1], offset: ?>> to memref<1024xf32> loc(#loc12)\n    %2 = bufferization.to_tensor %alloc restrict writable : memref<1024xf32> to tensor<1024xf32> loc(#loc12)\n    %reinterpret_cast_0 = memref.reinterpret_cast %arg1 to offset: [%1], sizes: [1024], strides: [1] : memref<*xf32> to memref<1024xf32, strided<[1], offset: ?>> loc(#loc3)\n    %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%2, 
%2 : tensor<1024xf32>, tensor<1024xf32>) outs(%2 : tensor<1024xf32>) {\n    ^bb0(%in: f32 loc("x"(#loc5)), %in_1: f32 loc("x"(#loc5)), %out: f32 loc("x"(#loc5))):\n      %4 = arith.mulf %in, %in_1 : f32 loc(#loc6)\n      linalg.yield %4 : f32 loc(#loc6)\n    } -> tensor<1024xf32> loc(#loc6)\n    bufferization.materialize_in_destination %3 in writable %reinterpret_cast_0 : (tensor<1024xf32>, memref<1024xf32, strided<[1], offset: ?>>) -> () loc(#loc7)\n    return loc(#loc)\n  } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:15)\n#loc3 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:14)\n#loc4 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:17)\n#loc6 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:32)\n#loc7 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:5)\n#loc10 = loc("offsets"(#loc2))\n#loc11 = loc("x"(#loc4))\n\n'
\ No newline at end of file
diff --git a/examples/generate_readme.py b/examples/generate_readme.py
index 75dd998..bc75808 100644
--- a/examples/generate_readme.py
+++ b/examples/generate_readme.py
@@ -146,6 +146,12 @@
         "path": "multi_drivers",
         "datatypes": "bf16",
     },
+    {
+        "category": "Element-wise",
+        "name": "Elementwise Arith (sub, mul, div, square)",
+        "path": "elementwise_arith",
+        "datatypes": "bf16, f32",
+    },
 ]
 
 # Directories to ignore when verifying registry completeness

From d38be1d2e60f2a7ac6c7d4606634967274ac3d6a Mon Sep 17 00:00:00 2001
From: erweiw <erwei.wang@amd.com>
Date: Thu, 9 Apr 2026 22:01:07 -0700
Subject: [PATCH 8/9] Remove build artifacts from elementwise_arith

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../air_project/aie.asm_air_output.mlir       | 386 -----------
 .../elementwise_arith/air_project/aie.elf     | Bin 29488 -> 0 bytes
 .../aiecc_failure_1775797115_856352.mlir      | 411 ------------
 .../aiecc_failure_1775797139_858651.mlir      | 601 ------------------
 .../aiecc_failure_1775797174_862028.mlir      | 431 -------------
 .../aiecc_repeater_1775797115_856352.sh       |  12 -
 .../aiecc_repeater_1775797139_858651.sh       |  14 -
 .../aiecc_repeater_1775797174_862028.sh       |  14 -
 .../air_project/airinput.mlir                 |  41 --
 .../air_project/asm_air_output.mlir           |  41 --
 .../air_project/asm_src.mlir                  |  34 -
 .../air_project/div_kernel_0.pdi              | Bin 15904 -> 0 bytes
 .../air_project/div_kernel_0_aie_cdo_elfs.bin | Bin 10704 -> 0 bytes
 .../div_kernel_0_aie_cdo_enable.bin           | Bin 104 -> 0 bytes
 .../air_project/div_kernel_0_aie_cdo_init.bin | Bin 6032 -> 0 bytes
 .../air_project/div_kernel_0_core_0_2.elf     | Bin 4132 -> 0 bytes
 .../div_kernel_0_core_0_2.ld.script           |  72 ---
 .../air_project/div_kernel_0_core_0_2.ll      | 158 -----
 .../air_project/div_kernel_0_core_0_2.o       | Bin 2048 -> 0 bytes
 .../air_project/div_kernel_0_core_0_2.opt.ll  | 129 ----
 .../div_kernel_0_core_0_2.peanohack.ll        | 158 -----
 .../air_project/div_kernel_0_core_0_3.elf     | Bin 4192 -> 0 bytes
 .../div_kernel_0_core_0_3.ld.script           |  78 ---
 .../air_project/div_kernel_0_core_0_3.ll      | 158 -----
 .../air_project/div_kernel_0_core_0_3.o       | Bin 2048 -> 0 bytes
 .../air_project/div_kernel_0_core_0_3.opt.ll  | 129 ----
 .../div_kernel_0_core_0_3.peanohack.ll        | 158 -----
 .../air_project/div_kernel_0_core_0_4.elf     | Bin 4196 -> 0 bytes
 .../div_kernel_0_core_0_4.ld.script           |  78 ---
 .../air_project/div_kernel_0_core_0_4.ll      | 158 -----
 .../air_project/div_kernel_0_core_0_4.o       | Bin 2048 -> 0 bytes
 .../air_project/div_kernel_0_core_0_4.opt.ll  | 129 ----
 .../div_kernel_0_core_0_4.peanohack.ll        | 158 -----
 .../air_project/div_kernel_0_core_0_5.elf     | Bin 4132 -> 0 bytes
 .../div_kernel_0_core_0_5.ld.script           |  72 ---
 .../air_project/div_kernel_0_core_0_5.ll      | 158 -----
 .../air_project/div_kernel_0_core_0_5.o       | Bin 2052 -> 0 bytes
 .../air_project/div_kernel_0_core_0_5.opt.ll  | 129 ----
 .../div_kernel_0_core_0_5.peanohack.ll        | 158 -----
 .../air_project/div_kernel_0_design.bif       |  10 -
 .../div_kernel_0_div_kernel_0_sequence.bin    | Bin 3248 -> 0 bytes
 .../elementwise_arith/air_project/empty_0.pdi | Bin 368 -> 0 bytes
 .../air_project/empty_0_aie_cdo_elfs.bin      | Bin 24 -> 0 bytes
 .../air_project/empty_0_aie_cdo_enable.bin    | Bin 24 -> 0 bytes
 .../air_project/empty_0_aie_cdo_init.bin      | Bin 24 -> 0 bytes
 .../air_project/empty_0_design.bif            |  10 -
 .../air_project/full_elf_config.json          | 134 ----
 .../air_project/input_with_addresses.mlir     | 328 ----------
 .../elementwise_arith/air_project/main.pdi    | Bin 368 -> 0 bytes
 .../air_project/main_aie_cdo_elfs.bin         | Bin 24 -> 0 bytes
 .../air_project/main_aie_cdo_enable.bin       | Bin 24 -> 0 bytes
 .../air_project/main_aie_cdo_init.bin         | Bin 24 -> 0 bytes
 .../air_project/main_design.bif               |  10 -
 .../air_project/main_div_kernel.bin           | Bin 22460 -> 0 bytes
 .../air_project/main_mul_kernel.bin           | Bin 14460 -> 0 bytes
 .../air_project/main_square_kernel.bin        | Bin 11048 -> 0 bytes
 .../air_project/main_sub_kernel.bin           | Bin 14396 -> 0 bytes
 .../air_project/mul_kernel_0.pdi              | Bin 7856 -> 0 bytes
 .../air_project/mul_kernel_0_aie_cdo_elfs.bin | Bin 2656 -> 0 bytes
 .../mul_kernel_0_aie_cdo_enable.bin           | Bin 104 -> 0 bytes
 .../air_project/mul_kernel_0_aie_cdo_init.bin | Bin 6032 -> 0 bytes
 .../air_project/mul_kernel_0_core_0_2.elf     | Bin 1672 -> 0 bytes
 .../mul_kernel_0_core_0_2.ld.script           |  72 ---
 .../air_project/mul_kernel_0_core_0_2.ll      |  95 ---
 .../air_project/mul_kernel_0_core_0_2.o       | Bin 1000 -> 0 bytes
 .../air_project/mul_kernel_0_core_0_2.opt.ll  |  72 ---
 .../mul_kernel_0_core_0_2.peanohack.ll        |  95 ---
 .../air_project/mul_kernel_0_core_0_3.elf     | Bin 1736 -> 0 bytes
 .../mul_kernel_0_core_0_3.ld.script           |  78 ---
 .../air_project/mul_kernel_0_core_0_3.ll      |  95 ---
 .../air_project/mul_kernel_0_core_0_3.o       | Bin 1000 -> 0 bytes
 .../air_project/mul_kernel_0_core_0_3.opt.ll  |  72 ---
 .../mul_kernel_0_core_0_3.peanohack.ll        |  95 ---
 .../air_project/mul_kernel_0_core_0_4.elf     | Bin 1740 -> 0 bytes
 .../mul_kernel_0_core_0_4.ld.script           |  78 ---
 .../air_project/mul_kernel_0_core_0_4.ll      |  95 ---
 .../air_project/mul_kernel_0_core_0_4.o       | Bin 1000 -> 0 bytes
 .../air_project/mul_kernel_0_core_0_4.opt.ll  |  72 ---
 .../mul_kernel_0_core_0_4.peanohack.ll        |  95 ---
 .../air_project/mul_kernel_0_core_0_5.elf     | Bin 1676 -> 0 bytes
 .../mul_kernel_0_core_0_5.ld.script           |  72 ---
 .../air_project/mul_kernel_0_core_0_5.ll      |  95 ---
 .../air_project/mul_kernel_0_core_0_5.o       | Bin 1000 -> 0 bytes
 .../air_project/mul_kernel_0_core_0_5.opt.ll  |  72 ---
 .../mul_kernel_0_core_0_5.peanohack.ll        |  95 ---
 .../air_project/mul_kernel_0_design.bif       |  10 -
 .../mul_kernel_0_mul_kernel_0_sequence.bin    | Bin 3248 -> 0 bytes
 .../air_project/npu.asm_air_output.mlir       | 300 ---------
 .../air_project/placed.asm_air_output.mlir    |  86 ---
 .../air_project/square_kernel_0.pdi           | Bin 6272 -> 0 bytes
 .../square_kernel_0_aie_cdo_elfs.bin          | Bin 2528 -> 0 bytes
 .../square_kernel_0_aie_cdo_enable.bin        | Bin 104 -> 0 bytes
 .../square_kernel_0_aie_cdo_init.bin          | Bin 4300 -> 0 bytes
 .../air_project/square_kernel_0_core_0_2.elf  | Bin 1600 -> 0 bytes
 .../square_kernel_0_core_0_2.ld.script        |  66 --
 .../air_project/square_kernel_0_core_0_2.ll   |  84 ---
 .../air_project/square_kernel_0_core_0_2.o    | Bin 932 -> 0 bytes
 .../square_kernel_0_core_0_2.opt.ll           |  65 --
 .../square_kernel_0_core_0_2.peanohack.ll     |  84 ---
 .../air_project/square_kernel_0_core_0_3.elf  | Bin 1640 -> 0 bytes
 .../square_kernel_0_core_0_3.ld.script        |  69 --
 .../air_project/square_kernel_0_core_0_3.ll   |  84 ---
 .../air_project/square_kernel_0_core_0_3.o    | Bin 932 -> 0 bytes
 .../square_kernel_0_core_0_3.opt.ll           |  65 --
 .../square_kernel_0_core_0_3.peanohack.ll     |  84 ---
 .../air_project/square_kernel_0_core_0_4.elf  | Bin 1640 -> 0 bytes
 .../square_kernel_0_core_0_4.ld.script        |  69 --
 .../air_project/square_kernel_0_core_0_4.ll   |  84 ---
 .../air_project/square_kernel_0_core_0_4.o    | Bin 932 -> 0 bytes
 .../square_kernel_0_core_0_4.opt.ll           |  65 --
 .../square_kernel_0_core_0_4.peanohack.ll     |  84 ---
 .../air_project/square_kernel_0_core_0_5.elf  | Bin 1600 -> 0 bytes
 .../square_kernel_0_core_0_5.ld.script        |  66 --
 .../air_project/square_kernel_0_core_0_5.ll   |  84 ---
 .../air_project/square_kernel_0_core_0_5.o    | Bin 932 -> 0 bytes
 .../square_kernel_0_core_0_5.opt.ll           |  65 --
 .../square_kernel_0_core_0_5.peanohack.ll     |  84 ---
 .../air_project/square_kernel_0_design.bif    |  10 -
 ...uare_kernel_0_square_kernel_0_sequence.bin | Bin 2288 -> 0 bytes
 .../air_project/sub_kernel_0.pdi              | Bin 7792 -> 0 bytes
 .../air_project/sub_kernel_0_aie_cdo_elfs.bin | Bin 2592 -> 0 bytes
 .../sub_kernel_0_aie_cdo_enable.bin           | Bin 104 -> 0 bytes
 .../air_project/sub_kernel_0_aie_cdo_init.bin | Bin 6032 -> 0 bytes
 .../air_project/sub_kernel_0_core_0_2.elf     | Bin 1656 -> 0 bytes
 .../sub_kernel_0_core_0_2.ld.script           |  72 ---
 .../air_project/sub_kernel_0_core_0_2.ll      |  95 ---
 .../air_project/sub_kernel_0_core_0_2.o       | Bin 984 -> 0 bytes
 .../air_project/sub_kernel_0_core_0_2.opt.ll  |  64 --
 .../sub_kernel_0_core_0_2.peanohack.ll        |  95 ---
 .../air_project/sub_kernel_0_core_0_3.elf     | Bin 1720 -> 0 bytes
 .../sub_kernel_0_core_0_3.ld.script           |  78 ---
 .../air_project/sub_kernel_0_core_0_3.ll      |  95 ---
 .../air_project/sub_kernel_0_core_0_3.o       | Bin 984 -> 0 bytes
 .../air_project/sub_kernel_0_core_0_3.opt.ll  |  64 --
 .../sub_kernel_0_core_0_3.peanohack.ll        |  95 ---
 .../air_project/sub_kernel_0_core_0_4.elf     | Bin 1724 -> 0 bytes
 .../sub_kernel_0_core_0_4.ld.script           |  78 ---
 .../air_project/sub_kernel_0_core_0_4.ll      |  95 ---
 .../air_project/sub_kernel_0_core_0_4.o       | Bin 984 -> 0 bytes
 .../air_project/sub_kernel_0_core_0_4.opt.ll  |  64 --
 .../sub_kernel_0_core_0_4.peanohack.ll        |  95 ---
 .../air_project/sub_kernel_0_core_0_5.elf     | Bin 1660 -> 0 bytes
 .../sub_kernel_0_core_0_5.ld.script           |  72 ---
 .../air_project/sub_kernel_0_core_0_5.ll      |  95 ---
 .../air_project/sub_kernel_0_core_0_5.o       | Bin 984 -> 0 bytes
 .../air_project/sub_kernel_0_core_0_5.opt.ll  |  64 --
 .../sub_kernel_0_core_0_5.peanohack.ll        |  95 ---
 .../air_project/sub_kernel_0_design.bif       |  10 -
 .../sub_kernel_0_sub_kernel_0_sequence.bin    | Bin 3248 -> 0 bytes
 .../elementwise_arith/air_project/tt.mlir     |  35 -
 examples/elementwise_arith/tt.shared.mlir     |   1 -
 151 files changed, 8875 deletions(-)
 delete mode 100644 examples/elementwise_arith/air_project/aie.asm_air_output.mlir
 delete mode 100644 examples/elementwise_arith/air_project/aie.elf
 delete mode 100644 examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir
 delete mode 100644 examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir
 delete mode 100644 examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir
 delete mode 100755 examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh
 delete mode 100755 examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh
 delete mode 100755 examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh
 delete mode 100644 examples/elementwise_arith/air_project/airinput.mlir
 delete mode 100644 examples/elementwise_arith/air_project/asm_air_output.mlir
 delete mode 100644 examples/elementwise_arith/air_project/asm_src.mlir
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0.pdi
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_elfs.bin
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_enable.bin
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_init.bin
 delete mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.elf
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.o
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll
 delete mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.elf
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.o
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll
 delete mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.elf
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.o
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll
 delete mode 100755 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.elf
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.o
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_design.bif
 delete mode 100644 examples/elementwise_arith/air_project/div_kernel_0_div_kernel_0_sequence.bin
 delete mode 100644 examples/elementwise_arith/air_project/empty_0.pdi
 delete mode 100644 examples/elementwise_arith/air_project/empty_0_aie_cdo_elfs.bin
 delete mode 100644 examples/elementwise_arith/air_project/empty_0_aie_cdo_enable.bin
 delete mode 100644 examples/elementwise_arith/air_project/empty_0_aie_cdo_init.bin
 delete mode 100644 examples/elementwise_arith/air_project/empty_0_design.bif
 delete mode 100644 examples/elementwise_arith/air_project/full_elf_config.json
 delete mode 100644 examples/elementwise_arith/air_project/input_with_addresses.mlir
 delete mode 100644 examples/elementwise_arith/air_project/main.pdi
 delete mode 100644 examples/elementwise_arith/air_project/main_aie_cdo_elfs.bin
 delete mode 100644 examples/elementwise_arith/air_project/main_aie_cdo_enable.bin
 delete mode 100644 examples/elementwise_arith/air_project/main_aie_cdo_init.bin
 delete mode 100644 examples/elementwise_arith/air_project/main_design.bif
 delete mode 100644 examples/elementwise_arith/air_project/main_div_kernel.bin
 delete mode 100644 examples/elementwise_arith/air_project/main_mul_kernel.bin
 delete mode 100644 examples/elementwise_arith/air_project/main_square_kernel.bin
 delete mode 100644 examples/elementwise_arith/air_project/main_sub_kernel.bin
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0.pdi
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_elfs.bin
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_enable.bin
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_init.bin
 delete mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.elf
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.o
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll
 delete mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.elf
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.o
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll
 delete mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.elf
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.o
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll
 delete mode 100755 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.elf
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.o
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_design.bif
 delete mode 100644 examples/elementwise_arith/air_project/mul_kernel_0_mul_kernel_0_sequence.bin
 delete mode 100644 examples/elementwise_arith/air_project/npu.asm_air_output.mlir
 delete mode 100644 examples/elementwise_arith/air_project/placed.asm_air_output.mlir
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0.pdi
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_elfs.bin
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_enable.bin
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_init.bin
 delete mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.elf
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.o
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll
 delete mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.elf
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.o
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll
 delete mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.elf
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.o
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll
 delete mode 100755 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.elf
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.o
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_design.bif
 delete mode 100644 examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0.pdi
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_elfs.bin
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_enable.bin
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_init.bin
 delete mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.elf
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.o
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll
 delete mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.elf
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.o
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll
 delete mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.elf
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.o
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll
 delete mode 100755 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.elf
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ld.script
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.o
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_design.bif
 delete mode 100644 examples/elementwise_arith/air_project/sub_kernel_0_sub_kernel_0_sequence.bin
 delete mode 100644 examples/elementwise_arith/air_project/tt.mlir
 delete mode 100644 examples/elementwise_arith/tt.shared.mlir

diff --git a/examples/elementwise_arith/air_project/aie.asm_air_output.mlir b/examples/elementwise_arith/air_project/aie.asm_air_output.mlir
deleted file mode 100644
index e55b5a1..0000000
--- a/examples/elementwise_arith/air_project/aie.asm_air_output.mlir
+++ /dev/null
@@ -1,386 +0,0 @@
-#loop_annotation = #llvm.loop_annotation<mustProgress = true>
-module {
-  aie.device(npu2) @square_kernel_0 {
-    %shim_noc_tile_0_0 = aie.tile(0, 0)
-    %shim_noc_tile_1_0 = aie.tile(1, 0)
-    %mem_tile_0_1 = aie.tile(0, 1)
-    %mem_tile_1_1 = aie.tile(1, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %tile_0_3 = aie.tile(0, 3)
-    %tile_0_4 = aie.tile(0, 4)
-    %tile_0_5 = aie.tile(0, 5)
-    %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32}
-    %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32}
-    %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32}
-    %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32}
-    %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32}
-    %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32}
-    %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32}
-    %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32}
-    %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32}
-    %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32}
-    %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32}
-    %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32}
-    %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32}
-    %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32}
-    %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32}
-    %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32}
-    %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32}
-    %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32}
-    %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32}
-    %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32}
-    %buf9 = aie.buffer(%mem_tile_0_1) {sym_name = "buf9"} : memref<1024xi16, 1 : i32> 
-    %buf8 = aie.buffer(%mem_tile_1_1) {sym_name = "buf8"} : memref<1024xi16, 1> 
-    %buf7 = aie.buffer(%tile_0_5) {sym_name = "buf7"} : memref<256xi16, 2> 
-    %buf6 = aie.buffer(%tile_0_5) {sym_name = "buf6"} : memref<256xi16, 2> 
-    %buf5 = aie.buffer(%tile_0_4) {sym_name = "buf5"} : memref<256xi16, 2> 
-    %buf4 = aie.buffer(%tile_0_4) {sym_name = "buf4"} : memref<256xi16, 2> 
-    %buf3 = aie.buffer(%tile_0_3) {sym_name = "buf3"} : memref<256xi16, 2> 
-    %buf2 = aie.buffer(%tile_0_3) {sym_name = "buf2"} : memref<256xi16, 2> 
-    %buf1 = aie.buffer(%tile_0_2) {sym_name = "buf1"} : memref<256xi16, 2> 
-    %buf0 = aie.buffer(%tile_0_2) {sym_name = "buf0"} : memref<256xi16, 2> 
-    %mem_0_5 = aie.mem(%tile_0_5) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_12, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_11, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_5 = aie.core(%tile_0_5) {
-      %0 = ub.poison : i16
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb2
-      aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1)
-      cf.br ^bb2
-    ^bb2:  // pred: ^bb1
-      aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1)
-      scf.for %arg0 = %c0 to %c256 step %c32 {
-        %subview = memref.subview %buf7[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_14 = memref.subview %buf6[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %2 = arith.muli %1, %1 : vector<32xi16>
-        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-      } {loop_annotation = #loop_annotation}
-      aie.use_lock(%lock_0_5, Release, 1)
-      aie.use_lock(%lock_0_5_13, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_4 = aie.mem(%tile_0_4) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_9, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_8, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_4 = aie.core(%tile_0_4) {
-      %0 = ub.poison : i16
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb2
-      aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1)
-      cf.br ^bb2
-    ^bb2:  // pred: ^bb1
-      aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1)
-      scf.for %arg0 = %c0 to %c256 step %c32 {
-        %subview = memref.subview %buf5[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_14 = memref.subview %buf4[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %2 = arith.muli %1, %1 : vector<32xi16>
-        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-      } {loop_annotation = #loop_annotation}
-      aie.use_lock(%lock_0_4, Release, 1)
-      aie.use_lock(%lock_0_4_10, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_3 = aie.mem(%tile_0_3) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_6, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_5, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_3 = aie.core(%tile_0_3) {
-      %0 = ub.poison : i16
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb2
-      aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1)
-      cf.br ^bb2
-    ^bb2:  // pred: ^bb1
-      aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1)
-      scf.for %arg0 = %c0 to %c256 step %c32 {
-        %subview = memref.subview %buf3[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_14 = memref.subview %buf2[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %2 = arith.muli %1, %1 : vector<32xi16>
-        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-      } {loop_annotation = #loop_annotation}
-      aie.use_lock(%lock_0_3, Release, 1)
-      aie.use_lock(%lock_0_3_7, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_2 = aie.mem(%tile_0_2) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_3, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_2, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_2 = aie.core(%tile_0_2) {
-      %0 = ub.poison : i16
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb2
-      aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1)
-      cf.br ^bb2
-    ^bb2:  // pred: ^bb1
-      aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1)
-      scf.for %arg0 = %c0 to %c256 step %c32 {
-        %subview = memref.subview %buf1[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_14 = memref.subview %buf0[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %2 = arith.muli %1, %1 : vector<32xi16>
-        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-      } {loop_annotation = #loop_annotation}
-      aie.use_lock(%lock_0_2, Release, 1)
-      aie.use_lock(%lock_0_2_4, Release, 1)
-      cf.br ^bb1
-    }
-    air.channel @channel_0 []
-    air.channel @channel_2 [1, 1]
-    air.channel @channel_8 [1, 1]
-    air.channel @channel_9 [1, 1]
-    air.channel @channel_10 [1, 1]
-    air.channel @channel_4 [1, 1]
-    air.channel @channel_5 [1, 1]
-    air.channel @channel_6 [1, 1]
-    air.channel @channel_7 [1, 1]
-    air.channel @channel_3 []
-    aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0)
-    aie.flow(%mem_tile_1_1, DMA : 0, %shim_noc_tile_1_0, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0)
-    aie.flow(%tile_0_2, DMA : 0, %mem_tile_1_1, DMA : 0)
-    aie.flow(%tile_0_3, DMA : 0, %mem_tile_1_1, DMA : 1)
-    aie.flow(%tile_0_4, DMA : 0, %mem_tile_1_1, DMA : 2)
-    aie.flow(%tile_0_5, DMA : 0, %mem_tile_1_1, DMA : 3)
-    %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1, Release, 4)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb10
-    }
-    %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1_0, Release, 4)
-      aie.next_bd ^bb10
-    }
-    aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0)
-    aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0)
-  } {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>}
-  airrt.module_metadata{
-    airrt.segment_metadata attributes {dma_allocations = [{channel = 2 : i64, col = 0 : i64, id = 3 : i64, location = 0 : i64, row = -1 : i64}], sym_name = "square_kernel_0"}{
-      airrt.herd_metadata {dma_allocations = [], loc_x = 0 : i64, loc_y = 2 : i64, size_x = 1 : i64, size_y = 4 : i64, sym_name = "herd_0"}
-    }
-  }
-  air.channel @channel_0 []
-  air.channel @channel_1 [4, 1]
-  air.channel @channel_2 [4, 1]
-  air.channel @channel_3 []
-  func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
-    %c1 = arith.constant 1 : index
-    %0 = air.launch async (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> attributes {id = 1 : i32} {
-      %c1024 = arith.constant 1024 : index
-      %c1_0 = arith.constant 1 : index
-      %1 = arith.muli %arg8, %c1024 : index
-      %2 = air.channel.put async  @channel_0[] (%arg14[%1] [%c1024] [%c1_0]) {id = 1 : i32, metadataArray = [{base = "air_channel_0", index = 0 : i32}]} : (memref<*xi16>)
-      %3 = air.channel.get async  @channel_3[] (%arg15[%1] [%c1024] [%c1_0]) {id = 2 : i32, metadataArray = [{base = "air_channel_3", index = 0 : i32}]} : (memref<*xi16>)
-      %4 = air.segment @square_kernel_0 async  attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 8 : i64, y_loc = 2 : i64, y_size = 6 : i64} {
-        %c4 = arith.constant 4 : index
-        %c768 = arith.constant 768 : index
-        %c3 = arith.constant 3 : index
-        %c512 = arith.constant 512 : index
-        %c2 = arith.constant 2 : index
-        %c256 = arith.constant 256 : index
-        %c0 = arith.constant 0 : index
-        %c1_1 = arith.constant 1 : index
-        %async_token, %results = air.execute -> (memref<1024xi16, 1 : i32>) {
-          %alloc = memref.alloc() : memref<1024xi16, 1 : i32>
-          air.execute_terminator %alloc : memref<1024xi16, 1 : i32>
-        }
-        %5 = air.channel.get async [%async_token]  @channel_0[] (%results[] [] []) {id = 3 : i32} : (memref<1024xi16, 1 : i32>)
-        %async_token_2, %results_3 = air.execute -> (memref<1024xi16, 1>) {
-          %alloc = memref.alloc() : memref<1024xi16, 1>
-          air.execute_terminator %alloc : memref<1024xi16, 1>
-        }
-        %6 = air.channel.put async [%5]  @channel_1[%c0, %c0] (%results[%c0] [%c256] [%c1_1]) {id = 4 : i32} : (memref<1024xi16, 1 : i32>)
-        %7 = air.channel.put async [%5]  @channel_1[%c1_1, %c0] (%results[%c256] [%c256] [%c1_1]) {id = 5 : i32} : (memref<1024xi16, 1 : i32>)
-        %8 = air.channel.put async [%5]  @channel_1[%c2, %c0] (%results[%c512] [%c256] [%c1_1]) {id = 6 : i32} : (memref<1024xi16, 1 : i32>)
-        %9 = air.channel.put async [%5]  @channel_1[%c3, %c0] (%results[%c768] [%c256] [%c1_1]) {id = 7 : i32} : (memref<1024xi16, 1 : i32>)
-        %10 = air.channel.get async [%async_token_2]  @channel_2[%c0, %c0] (%results_3[%c0] [%c256] [%c1_1]) {id = 8 : i32} : (memref<1024xi16, 1>)
-        %11 = air.channel.get async [%async_token_2]  @channel_2[%c1_1, %c0] (%results_3[%c256] [%c256] [%c1_1]) {id = 9 : i32} : (memref<1024xi16, 1>)
-        %12 = air.channel.get async [%async_token_2]  @channel_2[%c2, %c0] (%results_3[%c512] [%c256] [%c1_1]) {id = 10 : i32} : (memref<1024xi16, 1>)
-        %13 = air.channel.get async [%async_token_2]  @channel_2[%c3, %c0] (%results_3[%c768] [%c256] [%c1_1]) {id = 11 : i32} : (memref<1024xi16, 1>)
-        %14 = air.herd @herd_0 async [%5, %async_token_2]  tile (%arg16, %arg17) in (%arg18=%c1_1, %arg19=%c4) attributes {id = 3 : i32, x_loc = 0 : i64, y_loc = 2 : i64} {
-          %c32 = arith.constant 32 : index
-          %c256_5 = arith.constant 256 : index
-          %c0_6 = arith.constant 0 : index
-          %16 = ub.poison : i16
-          %async_token_7, %results_8 = air.execute -> (memref<256xi16, 2>) {
-            %alloc = memref.alloc() : memref<256xi16, 2>
-            air.execute_terminator %alloc : memref<256xi16, 2>
-          }
-          %17 = air.channel.get async [%async_token_7]  @channel_1[%arg17, %c0_6] (%results_8[] [] []) {id = 12 : i32} : (memref<256xi16, 2>)
-          %async_token_9, %results_10 = air.execute -> (memref<256xi16, 2>) {
-            %alloc = memref.alloc() : memref<256xi16, 2>
-            air.execute_terminator %alloc : memref<256xi16, 2>
-          }
-          %18 = air.wait_all async [%17, %async_token_9] 
-          %19 = scf.for %arg20 = %c0_6 to %c256_5 step %c32 iter_args(%arg21 = %18) -> (!air.async.token) {
-            %subview = memref.subview %results_8[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-            %subview_13 = memref.subview %results_10[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-            %async_token_14, %results_15 = air.execute [%arg21] -> (vector<32xi16>) {
-              %23 = vector.transfer_read %subview[%c0_6], %16 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-              air.execute_terminator %23 : vector<32xi16>
-            }
-            %21 = arith.muli %results_15, %results_15 : vector<32xi16>
-            %async_token_16 = air.execute [%arg21] {
-              vector.transfer_write %21, %subview_13[%c0_6] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-            }
-            %22 = air.wait_all async [%async_token_14, %async_token_16] 
-            scf.yield %22 : !air.async.token
-          }
-          %20 = air.channel.put async [%async_token_9]  @channel_2[%arg17, %c0_6] (%results_10[] [] []) {id = 13 : i32} : (memref<256xi16, 2>)
-          %async_token_11 = air.execute [%17] {
-            memref.dealloc %results_8 : memref<256xi16, 2>
-          }
-          %async_token_12 = air.execute [%20] {
-            memref.dealloc %results_10 : memref<256xi16, 2>
-          }
-        }
-        %15 = air.channel.put async [%14]  @channel_3[] (%results_3[] [] []) {id = 14 : i32} : (memref<1024xi16, 1>)
-        %async_token_4 = air.execute [%15] {
-          memref.dealloc %results_3 : memref<1024xi16, 1>
-        }
-        air.wait_all [%6, %7, %8, %9, %10, %11, %12, %13, %async_token_4]  {air.segment_end}
-      }
-    }
-    return
-  }
-}
diff --git a/examples/elementwise_arith/air_project/aie.elf b/examples/elementwise_arith/air_project/aie.elf
deleted file mode 100644
index d54eb1201d9709da9aaf31632fd7188b680af270..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 29488
zcmeHQU2GiH6~43HBuksZx^>Z#1Tpym0*$P798^G28DpmkZ7P-oP*jkzCYTU4aTdpE
zL#2wPjVNtVWwm`URF(YF7sSJ=KZvF;TfqaZP_^PIFO3|^BSIU*1C?mq^WA&T+%rEn
z4JC4HG{>6VbH8)u-ZST(o!y=J_WGGU`@S9o!JeuzQc?l^8QDco)fVfax<+kRLu6e`
z5Ns?V?SQRp0qo!8PDpQ&j%^L;7zd_a$zY_acFk<k-q@&A{{~p|_wR0^F~|qfub55l
zkMSGZjNLbW_hw`FjURo@zki4J?c8JR+rOd|HXh%g=J6l&G%>nm@cHjO@z7g;e)&aY
z3CEAn@XUGqW&3YK)Qsi6iAE!=hN>PTPQiZvkDR{!FE3#{x&P$RsY6GPPac_a9a3iV
z<=5Nqx{*??c#~dBRvsjf>Z2sbkba*=|M}9=zfoi36Zg{yN)3?y$3JzJ25B(fK5O}Y
zi;C*6wBcFGa^!o@`^kL2K{0&zdz0@{K-mws#<hfY_55Z`i`l4MOSM(1T@mfUz*G$8
zaZNC74|R(w2Wo*{L2B`ub@AcfLKXVXHwHr0-=Qwxl(MrMs7Hd30#dJ5Ri_t_qAo0*
zJs6z3W{XlkT6gxKI`_?pdak^B`nN&1I->VgT6WaWtY7<|`EUJvaK7>P!FR{jO+G&|
z|7Bz2$J+G~75&~|@rL<lp1N@&x<FA*gzr$H)A-cmv6=N4-^4@hGc(tUc5kfq*bC38
z&DL%@g}qb4{;o~?pEWJgSlrk^(*Ctd^LnM`pQDqrs#VnWXXsSKds_cUpP6kuXK_9+
zA(HvuynbD&)&Jc<=f@w-ANhFQj#4=qu*ah!olo+jQ!`H4T55N_;U+=VUVMFBO>MW<
zHz*??E#Zal>Hs^T&2_+St^;m!9dKKPqp7+YN!3-dt<ERg712)C)6Sxv)M~6JGrwFP
zH9NoD=K1F~&p)?SIG(Dfu~a=J+v<F>T@meMJ>6T>)2gf|GrwFPH9NoD=K1F~&p)?S
zcpz0z52WfT*;eP1?TTn8>uG;cPph(?%=~hF)a?9no9Ca~JpX88{`s?%{Op9!R%k;8
z0@C`MW4$W$Q0R4`M?!B1y(#pT(Az?v6Z(SCJ3{XYy(jcVq4%wBj&C&a8o_&f^7ux?
z@i{&0JH0OSNazirH-+93dRypoLSGPiN9bLl_k_ME^uE>2@r|dB&$QkAjfvxPdf0b*
zUFea}8$xdiy(RRv(C37{AoPyVyF%{?eNpIrtDECHkUBoocE|UCI6kL`eW%xj9tpi6
z^rp~TLT?LwPUs6l?+Cps^q$Zch29r>Rc+GOFZ64TbUrS^OZd)dab2>|>r~r;#Y0i^
zRe>vk*Qr*VzpLr0z?G=2I_2K^yBetqT#31=)3G>zm-DK?l|9};oKLK*3Gi*6q(}P%
zIt8X}9~bW_ug&uhwSScGMdP1`%GV-4Lv#*=pmwTO=lIIhhOX;M*aiu7@r-@Kc;Fvl
z$b`6f$WCJ7F?v94#9F5Jf9NJ2XG_LIbM}ey1OJG6O^S<$>?9^0qq}%at(bV6Eg8?|
zPZ$sUBR!^kaPg3x#KdFtfZ8TMrq)e7&Sv8IoZe5V4U|Zz+_tcNias7YBmG18V*s@w
z>1bPHOVP8^UkzCQ;HP@8^iFC2i2E<yo_5vkw13F`54_~<AGz4Imf|2mF6vgt4mwZN
zhOYCJvaP2!eV5j@O3F4wZFH)0pR#SDHZFt0RueY9Ua}_Od{Z8E<lLs6ghWeY>3kZ0
zPd?NBG+)NQls{h?{{ido$!FT1=F46{_RQDE(HWb6eN;C8TIc-lUb_L;D|D{zBpt%G
z32O7)L@C=oYIBbFrfg+u^W3PEZFheF*L>TEYv>|f53_oO@I6X(fh&U>{euB@M{v`1
z8dn?fO40jn4>rd8)(Pi+l)xjxDSqu=7WfXrX%vlD1b!#s6p_Z)3%uGNDA6SwjYk&;
zaA;;<!bQx$yE@Jf21*?BHnlPKr`ViM__em5I|W`QJRVEn6~ez@{dWm`J>l0`e7C^M
z{jIpBtmCX*+#2U6B0SEa!1ogF=EeC>5KixQx}Q+^?;||Ut-ucw9>*i_a(`6xGaARU
zi*Of@!*>%-m*@1j+b%kMAK`9XhYu0%#&vkPKQ4}IT#P%e=)#=tf73;W?<0TLzr%+J
zcl|rO+&>`tKM?mnM7Zn6;kEwO(&z2CJ1%aG>mejOo_B!{5&lK%zgOU!2#@PW;I;m!
z=zmo7KPvinc&$G!{KvJwJ{R{X`t@`;^!|i)xgV73)T9^_-@nKU$Ekhz8l-ERYe=qb
zt^uGe<+FDG0iO<elgGIn_a83jevb;uYb!Jf5e0|)fbo3LO#sZXyLyd=mM%S_HXx;E
z=|TJHrh6#U{+E)Wx1AmDIdQ9JYU@*st7D7%vq21SFmK~cg@j(f>Lz8ssc1x%*+#E1
z#MGt@HLjzh`zvvuS2{P59wdHPw%pipS9EULaay#pY`L-HuISu!uTpO8xGOq0?$xT3
zv}|=_$6e96iQ+Ta^5({lyP|W`zDl{V<F4r3bgxow?6_AVH~e|$emWFzjw&j7Oz{}u
z*7)QK4_wh?8+aNJe2~B<ZMatt$+O|Uz0Zbw_-R|95j<>BNI}8nSY7wt)Ahphn^!M+
zHm;XE8`n#o4d1<3P(yOOD&6~s<kcs50X`cz0zMn~0%@D8-JM14>g=r6+BI{_^DAqK
z>m|>|^^#}fddahaYr>MP-7#&-uU)?ld<00|e&8mgZGlGcaBoq&Iv1<8cFo-45HR=2
z+V$JGUh?+iddaha*TRAtlH>6`yu2C&-vN?m1NXsa0}mo?bG5s_s9l|%)mpn|Zh3xX
zEpff%*|=WvY+NsSHt=N_Prp{!#&*OKOMi2$;+h!??&GsaobUC+CUNiqSR@{K@P-F(
zdhnJ9Z+q}L55C~RJ0862!FwKj(S!F>IDfX}&w9D%4)HmV=ah5ITnV&gzQLzpk$C9A
z!OLKg{*ec7c<`nNZ+Y;x2cPrc3m&}V!Mh&3=fM{}ct3@^c{87L^5@Oikk{<Ijd8kD
z^Olc?mg9n*<otji!y@s>gEu^Q(}TA>c-w=|dGG}f-tpjF58m_Oiypk6!ri=?_xSvI
zbMODzd2{D;nQ!omSR@{LaPX8^q<`eW8y>vr!CM}@?ZM|f_<{%Tc<`<V?|JY=58n6S
z;HN=Av5|Q*pVG7`frGc^@z*$PvY)y=KL;s1Or5u}AEfH>S04YmJ&$Dj&p-cGJ^m_n
zJ~GkJJv>|Q+5Yp-6IGADO1*Ae|E6xU{pZ(n)y`wuU!~?3b}hXJ%nx@w95eO<?`}JG
z2qL|kd;ph=+swa1Th23oj!(x0P97wU^XD<fIS$nuyUcjmFVg+Xhh)5Cu_+m^!Fhak
z0%s4B#`(Js<9OE68@bGQ*)Os?x~F8kp1QTyl6ZXHyX?XFIs^N=csE~WyzKYC;@!-l
zr0T)oJU%;>?e*L=&e!x9$Fq?Qnp2@qP?z8UvOn?3jK2o*Ic4DJu|N2K`aV4X4`)sK
zgZ~%%UxEY3{e%A(`+GQR=%0_4d4OokdX=0&i4%`ZpLa2y#EDaui-Rx7eM^7I9h5jY
zgz(3M#H*hE!7GG6cyH2QatwiUeiZTZ;{5pLmpF#8Kk@%!f6sh@1IYeV56=G*9K+aO
z_V1Z5*+12T>0fhH{TxH&1VZy`6nd2Tq50J~;*dDFhY(;Ampnv?gO3PF_anK9(jOc}
zNP4~`Us3vlw+KnsgXAzue{dNg;LS<g@)}`E;NUnyz!{YOmhT8t5-08>1YE*|zu`M-
z-kiiOClb3P#<l#&3~srS863Px2<17^kL6Hi{K2P$P~H;$mRp(e2hS2hN0;!ooXd<q
z_?G~yDSykw%=lBDwSP{YHBLKpW!@C=HPJ~r&z8HH>EH4=vp8`v0fLt&`?vhgj6b-Z
z05ivBc$^u3a6kcq2P*rwe9(+PxS;^STa^BmCz|mGXA}b7sPwn|(TqR1q!92orT=!r
z`^@-*V+zT>UJOpAL_gr3LWrW4z%36IrUVX7DkM|4o_N4jg=F&M@dt+$l8wjUWJ<&X
zZYw04pL~CCULoMU%6wY>D@+L-Tv$l<c=P)QM;4Mj-h6*>XCc}7GB}wM@$9nx;ML0b
zEx#6BC2(+UA=&uz`v(UXl8ry#AKY9>HhzPXDG?7iyO3-=`TpSY!XJEIdAyd_3sV9I
z#}{Dc^=|mP8Gmqp0fPT4`?oycj6XQR05j(wPygTw!#^9p!O4^uH?;oQ`1Ae2Eza-X
z;N1TJoFL#FL-2WnxX1GPH~7ae(b)PkiL~*xYV3_IX&WWP;8~lgoBaLaT8R6EpO!1l
z1?7J2^YZc5%z$zJnF~uVu!1MM3UbpnG4kWG)r}o@Mb}N2vtpi?EjM=D6`dP>MsxXD
zw%pipS9ETo_$;pzv~0Pt<F4r3w69Wb?6@mBH{Gk08$0fm$c-94`Q*vjnc2z5NaN?H
zkLz~p@uR~dY8X#D&C{qFK0G^fY<BAO>~LKTKmPPI?PH#wKCXsmrjAVl)X3!VqleY-
zk(m>xTJXJeXEcoQBxU-<?9}k-nc3mPC#Ih`dgRp1<m}NC)Mt!8bl-$hhrT^>$H}Ko
zP0mam`p(qM^whCK^~T}kpGw&eot%2=)YSChDcX70@yVmpN%zL#<PPp+X*D_9TL`;Z
zc;J`cb{RyC!v1i=PP0{4Hk~6{73Nrt$B5`Rcl2w;c#dtGzSqPuNS}=Hb3H8){HypL
zV8pgVYOS%I&O_IqsFm7Hn8wNP572>=evqCu-ZpDbYMk_03Pj^wrQU-Mf70jZbGzw3
zP!$T!^dG2Kpkv%9P}}I@g6Tg{Z$by&4pc;68cqL!dKWrys_jSV%eLu1Q1}j1u}L_%
zg>Zdcl465ylQFy1;t|CL9XRQ8+brIuK5n)+>90UXKV4d1@)nDe{w8$YKgD*d#c5vO
zg$|sK<x#q+WcsIIZ?iZZ%W3Esm-J0^+0^tOs9!+`-o?Qw{Qgv56{bT}FBk~0f(jc;
z@%u>5#HWA0oZm-&*79Afhp85d^)Ss^oOlIY;Fr-OfnI;JSPzrlROnta-p2Gszjx>L
z5YZl_6S7-hwO9|6?$*U?yP}j<M?IPOMZ9`l7(3tG=K1F~&p)?|^)QR|dXww*7VBXa
z>px$~^`DFNFmbNM`bo@(ht3>$y(W!a|B~OMi}f%~zCz2tM825Q2|W^cL+DMRw}jpn
z`kc@ggx(Q)SLi*VFABYHbu)j(dYE)~66LGw9Q<K?*T-BjXBawo!i+OF*vxIQ9wz6d
zSPv8Hg&ICBUR(SdZ<n89J<RlamBo6P#d?@Hk1|gcuN%otg{@c*Q*$%*x`@Skm}Y&*
zA;SmMIVsk|v^gOz9(Z{AH6zx~EY`y`_iy<73f_;gnRc-rCZ!^lm9ict<_Ek&+}6WJ
zw<XoV{SWFjJke$wu!oy>fA!W!fA;+~KfLYkd#nF?pUjsn`m>qvKG#s{HPMD0LVQej
zub1>W`kC3|Fr?&5)Ye}nEdC++QI{G+UmQEm=xF1(AWdqtcra??_#j(oV?WsO{{|%i
zl>O-TX6%RmH;Df`nAcxj_%+F=OT<j9H_F2O?Iyt9*43to{Vpc<#q$Jh$PLRu@@rB9
zT_SEgqtAV`;n-PTBR||<C0?}__~BkT?#XjbVAxClh!6X7h<L_!h_*r6SeoPq95EyR
z*azc1NgL-6|3|S!etCI0A^h<F6c-=L&+~VMAO5dmb2`7j9|%AEe?|PCMXYni!g>A)
r0rEI{6bK&&;^*V|1?lqs^EvVx;?C$9#|U|uHtF{s`Gqi8VZZ+Zn&r4p

diff --git a/examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir b/examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir
deleted file mode 100644
index 15c21c6..0000000
--- a/examples/elementwise_arith/air_project/aiecc_failure_1775797115_856352.mlir
+++ /dev/null
@@ -1,411 +0,0 @@
-#loop_annotation = #llvm.loop_annotation<mustProgress = true>
-module {
-  aie.device(npu2) @sub_kernel_0 {
-    %shim_noc_tile_0_0 = aie.tile(0, 0)
-    %shim_noc_tile_1_0 = aie.tile(1, 0)
-    %shim_noc_tile_2_0 = aie.tile(2, 0)
-    %mem_tile_0_1 = aie.tile(0, 1)
-    %mem_tile_1_1 = aie.tile(1, 1)
-    %mem_tile_2_1 = aie.tile(2, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %tile_0_3 = aie.tile(0, 3)
-    %tile_0_4 = aie.tile(0, 4)
-    %tile_0_5 = aie.tile(0, 5)
-    %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32}
-    %lock_1_1_0 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32}
-    %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32}
-    %lock_0_1_1 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32}
-    %lock_2_1 = aie.lock(%mem_tile_2_1, 1) {init = 4 : i32}
-    %lock_2_1_2 = aie.lock(%mem_tile_2_1, 0) {init = 0 : i32}
-    %lock_0_2 = aie.lock(%tile_0_2, 5) {init = 1 : i32}
-    %lock_0_2_3 = aie.lock(%tile_0_2, 4) {init = 0 : i32}
-    %lock_0_2_4 = aie.lock(%tile_0_2, 3) {init = 1 : i32}
-    %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i32}
-    %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i32}
-    %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i32}
-    %lock_0_3 = aie.lock(%tile_0_3, 5) {init = 1 : i32}
-    %lock_0_3_8 = aie.lock(%tile_0_3, 4) {init = 0 : i32}
-    %lock_0_3_9 = aie.lock(%tile_0_3, 3) {init = 1 : i32}
-    %lock_0_3_10 = aie.lock(%tile_0_3, 2) {init = 0 : i32}
-    %lock_0_3_11 = aie.lock(%tile_0_3, 1) {init = 1 : i32}
-    %lock_0_3_12 = aie.lock(%tile_0_3, 0) {init = 0 : i32}
-    %lock_0_4 = aie.lock(%tile_0_4, 5) {init = 1 : i32}
-    %lock_0_4_13 = aie.lock(%tile_0_4, 4) {init = 0 : i32}
-    %lock_0_4_14 = aie.lock(%tile_0_4, 3) {init = 1 : i32}
-    %lock_0_4_15 = aie.lock(%tile_0_4, 2) {init = 0 : i32}
-    %lock_0_4_16 = aie.lock(%tile_0_4, 1) {init = 1 : i32}
-    %lock_0_4_17 = aie.lock(%tile_0_4, 0) {init = 0 : i32}
-    %lock_0_5 = aie.lock(%tile_0_5, 5) {init = 1 : i32}
-    %lock_0_5_18 = aie.lock(%tile_0_5, 4) {init = 0 : i32}
-    %lock_0_5_19 = aie.lock(%tile_0_5, 3) {init = 1 : i32}
-    %lock_0_5_20 = aie.lock(%tile_0_5, 2) {init = 0 : i32}
-    %lock_0_5_21 = aie.lock(%tile_0_5, 1) {init = 1 : i32}
-    %lock_0_5_22 = aie.lock(%tile_0_5, 0) {init = 0 : i32}
-    %buf14 = aie.buffer(%mem_tile_0_1) {sym_name = "buf14"} : memref<1024xi16, 1 : i32> 
-    %buf13 = aie.buffer(%mem_tile_1_1) {sym_name = "buf13"} : memref<1024xi16, 1 : i32> 
-    %buf12 = aie.buffer(%mem_tile_2_1) {sym_name = "buf12"} : memref<1024xi16, 1> 
-    %buf11 = aie.buffer(%tile_0_5) {sym_name = "buf11"} : memref<256xi16, 2> 
-    %buf10 = aie.buffer(%tile_0_5) {sym_name = "buf10"} : memref<256xi16, 2> 
-    %buf9 = aie.buffer(%tile_0_5) {sym_name = "buf9"} : memref<256xi16, 2> 
-    %buf8 = aie.buffer(%tile_0_4) {sym_name = "buf8"} : memref<256xi16, 2> 
-    %buf7 = aie.buffer(%tile_0_4) {sym_name = "buf7"} : memref<256xi16, 2> 
-    %buf6 = aie.buffer(%tile_0_4) {sym_name = "buf6"} : memref<256xi16, 2> 
-    %buf5 = aie.buffer(%tile_0_3) {sym_name = "buf5"} : memref<256xi16, 2> 
-    %buf4 = aie.buffer(%tile_0_3) {sym_name = "buf4"} : memref<256xi16, 2> 
-    %buf3 = aie.buffer(%tile_0_3) {sym_name = "buf3"} : memref<256xi16, 2> 
-    %buf2 = aie.buffer(%tile_0_2) {sym_name = "buf2"} : memref<256xi16, 2> 
-    %buf1 = aie.buffer(%tile_0_2) {sym_name = "buf1"} : memref<256xi16, 2> 
-    %buf0 = aie.buffer(%tile_0_2) {sym_name = "buf0"} : memref<256xi16, 2> 
-    %mem_0_5 = aie.mem(%tile_0_5) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_5_22, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_21, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb5
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_5_19, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf11 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_20, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf10 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_18, Release, 1)
-      aie.next_bd ^bb6
-    }
-    %core_0_5 = aie.core(%tile_0_5) {
-      %0 = ub.poison : i16
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_5_21, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_5_20, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_5_18, AcquireGreaterEqual, 1)
-      scf.for %arg0 = %c0 to %c256 step %c32 {
-        %subview = memref.subview %buf11[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_23 = memref.subview %buf10[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_24 = memref.subview %buf9[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %3 = arith.subi %1, %2 : vector<32xi16>
-        vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-      } {loop_annotation = #loop_annotation}
-      aie.use_lock(%lock_0_5_19, Release, 1)
-      aie.use_lock(%lock_0_5, Release, 1)
-      aie.use_lock(%lock_0_5_22, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_4 = aie.mem(%tile_0_4) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_4_17, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_16, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb5
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_4_14, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_15, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_13, Release, 1)
-      aie.next_bd ^bb6
-    }
-    %core_0_4 = aie.core(%tile_0_4) {
-      %0 = ub.poison : i16
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_4_16, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_4_15, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_4_13, AcquireGreaterEqual, 1)
-      scf.for %arg0 = %c0 to %c256 step %c32 {
-        %subview = memref.subview %buf8[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_23 = memref.subview %buf7[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_24 = memref.subview %buf6[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %3 = arith.subi %1, %2 : vector<32xi16>
-        vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-      } {loop_annotation = #loop_annotation}
-      aie.use_lock(%lock_0_4_14, Release, 1)
-      aie.use_lock(%lock_0_4, Release, 1)
-      aie.use_lock(%lock_0_4_17, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_3 = aie.mem(%tile_0_3) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_3_12, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_11, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb5
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_3_9, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_10, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_8, Release, 1)
-      aie.next_bd ^bb6
-    }
-    %core_0_3 = aie.core(%tile_0_3) {
-      %0 = ub.poison : i16
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_3_11, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_3_10, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_3_8, AcquireGreaterEqual, 1)
-      scf.for %arg0 = %c0 to %c256 step %c32 {
-        %subview = memref.subview %buf5[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_23 = memref.subview %buf4[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_24 = memref.subview %buf3[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %3 = arith.subi %1, %2 : vector<32xi16>
-        vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-      } {loop_annotation = #loop_annotation}
-      aie.use_lock(%lock_0_3_9, Release, 1)
-      aie.use_lock(%lock_0_3, Release, 1)
-      aie.use_lock(%lock_0_3_12, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_2 = aie.mem(%tile_0_2) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_2_7, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_6, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb5
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_5, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_3, Release, 1)
-      aie.next_bd ^bb6
-    }
-    %core_0_2 = aie.core(%tile_0_2) {
-      %0 = ub.poison : i16
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1)
-      scf.for %arg0 = %c0 to %c256 step %c32 {
-        %subview = memref.subview %buf2[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_23 = memref.subview %buf1[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_24 = memref.subview %buf0[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %2 = vector.transfer_read %subview_23[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %3 = arith.subi %1, %2 : vector<32xi16>
-        vector.transfer_write %3, %subview_24[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-      } {loop_annotation = #loop_annotation}
-      aie.use_lock(%lock_0_2_4, Release, 1)
-      aie.use_lock(%lock_0_2, Release, 1)
-      aie.use_lock(%lock_0_2_7, Release, 1)
-      cf.br ^bb1
-    }
-    aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0)
-    aie.flow(%shim_noc_tile_1_0, DMA : 0, %mem_tile_1_1, DMA : 0)
-    aie.flow(%mem_tile_2_1, DMA : 0, %shim_noc_tile_2_0, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0)
-    aie.flow(%mem_tile_1_1, DMA : 0, %tile_0_2, DMA : 1)
-    aie.flow(%mem_tile_1_1, DMA : 1, %tile_0_3, DMA : 1)
-    aie.flow(%mem_tile_1_1, DMA : 2, %tile_0_4, DMA : 1)
-    aie.flow(%mem_tile_1_1, DMA : 3, %tile_0_5, DMA : 1)
-    aie.flow(%tile_0_2, DMA : 0, %mem_tile_2_1, DMA : 0)
-    aie.flow(%tile_0_3, DMA : 0, %mem_tile_2_1, DMA : 1)
-    aie.flow(%tile_0_4, DMA : 0, %mem_tile_2_1, DMA : 2)
-    aie.flow(%tile_0_5, DMA : 0, %mem_tile_2_1, DMA : 3)
-    %memtile_dma_2_1 = aie.memtile_dma(%mem_tile_2_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_2_1_2, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 1024) {task_id = 0 : i32}
-      aie.use_lock(%lock_2_1, Release, 4)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_2_1_2, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 256, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_2_1_2, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 512, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_2_1_2, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 768, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_2_1_2, Release, 1)
-      aie.next_bd ^bb10
-    }
-    %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1_1, Release, 4)
-      aie.next_bd ^bb10
-    }
-    %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_0, Release, 4)
-      aie.next_bd ^bb10
-    }
-    aie.shim_dma_allocation @air_channel_5(%shim_noc_tile_2_0, S2MM, 0)
-    aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0)
-    aie.shim_dma_allocation @air_channel_1(%shim_noc_tile_1_0, MM2S, 0)
-    aie.runtime_sequence @sub_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
-      %0 = aiex.dma_configure_task_for @air_channel_0 {
-        aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
-        aie.end
-      }
-      aiex.dma_start_task(%0)
-      %1 = aiex.dma_configure_task_for @air_channel_1 {
-        aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
-        aie.end
-      }
-      aiex.dma_start_task(%1)
-      %2 = aiex.dma_configure_task_for @air_channel_5 {
-        aie.dma_bd(%arg2 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
-        aie.end
-      } {issue_token = true}
-      aiex.dma_start_task(%2)
-      aiex.dma_free_task(%0)
-      aiex.dma_await_task(%2)
-      aiex.dma_free_task(%1)
-    }
-  } {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>}
-  aie.device(npu2) {
-    aie.runtime_sequence @sub_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
-      aiex.configure @sub_kernel_0 {
-        aiex.run @sub_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (memref<*xi16>, memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32)
-      }
-    }
-  }
-}
diff --git a/examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir b/examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir
deleted file mode 100644
index fc9d492..0000000
--- a/examples/elementwise_arith/air_project/aiecc_failure_1775797139_858651.mlir
+++ /dev/null
@@ -1,601 +0,0 @@
-#loop_annotation = #llvm.loop_annotation<mustProgress = true>
-module {
-  aie.device(npu2) @mul_kernel_0 {
-    %shim_noc_tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
-    %shim_noc_tile_1_0 = aie.tile(1, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
-    %shim_noc_tile_2_0 = aie.tile(2, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
-    %mem_tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
-    %mem_tile_1_1 = aie.tile(1, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
-    %mem_tile_2_1 = aie.tile(2, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
-    %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 27>}
-    %tile_0_3 = aie.tile(0, 3) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 29>}
-    %tile_0_4 = aie.tile(0, 4) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 30>}
-    %tile_0_5 = aie.tile(0, 5) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 31>}
-    %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32}
-    %lock_1_1_0 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32}
-    %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32}
-    %lock_0_1_1 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32}
-    %lock_2_1 = aie.lock(%mem_tile_2_1, 1) {init = 4 : i32}
-    %lock_2_1_2 = aie.lock(%mem_tile_2_1, 0) {init = 0 : i32}
-    %lock_0_2 = aie.lock(%tile_0_2, 5) {init = 1 : i32}
-    %lock_0_2_3 = aie.lock(%tile_0_2, 4) {init = 0 : i32}
-    %lock_0_2_4 = aie.lock(%tile_0_2, 3) {init = 1 : i32}
-    %lock_0_2_5 = aie.lock(%tile_0_2, 2) {init = 0 : i32}
-    %lock_0_2_6 = aie.lock(%tile_0_2, 1) {init = 1 : i32}
-    %lock_0_2_7 = aie.lock(%tile_0_2, 0) {init = 0 : i32}
-    %lock_0_3 = aie.lock(%tile_0_3, 5) {init = 1 : i32}
-    %lock_0_3_8 = aie.lock(%tile_0_3, 4) {init = 0 : i32}
-    %lock_0_3_9 = aie.lock(%tile_0_3, 3) {init = 1 : i32}
-    %lock_0_3_10 = aie.lock(%tile_0_3, 2) {init = 0 : i32}
-    %lock_0_3_11 = aie.lock(%tile_0_3, 1) {init = 1 : i32}
-    %lock_0_3_12 = aie.lock(%tile_0_3, 0) {init = 0 : i32}
-    %lock_0_4 = aie.lock(%tile_0_4, 5) {init = 1 : i32}
-    %lock_0_4_13 = aie.lock(%tile_0_4, 4) {init = 0 : i32}
-    %lock_0_4_14 = aie.lock(%tile_0_4, 3) {init = 1 : i32}
-    %lock_0_4_15 = aie.lock(%tile_0_4, 2) {init = 0 : i32}
-    %lock_0_4_16 = aie.lock(%tile_0_4, 1) {init = 1 : i32}
-    %lock_0_4_17 = aie.lock(%tile_0_4, 0) {init = 0 : i32}
-    %lock_0_5 = aie.lock(%tile_0_5, 5) {init = 1 : i32}
-    %lock_0_5_18 = aie.lock(%tile_0_5, 4) {init = 0 : i32}
-    %lock_0_5_19 = aie.lock(%tile_0_5, 3) {init = 1 : i32}
-    %lock_0_5_20 = aie.lock(%tile_0_5, 2) {init = 0 : i32}
-    %lock_0_5_21 = aie.lock(%tile_0_5, 1) {init = 1 : i32}
-    %lock_0_5_22 = aie.lock(%tile_0_5, 0) {init = 0 : i32}
-    %buf14 = aie.buffer(%mem_tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf14"} : memref<1024xi16, 1 : i32> 
-    %buf13 = aie.buffer(%mem_tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf13"} : memref<1024xi16, 1 : i32> 
-    %buf12 = aie.buffer(%mem_tile_2_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf12"} : memref<1024xi16, 1> 
-    %buf11 = aie.buffer(%tile_0_5) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf11"} : memref<256xi16, 2> 
-    %buf10 = aie.buffer(%tile_0_5) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf10"} : memref<256xi16, 2> 
-    %buf9 = aie.buffer(%tile_0_5) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf9"} : memref<256xi16, 2> 
-    %buf8 = aie.buffer(%tile_0_4) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf8"} : memref<256xi16, 2> 
-    %buf7 = aie.buffer(%tile_0_4) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf7"} : memref<256xi16, 2> 
-    %buf6 = aie.buffer(%tile_0_4) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf6"} : memref<256xi16, 2> 
-    %buf5 = aie.buffer(%tile_0_3) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<256xi16, 2> 
-    %buf4 = aie.buffer(%tile_0_3) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf4"} : memref<256xi16, 2> 
-    %buf3 = aie.buffer(%tile_0_3) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf3"} : memref<256xi16, 2> 
-    %buf2 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf2"} : memref<256xi16, 2> 
-    %buf1 = aie.buffer(%tile_0_2) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf1"} : memref<256xi16, 2> 
-    %buf0 = aie.buffer(%tile_0_2) {address = 32768 : i32, mem_bank = 2 : i32, sym_name = "buf0"} : memref<256xi16, 2> 
-    %mem_0_5 = aie.mem(%tile_0_5) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_5_22, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_21, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb5
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_5_19, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf11 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_20, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf10 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_18, Release, 1)
-      aie.next_bd ^bb6
-    }
-    %core_0_5 = aie.core(%tile_0_5) {
-      %c0_i32 = arith.constant 0 : i32
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb4
-      aie.use_lock(%lock_0_5_21, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_5_20, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_5_18, AcquireGreaterEqual, 1)
-      cf.br ^bb2(%c0 : index)
-    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
-      %1 = arith.cmpi slt, %0, %c256 : index
-      cf.cond_br %1, ^bb3, ^bb4
-    ^bb3:  // pred: ^bb2
-      %2 = vector.load %buf11[%0] : memref<256xi16, 2>, vector<32xi16>
-      %3 = vector.load %buf10[%0] : memref<256xi16, 2>, vector<32xi16>
-      %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32>
-      %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
-      vector.store %5, %buf9[%0] : memref<256xi16, 2>, vector<32xi16>
-      %6 = arith.addi %0, %c32 : index
-      cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation}
-    ^bb4:  // pred: ^bb2
-      aie.use_lock(%lock_0_5_19, Release, 1)
-      aie.use_lock(%lock_0_5, Release, 1)
-      aie.use_lock(%lock_0_5_22, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_4 = aie.mem(%tile_0_4) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_4_17, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_16, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb5
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_4_14, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_15, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_13, Release, 1)
-      aie.next_bd ^bb6
-    }
-    %core_0_4 = aie.core(%tile_0_4) {
-      %c0_i32 = arith.constant 0 : i32
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb4
-      aie.use_lock(%lock_0_4_16, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_4_15, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_4_13, AcquireGreaterEqual, 1)
-      cf.br ^bb2(%c0 : index)
-    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
-      %1 = arith.cmpi slt, %0, %c256 : index
-      cf.cond_br %1, ^bb3, ^bb4
-    ^bb3:  // pred: ^bb2
-      %2 = vector.load %buf8[%0] : memref<256xi16, 2>, vector<32xi16>
-      %3 = vector.load %buf7[%0] : memref<256xi16, 2>, vector<32xi16>
-      %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32>
-      %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
-      vector.store %5, %buf6[%0] : memref<256xi16, 2>, vector<32xi16>
-      %6 = arith.addi %0, %c32 : index
-      cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation}
-    ^bb4:  // pred: ^bb2
-      aie.use_lock(%lock_0_4_14, Release, 1)
-      aie.use_lock(%lock_0_4, Release, 1)
-      aie.use_lock(%lock_0_4_17, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_3 = aie.mem(%tile_0_3) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_3_12, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_11, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb5
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_3_9, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_10, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_8, Release, 1)
-      aie.next_bd ^bb6
-    }
-    %core_0_3 = aie.core(%tile_0_3) {
-      %c0_i32 = arith.constant 0 : i32
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb4
-      aie.use_lock(%lock_0_3_11, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_3_10, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_3_8, AcquireGreaterEqual, 1)
-      cf.br ^bb2(%c0 : index)
-    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
-      %1 = arith.cmpi slt, %0, %c256 : index
-      cf.cond_br %1, ^bb3, ^bb4
-    ^bb3:  // pred: ^bb2
-      %2 = vector.load %buf5[%0] : memref<256xi16, 2>, vector<32xi16>
-      %3 = vector.load %buf4[%0] : memref<256xi16, 2>, vector<32xi16>
-      %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32>
-      %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
-      vector.store %5, %buf3[%0] : memref<256xi16, 2>, vector<32xi16>
-      %6 = arith.addi %0, %c32 : index
-      cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation}
-    ^bb4:  // pred: ^bb2
-      aie.use_lock(%lock_0_3_9, Release, 1)
-      aie.use_lock(%lock_0_3, Release, 1)
-      aie.use_lock(%lock_0_3_12, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_2 = aie.mem(%tile_0_2) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_2_7, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_6, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb5
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_5, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb2)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_3, Release, 1)
-      aie.next_bd ^bb6
-    }
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0_i32 = arith.constant 0 : i32
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb4
-      aie.use_lock(%lock_0_2_6, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_2_5, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1)
-      cf.br ^bb2(%c0 : index)
-    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
-      %1 = arith.cmpi slt, %0, %c256 : index
-      cf.cond_br %1, ^bb3, ^bb4
-    ^bb3:  // pred: ^bb2
-      %2 = vector.load %buf2[%0] : memref<256xi16, 2>, vector<32xi16>
-      %3 = vector.load %buf1[%0] : memref<256xi16, 2>, vector<32xi16>
-      %4 = aievec.mul_elem %2, %3 : vector<32xi16>, vector<32xi16>, vector<32xi32>
-      %5 = aievec.srs %4, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
-      vector.store %5, %buf0[%0] : memref<256xi16, 2>, vector<32xi16>
-      %6 = arith.addi %0, %c32 : index
-      cf.br ^bb2(%6 : index) {loop_annotation = #loop_annotation}
-    ^bb4:  // pred: ^bb2
-      aie.use_lock(%lock_0_2_4, Release, 1)
-      aie.use_lock(%lock_0_2, Release, 1)
-      aie.use_lock(%lock_0_2_7, Release, 1)
-      cf.br ^bb1
-    }
-    %memtile_dma_2_1 = aie.memtile_dma(%mem_tile_2_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_2_1_2, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 1024) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_2_1, Release, 4)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_2_1_2, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_2_1_2, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 512, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_2_1_2, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_2_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf12 : memref<1024xi16, 1>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_2_1_2, Release, 1)
-      aie.next_bd ^bb10
-    }
-    %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_0_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf14 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1_1, Release, 4)
-      aie.next_bd ^bb10
-    }
-    %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_1_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf13 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_0, Release, 4)
-      aie.next_bd ^bb10
-    }
-    aie.shim_dma_allocation @air_channel_5(%shim_noc_tile_2_0, S2MM, 0)
-    aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0)
-    aie.shim_dma_allocation @air_channel_1(%shim_noc_tile_1_0, MM2S, 0)
-    aie.runtime_sequence @mul_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
-      %0 = aiex.dma_configure_task_for @air_channel_0 {
-        aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
-        aie.end
-      }
-      aiex.dma_start_task(%0)
-      %1 = aiex.dma_configure_task_for @air_channel_1 {
-        aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
-        aie.end
-      }
-      aiex.dma_start_task(%1)
-      %2 = aiex.dma_configure_task_for @air_channel_5 {
-        aie.dma_bd(%arg2 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
-        aie.end
-      } {issue_token = true}
-      aiex.dma_start_task(%2)
-      aiex.dma_free_task(%0)
-      aiex.dma_await_task(%2)
-      aiex.dma_free_task(%1)
-    }
-    aie.packet_flow(15) {
-      aie.packet_source<%shim_noc_tile_0_0, TileControl : 0>
-      aie.packet_dest<%shim_noc_tile_0_0, South : 0>
-    } {keep_pkt_header = true, priority_route = true}
-    aie.packet_flow(15) {
-      aie.packet_source<%shim_noc_tile_1_0, TileControl : 0>
-      aie.packet_dest<%shim_noc_tile_1_0, South : 0>
-    } {keep_pkt_header = true, priority_route = true}
-    aie.packet_flow(15) {
-      aie.packet_source<%shim_noc_tile_2_0, TileControl : 0>
-      aie.packet_dest<%shim_noc_tile_2_0, South : 0>
-    } {keep_pkt_header = true, priority_route = true}
-    %switchbox_0_0 = aie.switchbox(%shim_noc_tile_0_0) {
-      aie.connect<South : 3, North : 3>
-      %0 = aie.amsel<5> (3)
-      %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true}
-      aie.packet_rules(TileControl : 0) {
-        aie.rule(31, 15, %0)
-      }
-    }
-    %shim_mux_0_0 = aie.shim_mux(%shim_noc_tile_0_0) {
-      aie.connect<DMA : 0, North : 3>
-    }
-    %switchbox_0_1 = aie.switchbox(%mem_tile_0_1) {
-      aie.connect<South : 3, DMA : 0>
-      aie.connect<DMA : 0, North : 1>
-      aie.connect<DMA : 1, North : 5>
-      aie.connect<DMA : 2, North : 0>
-      aie.connect<DMA : 3, North : 3>
-    }
-    %switchbox_1_0 = aie.switchbox(%shim_noc_tile_1_0) {
-      aie.connect<South : 3, North : 1>
-      %0 = aie.amsel<5> (3)
-      %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true}
-      aie.packet_rules(TileControl : 0) {
-        aie.rule(31, 15, %0)
-      }
-    }
-    %shim_mux_1_0 = aie.shim_mux(%shim_noc_tile_1_0) {
-      aie.connect<DMA : 0, North : 3>
-    }
-    %switchbox_1_1 = aie.switchbox(%mem_tile_1_1) {
-      aie.connect<South : 1, DMA : 0>
-      aie.connect<DMA : 0, North : 1>
-      aie.connect<DMA : 1, North : 5>
-      aie.connect<DMA : 2, North : 0>
-      aie.connect<DMA : 3, North : 3>
-    }
-    %switchbox_2_0 = aie.switchbox(%shim_noc_tile_2_0) {
-      aie.connect<North : 2, South : 2>
-      %0 = aie.amsel<5> (3)
-      %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true}
-      aie.packet_rules(TileControl : 0) {
-        aie.rule(31, 15, %0)
-      }
-    }
-    %shim_mux_2_0 = aie.shim_mux(%shim_noc_tile_2_0) {
-      aie.connect<North : 2, DMA : 0>
-    }
-    %switchbox_2_1 = aie.switchbox(%mem_tile_2_1) {
-      aie.connect<DMA : 0, South : 2>
-      aie.connect<North : 2, DMA : 0>
-      aie.connect<North : 1, DMA : 1>
-      aie.connect<North : 0, DMA : 2>
-      aie.connect<North : 3, DMA : 3>
-    }
-    %switchbox_0_2 = aie.switchbox(%tile_0_2) {
-      aie.connect<South : 1, DMA : 0>
-      aie.connect<South : 5, North : 3>
-      aie.connect<South : 0, North : 5>
-      aie.connect<South : 3, North : 4>
-      aie.connect<East : 3, DMA : 1>
-      aie.connect<DMA : 0, East : 0>
-    }
-    %switchbox_0_3 = aie.switchbox(%tile_0_3) {
-      aie.connect<South : 3, DMA : 0>
-      aie.connect<South : 5, North : 0>
-      aie.connect<South : 4, North : 2>
-      aie.connect<East : 2, DMA : 1>
-      aie.connect<East : 0, North : 5>
-      aie.connect<DMA : 0, East : 0>
-    }
-    %switchbox_0_4 = aie.switchbox(%tile_0_4) {
-      aie.connect<South : 0, DMA : 0>
-      aie.connect<South : 2, North : 0>
-      aie.connect<South : 5, DMA : 1>
-      aie.connect<East : 2, North : 5>
-      aie.connect<DMA : 0, East : 0>
-      aie.connect<North : 0, East : 3>
-    }
-    %switchbox_0_5 = aie.switchbox(%tile_0_5) {
-      aie.connect<South : 0, DMA : 0>
-      aie.connect<South : 5, DMA : 1>
-      aie.connect<DMA : 0, South : 0>
-    }
-    %tile_1_2 = aie.tile(1, 2)
-    %switchbox_1_2 = aie.switchbox(%tile_1_2) {
-      aie.connect<South : 1, West : 3>
-      aie.connect<South : 5, North : 1>
-      aie.connect<South : 0, North : 2>
-      aie.connect<South : 3, North : 0>
-      aie.connect<West : 0, East : 1>
-      aie.connect<North : 3, East : 3>
-    }
-    %tile_1_3 = aie.tile(1, 3)
-    %switchbox_1_3 = aie.switchbox(%tile_1_3) {
-      aie.connect<South : 1, West : 2>
-      aie.connect<South : 2, West : 0>
-      aie.connect<South : 0, North : 0>
-      aie.connect<West : 0, East : 1>
-      aie.connect<North : 1, South : 3>
-    }
-    %tile_1_4 = aie.tile(1, 4)
-    %switchbox_1_4 = aie.switchbox(%tile_1_4) {
-      aie.connect<South : 0, West : 2>
-      aie.connect<West : 0, South : 1>
-      aie.connect<West : 3, East : 3>
-    }
-    %tile_2_2 = aie.tile(2, 2)
-    %switchbox_2_2 = aie.switchbox(%tile_2_2) {
-      aie.connect<West : 1, South : 2>
-      aie.connect<North : 3, South : 1>
-      aie.connect<West : 3, South : 0>
-      aie.connect<North : 0, South : 3>
-    }
-    %tile_2_3 = aie.tile(2, 3)
-    %switchbox_2_3 = aie.switchbox(%tile_2_3) {
-      aie.connect<West : 1, South : 3>
-      aie.connect<North : 1, South : 0>
-    }
-    %tile_2_4 = aie.tile(2, 4)
-    %switchbox_2_4 = aie.switchbox(%tile_2_4) {
-      aie.connect<West : 3, South : 1>
-    }
-    aie.wire(%shim_mux_0_0 : North, %switchbox_0_0 : South)
-    aie.wire(%shim_noc_tile_0_0 : DMA, %shim_mux_0_0 : DMA)
-    aie.wire(%mem_tile_0_1 : Core, %switchbox_0_1 : Core)
-    aie.wire(%mem_tile_0_1 : DMA, %switchbox_0_1 : DMA)
-    aie.wire(%switchbox_0_0 : North, %switchbox_0_1 : South)
-    aie.wire(%tile_0_2 : Core, %switchbox_0_2 : Core)
-    aie.wire(%tile_0_2 : DMA, %switchbox_0_2 : DMA)
-    aie.wire(%switchbox_0_1 : North, %switchbox_0_2 : South)
-    aie.wire(%tile_0_3 : Core, %switchbox_0_3 : Core)
-    aie.wire(%tile_0_3 : DMA, %switchbox_0_3 : DMA)
-    aie.wire(%switchbox_0_2 : North, %switchbox_0_3 : South)
-    aie.wire(%tile_0_4 : Core, %switchbox_0_4 : Core)
-    aie.wire(%tile_0_4 : DMA, %switchbox_0_4 : DMA)
-    aie.wire(%switchbox_0_3 : North, %switchbox_0_4 : South)
-    aie.wire(%tile_0_5 : Core, %switchbox_0_5 : Core)
-    aie.wire(%tile_0_5 : DMA, %switchbox_0_5 : DMA)
-    aie.wire(%switchbox_0_4 : North, %switchbox_0_5 : South)
-    aie.wire(%switchbox_0_0 : East, %switchbox_1_0 : West)
-    aie.wire(%shim_mux_1_0 : North, %switchbox_1_0 : South)
-    aie.wire(%shim_noc_tile_1_0 : DMA, %shim_mux_1_0 : DMA)
-    aie.wire(%switchbox_0_1 : East, %switchbox_1_1 : West)
-    aie.wire(%mem_tile_1_1 : Core, %switchbox_1_1 : Core)
-    aie.wire(%mem_tile_1_1 : DMA, %switchbox_1_1 : DMA)
-    aie.wire(%switchbox_1_0 : North, %switchbox_1_1 : South)
-    aie.wire(%switchbox_0_2 : East, %switchbox_1_2 : West)
-    aie.wire(%tile_1_2 : Core, %switchbox_1_2 : Core)
-    aie.wire(%tile_1_2 : DMA, %switchbox_1_2 : DMA)
-    aie.wire(%switchbox_1_1 : North, %switchbox_1_2 : South)
-    aie.wire(%switchbox_0_3 : East, %switchbox_1_3 : West)
-    aie.wire(%tile_1_3 : Core, %switchbox_1_3 : Core)
-    aie.wire(%tile_1_3 : DMA, %switchbox_1_3 : DMA)
-    aie.wire(%switchbox_1_2 : North, %switchbox_1_3 : South)
-    aie.wire(%switchbox_0_4 : East, %switchbox_1_4 : West)
-    aie.wire(%tile_1_4 : Core, %switchbox_1_4 : Core)
-    aie.wire(%tile_1_4 : DMA, %switchbox_1_4 : DMA)
-    aie.wire(%switchbox_1_3 : North, %switchbox_1_4 : South)
-    aie.wire(%switchbox_1_0 : East, %switchbox_2_0 : West)
-    aie.wire(%shim_mux_2_0 : North, %switchbox_2_0 : South)
-    aie.wire(%shim_noc_tile_2_0 : DMA, %shim_mux_2_0 : DMA)
-    aie.wire(%switchbox_1_1 : East, %switchbox_2_1 : West)
-    aie.wire(%mem_tile_2_1 : Core, %switchbox_2_1 : Core)
-    aie.wire(%mem_tile_2_1 : DMA, %switchbox_2_1 : DMA)
-    aie.wire(%switchbox_2_0 : North, %switchbox_2_1 : South)
-    aie.wire(%switchbox_1_2 : East, %switchbox_2_2 : West)
-    aie.wire(%tile_2_2 : Core, %switchbox_2_2 : Core)
-    aie.wire(%tile_2_2 : DMA, %switchbox_2_2 : DMA)
-    aie.wire(%switchbox_2_1 : North, %switchbox_2_2 : South)
-    aie.wire(%switchbox_1_3 : East, %switchbox_2_3 : West)
-    aie.wire(%tile_2_3 : Core, %switchbox_2_3 : Core)
-    aie.wire(%tile_2_3 : DMA, %switchbox_2_3 : DMA)
-    aie.wire(%switchbox_2_2 : North, %switchbox_2_3 : South)
-    aie.wire(%switchbox_1_4 : East, %switchbox_2_4 : West)
-    aie.wire(%tile_2_4 : Core, %switchbox_2_4 : Core)
-    aie.wire(%tile_2_4 : DMA, %switchbox_2_4 : DMA)
-    aie.wire(%switchbox_2_3 : North, %switchbox_2_4 : South)
-  } {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>}
-  aie.device(npu2) {
-    aie.runtime_sequence @mul_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: memref<*xi16>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
-      aiex.configure @mul_kernel_0 {
-        aiex.run @mul_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7, %arg8) : (memref<*xi16>, memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32)
-      }
-    }
-  }
-}
diff --git a/examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir b/examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir
deleted file mode 100644
index 918aa51..0000000
--- a/examples/elementwise_arith/air_project/aiecc_failure_1775797174_862028.mlir
+++ /dev/null
@@ -1,431 +0,0 @@
-#loop_annotation = #llvm.loop_annotation<mustProgress = true>
-module {
-  aie.device(npu2) @square_kernel_0 {
-    %shim_noc_tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
-    %shim_noc_tile_1_0 = aie.tile(1, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
-    %mem_tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
-    %mem_tile_1_1 = aie.tile(1, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
-    %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 27>}
-    %tile_0_3 = aie.tile(0, 3) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 29>}
-    %tile_0_4 = aie.tile(0, 4) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 30>}
-    %tile_0_5 = aie.tile(0, 5) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 31>}
-    %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32}
-    %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32}
-    %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32}
-    %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32}
-    %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32}
-    %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32}
-    %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32}
-    %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32}
-    %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32}
-    %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32}
-    %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32}
-    %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32}
-    %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32}
-    %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32}
-    %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32}
-    %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32}
-    %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32}
-    %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32}
-    %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32}
-    %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32}
-    %buf9 = aie.buffer(%mem_tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf9"} : memref<1024xi16, 1 : i32> 
-    %buf8 = aie.buffer(%mem_tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf8"} : memref<1024xi16, 1> 
-    %buf7 = aie.buffer(%tile_0_5) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf7"} : memref<256xi16, 2> 
-    %buf6 = aie.buffer(%tile_0_5) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf6"} : memref<256xi16, 2> 
-    %buf5 = aie.buffer(%tile_0_4) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<256xi16, 2> 
-    %buf4 = aie.buffer(%tile_0_4) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf4"} : memref<256xi16, 2> 
-    %buf3 = aie.buffer(%tile_0_3) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<256xi16, 2> 
-    %buf2 = aie.buffer(%tile_0_3) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf2"} : memref<256xi16, 2> 
-    %buf1 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<256xi16, 2> 
-    %buf0 = aie.buffer(%tile_0_2) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf0"} : memref<256xi16, 2> 
-    %mem_0_5 = aie.mem(%tile_0_5) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_12, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_11, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_5 = aie.core(%tile_0_5) {
-      %c0_i32 = arith.constant 0 : i32
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb4
-      aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1)
-      cf.br ^bb2(%c0 : index)
-    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
-      %1 = arith.cmpi slt, %0, %c256 : index
-      cf.cond_br %1, ^bb3, ^bb4
-    ^bb3:  // pred: ^bb2
-      %2 = vector.load %buf7[%0] : memref<256xi16, 2>, vector<32xi16>
-      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
-      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
-      vector.store %4, %buf6[%0] : memref<256xi16, 2>, vector<32xi16>
-      %5 = arith.addi %0, %c32 : index
-      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
-    ^bb4:  // pred: ^bb2
-      aie.use_lock(%lock_0_5, Release, 1)
-      aie.use_lock(%lock_0_5_13, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_4 = aie.mem(%tile_0_4) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_9, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_8, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_4 = aie.core(%tile_0_4) {
-      %c0_i32 = arith.constant 0 : i32
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb4
-      aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1)
-      cf.br ^bb2(%c0 : index)
-    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
-      %1 = arith.cmpi slt, %0, %c256 : index
-      cf.cond_br %1, ^bb3, ^bb4
-    ^bb3:  // pred: ^bb2
-      %2 = vector.load %buf5[%0] : memref<256xi16, 2>, vector<32xi16>
-      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
-      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
-      vector.store %4, %buf4[%0] : memref<256xi16, 2>, vector<32xi16>
-      %5 = arith.addi %0, %c32 : index
-      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
-    ^bb4:  // pred: ^bb2
-      aie.use_lock(%lock_0_4, Release, 1)
-      aie.use_lock(%lock_0_4_10, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_3 = aie.mem(%tile_0_3) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_6, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_5, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_3 = aie.core(%tile_0_3) {
-      %c0_i32 = arith.constant 0 : i32
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb4
-      aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1)
-      cf.br ^bb2(%c0 : index)
-    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
-      %1 = arith.cmpi slt, %0, %c256 : index
-      cf.cond_br %1, ^bb3, ^bb4
-    ^bb3:  // pred: ^bb2
-      %2 = vector.load %buf3[%0] : memref<256xi16, 2>, vector<32xi16>
-      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
-      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
-      vector.store %4, %buf2[%0] : memref<256xi16, 2>, vector<32xi16>
-      %5 = arith.addi %0, %c32 : index
-      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
-    ^bb4:  // pred: ^bb2
-      aie.use_lock(%lock_0_3, Release, 1)
-      aie.use_lock(%lock_0_3_7, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_2 = aie.mem(%tile_0_2) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_3, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_2, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0_i32 = arith.constant 0 : i32
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb4
-      aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1)
-      cf.br ^bb2(%c0 : index)
-    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
-      %1 = arith.cmpi slt, %0, %c256 : index
-      cf.cond_br %1, ^bb3, ^bb4
-    ^bb3:  // pred: ^bb2
-      %2 = vector.load %buf1[%0] : memref<256xi16, 2>, vector<32xi16>
-      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
-      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
-      vector.store %4, %buf0[%0] : memref<256xi16, 2>, vector<32xi16>
-      %5 = arith.addi %0, %c32 : index
-      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
-    ^bb4:  // pred: ^bb2
-      aie.use_lock(%lock_0_2, Release, 1)
-      aie.use_lock(%lock_0_2_4, Release, 1)
-      cf.br ^bb1
-    }
-    %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1, Release, 4)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb10
-    }
-    %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1_0, Release, 4)
-      aie.next_bd ^bb10
-    }
-    aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0)
-    aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0)
-    aie.runtime_sequence @square_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
-      %0 = aiex.dma_configure_task_for @air_channel_0 {
-        aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
-        aie.end
-      }
-      aiex.dma_start_task(%0)
-      %1 = aiex.dma_configure_task_for @air_channel_3 {
-        aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
-        aie.end
-      } {issue_token = true}
-      aiex.dma_start_task(%1)
-      aiex.dma_free_task(%0)
-      aiex.dma_await_task(%1)
-    }
-    aie.packet_flow(15) {
-      aie.packet_source<%shim_noc_tile_0_0, TileControl : 0>
-      aie.packet_dest<%shim_noc_tile_0_0, South : 0>
-    } {keep_pkt_header = true, priority_route = true}
-    aie.packet_flow(15) {
-      aie.packet_source<%shim_noc_tile_1_0, TileControl : 0>
-      aie.packet_dest<%shim_noc_tile_1_0, South : 0>
-    } {keep_pkt_header = true, priority_route = true}
-    %switchbox_0_0 = aie.switchbox(%shim_noc_tile_0_0) {
-      aie.connect<South : 3, North : 3>
-      %0 = aie.amsel<5> (3)
-      %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true}
-      aie.packet_rules(TileControl : 0) {
-        aie.rule(31, 15, %0)
-      }
-    }
-    %shim_mux_0_0 = aie.shim_mux(%shim_noc_tile_0_0) {
-      aie.connect<DMA : 0, North : 3>
-    }
-    %switchbox_0_1 = aie.switchbox(%mem_tile_0_1) {
-      aie.connect<South : 3, DMA : 0>
-      aie.connect<DMA : 0, North : 1>
-      aie.connect<DMA : 1, North : 5>
-      aie.connect<DMA : 2, North : 0>
-      aie.connect<DMA : 3, North : 3>
-    }
-    %switchbox_1_0 = aie.switchbox(%shim_noc_tile_1_0) {
-      aie.connect<North : 2, South : 2>
-      %0 = aie.amsel<5> (3)
-      %1 = aie.masterset(South : 0, %0) {keep_pkt_header = true}
-      aie.packet_rules(TileControl : 0) {
-        aie.rule(31, 15, %0)
-      }
-    }
-    %shim_mux_1_0 = aie.shim_mux(%shim_noc_tile_1_0) {
-      aie.connect<North : 2, DMA : 0>
-    }
-    %switchbox_1_1 = aie.switchbox(%mem_tile_1_1) {
-      aie.connect<DMA : 0, South : 2>
-      aie.connect<North : 1, DMA : 0>
-      aie.connect<North : 3, DMA : 1>
-      aie.connect<North : 0, DMA : 2>
-      aie.connect<North : 2, DMA : 3>
-    }
-    %switchbox_0_2 = aie.switchbox(%tile_0_2) {
-      aie.connect<South : 1, DMA : 0>
-      aie.connect<South : 5, North : 3>
-      aie.connect<South : 0, North : 5>
-      aie.connect<South : 3, North : 4>
-      aie.connect<DMA : 0, East : 0>
-      aie.connect<North : 0, East : 3>
-    }
-    %switchbox_0_3 = aie.switchbox(%tile_0_3) {
-      aie.connect<South : 3, DMA : 0>
-      aie.connect<South : 5, North : 0>
-      aie.connect<South : 4, North : 2>
-      aie.connect<DMA : 0, East : 0>
-      aie.connect<North : 0, South : 0>
-    }
-    %switchbox_0_4 = aie.switchbox(%tile_0_4) {
-      aie.connect<South : 0, DMA : 0>
-      aie.connect<South : 2, North : 0>
-      aie.connect<DMA : 0, East : 0>
-      aie.connect<North : 0, South : 0>
-    }
-    %switchbox_0_5 = aie.switchbox(%tile_0_5) {
-      aie.connect<South : 0, DMA : 0>
-      aie.connect<DMA : 0, South : 0>
-    }
-    %tile_1_2 = aie.tile(1, 2)
-    %switchbox_1_2 = aie.switchbox(%tile_1_2) {
-      aie.connect<West : 0, South : 1>
-      aie.connect<North : 1, South : 3>
-      aie.connect<North : 3, South : 0>
-      aie.connect<West : 3, South : 2>
-    }
-    %tile_1_3 = aie.tile(1, 3)
-    %switchbox_1_3 = aie.switchbox(%tile_1_3) {
-      aie.connect<West : 0, South : 1>
-      aie.connect<North : 1, South : 3>
-    }
-    %tile_1_4 = aie.tile(1, 4)
-    %switchbox_1_4 = aie.switchbox(%tile_1_4) {
-      aie.connect<West : 0, South : 1>
-    }
-    aie.wire(%shim_mux_0_0 : North, %switchbox_0_0 : South)
-    aie.wire(%shim_noc_tile_0_0 : DMA, %shim_mux_0_0 : DMA)
-    aie.wire(%mem_tile_0_1 : Core, %switchbox_0_1 : Core)
-    aie.wire(%mem_tile_0_1 : DMA, %switchbox_0_1 : DMA)
-    aie.wire(%switchbox_0_0 : North, %switchbox_0_1 : South)
-    aie.wire(%tile_0_2 : Core, %switchbox_0_2 : Core)
-    aie.wire(%tile_0_2 : DMA, %switchbox_0_2 : DMA)
-    aie.wire(%switchbox_0_1 : North, %switchbox_0_2 : South)
-    aie.wire(%tile_0_3 : Core, %switchbox_0_3 : Core)
-    aie.wire(%tile_0_3 : DMA, %switchbox_0_3 : DMA)
-    aie.wire(%switchbox_0_2 : North, %switchbox_0_3 : South)
-    aie.wire(%tile_0_4 : Core, %switchbox_0_4 : Core)
-    aie.wire(%tile_0_4 : DMA, %switchbox_0_4 : DMA)
-    aie.wire(%switchbox_0_3 : North, %switchbox_0_4 : South)
-    aie.wire(%tile_0_5 : Core, %switchbox_0_5 : Core)
-    aie.wire(%tile_0_5 : DMA, %switchbox_0_5 : DMA)
-    aie.wire(%switchbox_0_4 : North, %switchbox_0_5 : South)
-    aie.wire(%switchbox_0_0 : East, %switchbox_1_0 : West)
-    aie.wire(%shim_mux_1_0 : North, %switchbox_1_0 : South)
-    aie.wire(%shim_noc_tile_1_0 : DMA, %shim_mux_1_0 : DMA)
-    aie.wire(%switchbox_0_1 : East, %switchbox_1_1 : West)
-    aie.wire(%mem_tile_1_1 : Core, %switchbox_1_1 : Core)
-    aie.wire(%mem_tile_1_1 : DMA, %switchbox_1_1 : DMA)
-    aie.wire(%switchbox_1_0 : North, %switchbox_1_1 : South)
-    aie.wire(%switchbox_0_2 : East, %switchbox_1_2 : West)
-    aie.wire(%tile_1_2 : Core, %switchbox_1_2 : Core)
-    aie.wire(%tile_1_2 : DMA, %switchbox_1_2 : DMA)
-    aie.wire(%switchbox_1_1 : North, %switchbox_1_2 : South)
-    aie.wire(%switchbox_0_3 : East, %switchbox_1_3 : West)
-    aie.wire(%tile_1_3 : Core, %switchbox_1_3 : Core)
-    aie.wire(%tile_1_3 : DMA, %switchbox_1_3 : DMA)
-    aie.wire(%switchbox_1_2 : North, %switchbox_1_3 : South)
-    aie.wire(%switchbox_0_4 : East, %switchbox_1_4 : West)
-    aie.wire(%tile_1_4 : Core, %switchbox_1_4 : Core)
-    aie.wire(%tile_1_4 : DMA, %switchbox_1_4 : DMA)
-    aie.wire(%switchbox_1_3 : North, %switchbox_1_4 : South)
-  } {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>}
-  aie.device(npu2) {
-    aie.runtime_sequence @square_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
-      aiex.configure @square_kernel_0 {
-        aiex.run @square_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32)
-      }
-    }
-  }
-}
diff --git a/examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh b/examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh
deleted file mode 100755
index b7aa36c..0000000
--- a/examples/elementwise_arith/air_project/aiecc_repeater_1775797115_856352.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-set -e
-# Repeater script for: resource allocation
-echo "Original MLIR Diagnostics:"
-cat << 'DIAGNOSTICS_EOF'
-failed to legalize operation 'arith.subi' that was explicitly marked illegal: %120 = "arith.subi"(%118, %119) <{overflowFlags = #arith.overflow<none>}> : (vector<32xi16>, vector<32xi16>) -> vector<32xi16>
-DIAGNOSTICS_EOF
-echo ""
-
-MLIR_FILE='air_project/aiecc_failure_1775797115_856352.mlir'
-PASS_PIPELINE='any(canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},unknown<RedundantLoadStoreOptimizationPass>,unknown<ReorderOperationsPass>,unknown<{anonymous}::CopyRemovalPass>,unknown<VectorBroadcastLoweringPass>,test-canonicalize-vector-for-aievec{aie-target=aie2p target-backend=llvmir},test-lower-vector-to-aievec{aie-target=aie2p target-backend=llvmir},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},unknown<ExtendUPDOpsPass>,cse,unknown<SimplifyUPDOpsPass>,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},test-aievec-optimize{aie-target=aie2p target-backend=llvmir},cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},aievec-convolution-analysis{print=false},test-aievec-convolution-optimize{aie-target=aie2p shift=0 target-backend=llvmir},cse,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},loop-invariant-code-motion,canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},lower-affine,aie-canonicalize-device,aie.device(aie-assign-lock-ids,aie-register-objectFifos,aie-objectFifo-stateful-transform{dynamic-objFifos=false packet-sw-objFifos=false},aie-assign-bd-ids,aie-lower-cascade-flows,aie-lower-broadcast-packet,aie-lower-multicast,aie-assign-tile-controller-ids{column-wise-unique-ids=true},aie-generate-column-control-overlay{route-shim-to-tct=shim-only route-shim-to-tile-ctrl=false},aie-assign-buffer-addresses{alloc-scheme=},aie-assign-core-link-files,aie-vector-transfer-lowering{max-transfer-rank=4294967295}),convert-scf-to-cf{allow-pattern-rollback=true})'
-aie-opt --mlir-print-ir-after-all --mlir-disable-threading --pass-pipeline="$PASS_PIPELINE" "$MLIR_FILE"
diff --git a/examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh b/examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh
deleted file mode 100755
index 2f765b5..0000000
--- a/examples/elementwise_arith/air_project/aiecc_repeater_1775797139_858651.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-set -e
-# Repeater script for: LLVM lowering
-echo "Original MLIR Diagnostics:"
-cat << 'DIAGNOSTICS_EOF'
-aievec.mul_elem conversion is not supported for AIE2p.
-
-failed to legalize operation 'aievec.mul_elem' that was explicitly marked illegal: %28 = "aievec.mul_elem"(%24, %27) : (vector<32xi16>, vector<32xi16>) -> vector<32xi32>
-DIAGNOSTICS_EOF
-echo ""
-
-MLIR_FILE='air_project/aiecc_failure_1775797139_858651.mlir'
-PASS_PIPELINE='any(aie.device(aie-localize-locks,aie-normalize-address-spaces,aie-transform-bfp-types),aie-standard-lowering{device=mul_kernel_0 tilecol=0 tilerow=5},aiex-standard-lowering,convert-aievec-to-llvm{aie-target=aie2p aie2-fp32-emulation-strategy=accuracy-safe},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse,expand-strided-metadata,lower-affine,arith-expand{include-bf16=false include-f4e2m1=false include-f8e8m0=false},finalize-memref-to-llvm{index-bitwidth=0 use-aligned-alloc=false use-generic-functions=false},convert-func-to-llvm{index-bitwidth=0 use-bare-ptr-memref-call-conv=true},convert-to-llvm{allow-pattern-rollback=true dynamic=true },convert-vector-to-llvm{enable-arm-bf16=false enable-arm-i8mm=false enable-arm-neon=false enable-arm-sve=false enable-x86=false force-32bit-vector-indices=true reassociate-fp-reductions=false use-vector-alignment=false vector-contract-lowering=dot vector-transpose-lowering=eltwise},convert-ub-to-llvm{index-bitwidth=0},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse)'
-aie-opt --mlir-print-ir-after-all --mlir-disable-threading --pass-pipeline="$PASS_PIPELINE" "$MLIR_FILE"
diff --git a/examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh b/examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh
deleted file mode 100755
index e9fc1e4..0000000
--- a/examples/elementwise_arith/air_project/aiecc_repeater_1775797174_862028.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-set -e
-# Repeater script for: LLVM lowering
-echo "Original MLIR Diagnostics:"
-cat << 'DIAGNOSTICS_EOF'
-aievec.mul_elem conversion is not supported for AIE2p.
-
-failed to legalize operation 'aievec.mul_elem' that was explicitly marked illegal: %21 = "aievec.mul_elem"(%20, %20) : (vector<32xi16>, vector<32xi16>) -> vector<32xi32>
-DIAGNOSTICS_EOF
-echo ""
-
-MLIR_FILE='air_project/aiecc_failure_1775797174_862028.mlir'
-PASS_PIPELINE='any(aie.device(aie-localize-locks,aie-normalize-address-spaces,aie-transform-bfp-types),aie-standard-lowering{device=square_kernel_0 tilecol=0 tilerow=5},aiex-standard-lowering,convert-aievec-to-llvm{aie-target=aie2p aie2-fp32-emulation-strategy=accuracy-safe},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse,expand-strided-metadata,lower-affine,arith-expand{include-bf16=false include-f4e2m1=false include-f8e8m0=false},finalize-memref-to-llvm{index-bitwidth=0 use-aligned-alloc=false use-generic-functions=false},convert-func-to-llvm{index-bitwidth=0 use-bare-ptr-memref-call-conv=true},convert-to-llvm{allow-pattern-rollback=true dynamic=true },convert-vector-to-llvm{enable-arm-bf16=false enable-arm-i8mm=false enable-arm-neon=false enable-arm-sve=false enable-x86=false force-32bit-vector-indices=true reassociate-fp-reductions=false use-vector-alignment=false vector-contract-lowering=dot vector-transpose-lowering=eltwise},convert-ub-to-llvm{index-bitwidth=0},canonicalize{  max-iterations=10 max-num-rewrites=-1 region-simplify=normal test-convergence=false top-down=true},cse)'
-aie-opt --mlir-print-ir-after-all --mlir-disable-threading --pass-pipeline="$PASS_PIPELINE" "$MLIR_FILE"
diff --git a/examples/elementwise_arith/air_project/airinput.mlir b/examples/elementwise_arith/air_project/airinput.mlir
deleted file mode 100644
index d0b7377..0000000
--- a/examples/elementwise_arith/air_project/airinput.mlir
+++ /dev/null
@@ -1,41 +0,0 @@
-#map = affine_map<()[s0] -> (s0 * 256)>
-module {
-  func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
-    %c1 = arith.constant 1 : index
-    air.launch (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> {
-      air.segment @square_kernel_0  args(%arg16=%arg8, %arg17=%arg14, %arg18=%arg15) : index, memref<*xi16>, memref<*xi16> {
-        %c1024 = arith.constant 1024 : index
-        %c4 = arith.constant 4 : index
-        %c1_0 = arith.constant 1 : index
-        %0 = arith.muli %arg16, %c1024 : index
-        %alloc = memref.alloc() : memref<1024xi16, 1 : i32>
-        air.dma_memcpy_nd (%alloc[] [] [], %arg17[%0] [%c1024] [%c1_0]) {id = 1 : i32} : (memref<1024xi16, 1 : i32>, memref<*xi16>)
-        %alloc_1 = memref.alloc() : memref<1024xi16, 1>
-        air.herd @herd_0  tile (%arg19, %arg20) in (%arg21=%c4, %arg22=%c1_0) args(%arg23=%alloc, %arg24=%alloc_1) : memref<1024xi16, 1 : i32>, memref<1024xi16, 1> {
-          %1 = ub.poison : i16
-          %c1_2 = arith.constant 1 : index
-          %c0 = arith.constant 0 : index
-          %c256 = arith.constant 256 : index
-          %c32 = arith.constant 32 : index
-          %2 = affine.apply #map()[%arg19]
-          %alloc_3 = memref.alloc() : memref<256xi16, 2>
-          air.dma_memcpy_nd (%alloc_3[] [] [], %arg23[%2] [%c256] [%c1_2]) {id = 1 : i32} : (memref<256xi16, 2>, memref<1024xi16, 1 : i32>)
-          %alloc_4 = memref.alloc() : memref<256xi16, 2>
-          scf.for %arg25 = %c0 to %c256 step %c32 {
-            %subview = memref.subview %alloc_3[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-            %subview_5 = memref.subview %alloc_4[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-            %3 = vector.transfer_read %subview[%c0], %1 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-            %4 = arith.muli %3, %3 : vector<32xi16>
-            vector.transfer_write %4, %subview_5[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-          }
-          air.dma_memcpy_nd (%arg24[%2] [%c256] [%c1_2], %alloc_4[] [] []) {id = 2 : i32} : (memref<1024xi16, 1>, memref<256xi16, 2>)
-          memref.dealloc %alloc_3 : memref<256xi16, 2>
-          memref.dealloc %alloc_4 : memref<256xi16, 2>
-        }
-        air.dma_memcpy_nd (%arg18[%0] [%c1024] [%c1_0], %alloc_1[] [] []) {id = 2 : i32} : (memref<*xi16>, memref<1024xi16, 1>)
-        memref.dealloc %alloc_1 : memref<1024xi16, 1>
-      }
-    }
-    return
-  }
-}
diff --git a/examples/elementwise_arith/air_project/asm_air_output.mlir b/examples/elementwise_arith/air_project/asm_air_output.mlir
deleted file mode 100644
index d0b7377..0000000
--- a/examples/elementwise_arith/air_project/asm_air_output.mlir
+++ /dev/null
@@ -1,41 +0,0 @@
-#map = affine_map<()[s0] -> (s0 * 256)>
-module {
-  func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
-    %c1 = arith.constant 1 : index
-    air.launch (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> {
-      air.segment @square_kernel_0  args(%arg16=%arg8, %arg17=%arg14, %arg18=%arg15) : index, memref<*xi16>, memref<*xi16> {
-        %c1024 = arith.constant 1024 : index
-        %c4 = arith.constant 4 : index
-        %c1_0 = arith.constant 1 : index
-        %0 = arith.muli %arg16, %c1024 : index
-        %alloc = memref.alloc() : memref<1024xi16, 1 : i32>
-        air.dma_memcpy_nd (%alloc[] [] [], %arg17[%0] [%c1024] [%c1_0]) {id = 1 : i32} : (memref<1024xi16, 1 : i32>, memref<*xi16>)
-        %alloc_1 = memref.alloc() : memref<1024xi16, 1>
-        air.herd @herd_0  tile (%arg19, %arg20) in (%arg21=%c4, %arg22=%c1_0) args(%arg23=%alloc, %arg24=%alloc_1) : memref<1024xi16, 1 : i32>, memref<1024xi16, 1> {
-          %1 = ub.poison : i16
-          %c1_2 = arith.constant 1 : index
-          %c0 = arith.constant 0 : index
-          %c256 = arith.constant 256 : index
-          %c32 = arith.constant 32 : index
-          %2 = affine.apply #map()[%arg19]
-          %alloc_3 = memref.alloc() : memref<256xi16, 2>
-          air.dma_memcpy_nd (%alloc_3[] [] [], %arg23[%2] [%c256] [%c1_2]) {id = 1 : i32} : (memref<256xi16, 2>, memref<1024xi16, 1 : i32>)
-          %alloc_4 = memref.alloc() : memref<256xi16, 2>
-          scf.for %arg25 = %c0 to %c256 step %c32 {
-            %subview = memref.subview %alloc_3[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-            %subview_5 = memref.subview %alloc_4[%arg25] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-            %3 = vector.transfer_read %subview[%c0], %1 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-            %4 = arith.muli %3, %3 : vector<32xi16>
-            vector.transfer_write %4, %subview_5[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-          }
-          air.dma_memcpy_nd (%arg24[%2] [%c256] [%c1_2], %alloc_4[] [] []) {id = 2 : i32} : (memref<1024xi16, 1>, memref<256xi16, 2>)
-          memref.dealloc %alloc_3 : memref<256xi16, 2>
-          memref.dealloc %alloc_4 : memref<256xi16, 2>
-        }
-        air.dma_memcpy_nd (%arg18[%0] [%c1024] [%c1_0], %alloc_1[] [] []) {id = 2 : i32} : (memref<*xi16>, memref<1024xi16, 1>)
-        memref.dealloc %alloc_1 : memref<1024xi16, 1>
-      }
-    }
-    return
-  }
-}
diff --git a/examples/elementwise_arith/air_project/asm_src.mlir b/examples/elementwise_arith/air_project/asm_src.mlir
deleted file mode 100644
index aa0162c..0000000
--- a/examples/elementwise_arith/air_project/asm_src.mlir
+++ /dev/null
@@ -1,34 +0,0 @@
-#loc = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)
-#loc5 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:9)
-#map = affine_map<(d0) -> (d0)>
-#loc8 = loc("X"(#loc))
-#loc9 = loc("OUT"(#loc))
-#loc12 = loc("x"(#loc5))
-module {
-  func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32} loc("X"(#loc)), %arg1: memref<*xi16> {tt.divisibility = 16 : i32} loc("OUT"(#loc)), %arg2: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg3: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg4: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg5: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg6: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg7: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)) {
-    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
-    %0 = arith.muli %arg5, %c1024_i32 : i32 loc(#loc10)
-    %1 = arith.index_cast %0 : i32 to index loc(#loc3)
-    %reinterpret_cast = memref.reinterpret_cast %arg0 to offset: [%1], sizes: [1024], strides: [1] : memref<*xi16> to memref<1024xi16, strided<[1], offset: ?>> loc(#loc11)
-    %alloc = memref.alloc() : memref<1024xi16> loc(#loc12)
-    memref.copy %reinterpret_cast, %alloc : memref<1024xi16, strided<[1], offset: ?>> to memref<1024xi16> loc(#loc12)
-    %2 = bufferization.to_tensor %alloc restrict writable : memref<1024xi16> to tensor<1024xi16> loc(#loc12)
-    %reinterpret_cast_0 = memref.reinterpret_cast %arg1 to offset: [%1], sizes: [1024], strides: [1] : memref<*xi16> to memref<1024xi16, strided<[1], offset: ?>> loc(#loc3)
-    %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%2, %2 : tensor<1024xi16>, tensor<1024xi16>) outs(%2 : tensor<1024xi16>) {
-    ^bb0(%in: i16 loc("x"(#loc5)), %in_1: i16 loc("x"(#loc5)), %out: i16 loc("x"(#loc5))):
-      %4 = arith.muli %in, %in_1 : i16 loc(#loc6)
-      linalg.yield %4 : i16 loc(#loc6)
-    } -> tensor<1024xi16> loc(#loc6)
-    bufferization.materialize_in_destination %3 in writable %reinterpret_cast_0 : (tensor<1024xi16>, memref<1024xi16, strided<[1], offset: ?>>) -> () loc(#loc7)
-    return loc(#loc)
-  } loc(#loc)
-} loc(#loc)
-#loc1 = loc(unknown)
-#loc2 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:15)
-#loc3 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:14)
-#loc4 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:17)
-#loc6 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:32)
-#loc7 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:5)
-#loc10 = loc("offsets"(#loc2))
-#loc11 = loc("x"(#loc4))
-
diff --git a/examples/elementwise_arith/air_project/div_kernel_0.pdi b/examples/elementwise_arith/air_project/div_kernel_0.pdi
deleted file mode 100644
index 3681781c43b9bf80414d1863d6cae3111aa21810..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 15904
zcmeHOZ)_aJ6@UB3z9V(Gy~Iuz+aVj<5JN6-i49F4fpzRwF16ANA(h)k#Yjzowy3UD
zT7OA7h$;UPVbUTEP=t$ws-k{4S8WuzUyc$GAZSeqQmR69g;Z^&qMi|zOmJ|2v%9nN
z?))!lrHT(X(%QfGn>TOXdow$J-rIYdi2AzP()a)18+%{<?X!;&*<^t0z$Jfo<C5*i
z`Bxf46#o2tUoPi49%VThVL9%<X(N|E_qd#okH6*i{Wsr!$1S(+*9KcPtVB<AZd==(
zBHDsW{^o5iYBzzbCo=gQT<?Fu4S$?FYr{Zq-xo0h(I)ut(6fcPG$u0*i^yw_W0SIT
zYQ0noFUW3{OjUA{$<n60>Wn%1rZf_?HnI&8HuX{If`lUJQe^C%8!V5?8JhHoenb+v
z_TA2LSV)rCQG}<bd&Ud*B>$pDebw;DOU$@Lj-rTM>Wm~ga%PU`heI3bLMT8RHpP=x
zN~}$!#E#CCXu2dNu3DcGd)KDKofoCVqfM!6(zK;$P~qx#zVA@s4I}^N%0v$VXWupQ
z|Gpq$CNr>AnsbwFYXH#P<jx3X9DAkpapv0mH0JThnep<j`)k_jcGLBtE@Fpw;?82q
zoGR8Cg?nc3yT=$>Gv{`FkUHIS`9#h~e7@Y{B;$j}VG~U+p?~AVjr~VM!lU8DoVfOb
zD2btEqGJcG=3@sBHjAcLm$5f>Zuahly?d<|(R2d(nI-%?iT+K?`*$Dg-FFc7-h%!W
zOV}%+|H|d<Jpg+T$aqgff2sWa9sTE!zlRPsAA87xfB%GjX5yyY!s{{PjWx@7zcjea
zdK|V|jvYq4b!!|M@8Hm4_M8?O?^kYH#@@pRTaG<!HH*4t*z26wH@v7l_;=aF*G49z
z{j%yq_mNi<?juvUPT)F;>n&WR#ED&|cVren+g6=Dv8%1)$k~w%>sv-PY;GOd(6e@A
z!|U#eU8~wJoEh2hSo6q+U$l(;e6n@qZ?B-|&udTr{^-U*ck<Zk_nNoVd-u<r-c@+Q
zaz_n+ING@6I=;Mr<X!cKS>Hpeh@yS%l7YKtD7t&x<T%hM(U2VJCMn_(pF>HLyy$*7
zDlJ2VnX_W$=Nz#z@9gt_Gja8Y`@C(1$r-UG@0_ulD8Kj3gjdQKKdJ94iGhLiHMfxS
zOKK=wH*?J+bN`xMM}@>J?)FZzw65^<M?n3iM}<p2+V_t`g)7dS`QC$+fAqCPijIyv
zW7yOho*rKyXB4MHe4N3Rz`y&A(mLgz5AO{XqqyW{XVRfg1OAdRIY_sCn^sU~d2;aM
z+ctILbfnB>@8K@bJa@Rh?X1{ze2pl-=m-y2{uM`j`K49J<w<xJH_QiVCO#i<Lg-WC
zh5Rb?kInsIc24<o<m>&B@4Fgd%eA&DQ++VxOaB+QSLZvfXHe#eF}ieC+|l5O{mFEu
z!!g&BxM(1aTnt&+(4JT`>WEFYGdVW*9b^|i_0hORdC^^;78_rJu0Mto9w+ptNv!nK
zq7E@WQ*@S^-%XGs^WU-B4r6E;kTGmkbIO^Q8ZYYd!T95y$zVLj7A%~=VUzI}??MbE
zE2+3;>^0rJ>d0>2!(V7yd}{^vf`8J&DuykL*X<lnWPWzYs3&3M(_+Vij@W?=tQ>XR
z@(pQm$#Lvh$a|5kAS;xVSqWQ|4@7*t-H!GX<|t{O=liGr{)xulIl>{CPdUER)Z@vH
zW5yjO<BeyWZ?9rMAzo>3Yz6#_?8(2ULG#GFZpMB6g@n7UlyE0r##!=O!u{^+33vCK
z33v0K5^nnK#QeX<nBT{0#tm}LuFOCD&l~m8d?NFibmqktYBP{e$2~KecF$al-1W%p
z51WylKJx1GcZp3eI!9sqr*cnY2fA55?Y?U{!YHm5Yi~%q8!ldQzs-0}F5-oSh3~JZ
zU;0<4*2gWF>C97yjKs&{z|)R6(2*9Y0Y_Zfmlo@K9g)g6iggb(iUZ?mw{R#evRHS=
zY!p6z9~g4n!cIrfPDkb`W2`V%n_@=~dcf`{Xx=0J^+{GRQ9dTa^AFL9C!bH5!{uLD
zh5Kg2t+R=YY5SJEAK)LxMEYuf$V9o@j_aa}+{d0@V10PqC&x%x)qP4<b)S+|-KS*o
zy2y|MT4--G$(0JOz+R<Hrz+(nlhcMA6}I{H<d#}JS*s^&_2j(&7iPR#Jz1+KYxN{5
zYtr>`t)9%F{;SoKwR-aZT~CJfovh-*da_bh^`Gij^`DZ->#eFMd#dWmN|{bo%1I_K
zT~F?;)swY)vQ|&l>PZ|uwda$y=aaSPleOoQwda$cyyuf)eW&U}H>&THtm;1{tNKsL
z<n5`dCwEuXla(@^s+5yVUb>#Vu~tvk>d9I?S*s^;^wjFfT0L2-Cu{X&t)Bek)sta;
zr|LsDs_&Gn>OUo``cKOI-AVo#w|tjUeR6Hd&#=qur8-IMFzhj$W!Psp$M6ut!wlyc
z9%Fc%;R3@&hNl=VF<g$Yd}b>@)31y#lvPYr-pu0D*kgQ_VV~g~!$S-YGn{95jNx&H
z3k(+-o?^Jfa5=(Ze7#lig|d#XhsCF{SJpVou+MOg;UR{H8O}32#_%}91%`_ZPcd9#
zxE$dyzCBg(g|d!sH;YeWudH#FVV~g~!$S-YGn{95jNx&H3k(+-o?^JfaG7C;T21+G
zb8xQ#tGK|+>QnJLMrySgFJYjQ!=5)H?6Z3z%Xp9R4&#aOIqWTMN8b-w#(QipI*cdA
z=dgy_PUT)xyvNquVfw`Q9CD@feU`T@<2}YZjHigN{9jjElkay&FK<m@pBQ*qyQ-D0
zx9j;hrSAp$ew8-SyEoexE~M{=_JeiegZ}!Iy+*EiBhGBu<X(3otm3sHOB~z>RecSR
zgL4hz<hd@J)J+@J@%5Z%d{W<+<|^9a@dfsmh!6c}VSFJrAj{ZoJpYh5jIXk<L4EcW
z#Hah_$A>+7Uh60Ieau?f7LPBmzeIfKM+@T%v5qgim%{ie`x?}{VnKYmZ+?91&oe%$
z?*~5dAB)Eq*k2+(^rMCGh1h@`u3vcOgz;7OHK^}Y1o7!U9iMzTQ?_P1IOy<x@j}g?
z?HNsD!a5`D_oTjbfSMOlr=B_gZmv&Oq;EVJjH(_~t?w)Q=H=C@fn2TcDf`yri`tjD
z>aK?$XfjvX2+ILM9wAE{<gu!+0W#i&4EmB)eT|US8Q@g)wL-oq8t1aU1@bTV7uq8E
zpX;g4XQ;nOKK&|vtXHUSF1%jz^sPvLk$hI`W4(f}oe(XuUbFsalz$nd%71`W{<qgR
z<0(hjw-s16)z=4Et&LyRHvn1XxWB5;hOE|xSl<HqH_tQwq5iq&Kh!_>{D=DIo_`$V
zGGoi;Kd5b@{U2bJW8~jt`LF0h{)6w7$#KYkMIZ8Sviw){A^)Z%qI;^MZ+qFoHy9`{
z_eJA0{zr&#68vY0ijt?05ApjJU1h9G1w6ILJ0%#~X|%*+v*48-V!RI?aR&M}<1^qf
zOTZ@?zZE<@3it-bcb6@b`7GlJ$AV`*OTHNR`MzZ`pG_1vfxc(7#AD%ejNx;P;j_$n
zpbwv89zM&QhVwEWKAX(vSl=mI@)iC--=DVRE(yniXFf~582M}~pF5}})`xEzgU6TZ
z0z2CoZ-d9%K>@#&@k#J2BfgvQ4d7$n7;l%`3EyK3?6}kIa4p0>_;}qJ-w$4|i`MT0
zALpFuC&A+@LV^83#t(pxbIy3X+{Ns4#r|}I*ZydJJNOGC`yJDoPl9iX_?sBt4}NvT
z4={cJy!J=ivCF+I-d^TUFX1KqAg<PF%_pI+{nUIvc<qno2f*ujH9rVGUUz2SF7ILf
z?1}wJg4gqEz7f2(ula1b-DK+?@8PZBwVn2u-wvLwd(3x(XX_sGIq;u~@{^g?b_T)Y
zyJbP%`Wc@ocQJpuVmlq+b)Gek{lNUud=9*xSMwhDc>hmpK2z>xc6#G^UGUnD=CLkj
zeBVuLzO~$LH$-t|r`zNE*9Fhwjrm6K+K$%u!0UDAd>6Cd#q4zH`+8dQ@SoZ54fKP*
u4Q1>GI7(n5+2pcaHta0Ak*kVlW*8;+Q8*_0-iVgN_x`jTer_ve)&2uCkqQU^

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_elfs.bin
deleted file mode 100644
index 3542f70d63d78dc642ddc7dc29acd12d643307f5..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 10704
zcmeHLZEO@p7=Cy6?)GRXH{15=aM#*xDfF;gC?6spDejfVAT||HOx4EJL?ZDc)I`kv
zsT{P(R|$nkG6ad0gcuVOw?>=hepM+12q=V-)QHUqP1HnUO)xo1%lXdTZo7EbmVfeN
zCz;;vGc(V;?>jT^J{=bTtgCw-Km^#H1MuuuMp6*~WWj|kSk!rduc=~TWT=#PPgM(}
zSun_gd2JY<$1Nn`El!pyW#AHkxu6c>vt$c|1w{ZiH%KasXb{r@J_7zkUAss>GI)X2
z^!WgW!(GO1@2}RVR|OSRIY9x41p!b9nQwe#A_?$eb0y5d4%izx*-6RAie(?G3Hg|R
zzK=aw>SJ4qeQd{EA3NsvtyW-Zp8&>+n;#G`t_jiWxpElrx%!q6{b!aedNt&VC#kor
z5FfBmZ>{;H5m&BsUt1IX8|P7hB-ALS-@r<`9G*`1Mc>gnwxiD{Ug>iS#_k0EhK1(B
zBwcpLHynPVd!vs2=>WJk+&{E9U6}towpVs<Uf+`jRA`ZtY|R}rh-qek-Zm-F+twCf
z{!26PU3oaZJ;=953Nrry_SdGdZxGw7X0~rH^6hOyz8lzo(KLK<Y+pPx-+tuV&;1?7
z{?o<pcWi%%_#JEu^d6MZzCW?Q*8Sqf?CX(GI|JO`ml|hSk3&+h_YnH)E+pLF#^$N`
zNRa#c%C;H!4z~q+4@&{&4j^BsyQyVLKD2LP_iK@uxnJ_`(j%8-dgKb$0jz^qZ(xnf
z0}Y~jbPWHN<&6(Cl$9SHk5rTfBNa=EA{F7{NX1n;(2zH8P9joqJP@h)A{hBRRuuW|
zBDVZoJp4mXWh0IC=HCu1^{D$2!wtp>iM9)Ri@9-u5}ltPeyXmCo-pJAnERT7fTyR~
zJU!Igg-`7O&3q*Ff{CMl1iT`s=6UGh%=n5l<1F_X!g8aeN&U8a#oZ=#l@Uv@!YCPa
z`60UHx~#@E;bTu-oYmJWt6vB5CCoO~CRQIw{xwzthCGI+JtW3Uj8pgUX}zd|vEW|Q
zA05V`(b4x0K=jyU*#|w5)0ioX(%IuVAnlkPCej(2lCp1~5HGRp)6sgt@{!HIKw<<!
z0@@2gtP!@o133^Hi8bEeRu#hR2-?Eic*?c!4tdJPSyg`_8#zaqiZyzXu$Ry0VJrvH
zvWbH8;ag%NKVXKK%o1m#dDz~W{BbO4*>m)b^``9&1;|CEl@?VU66xIjsrlSGp{fAd
ziB4EB#@?Mx*jBHil@qZP*xY&rW6>;Cr}<c6J7HBW66;KE$LOL>b<ib2lr8rttnxhe
z)jKi6V}^bVSgx)xH~M(Gk4!f|m4Wd1m*>0EJ_PHz4=b%X`OMReXYOcfe1FKB8joWQ
z2{UlIaR2*uq7QM&Yhg3+`FG_V-K47k9v6E%2ls+LDB&umOWI!=>XNn3Is^|eA*!&N
z1BBIJ1ajL69oe9;`Te+K@qG(MD-{*+@~EVX84vWhf1b<SPdG<>(&kM2tnVKy_?|EV
zJf3`fNSquGt_h#<gmHf-X3XS!U_YV1oUapggY`Y)-_ul4L0y}mSI^3HRa~as7ciGx
zmg##}WxDCQOqcv5Q{@*q^Lvc*>$sW|1>ZNi9*jTQ9~C^N{Y-mG(ar^-Ou%>&I@+$#
z(Z?}%VIKV!(d4NE)f3&xs?L!f<o<;3Y21OdT36^Ti7=rrpA~OVXvO2x?ze>Mr|2&-
zWItbKJGakmtq(OsMf<cvkngkBQ-rmaE6i6<*y1{cm8>Jo7cF2V`wLiWmqLvWg;nFa
z6S08l_}kh{s8LH8)Dj*iO~?`QMOG8W7DTrK_T0_w)qyJ*Gaj*Y{zLEB$umB&W#ntg
z*qdN)j>($n(j|T#&^{bv_P6#24-{S1KPkF^9zT<1eJs6aX+S>C!YmDF$1DwK$1DwK
z$3R^M0tc{Id}L?|@D=3M!e|x@vS3~trU;xNT^`h%OC7!G=uJm&X8b>qP#wML=uJm&
zqVk4G{p{#X4fUa;HyyqCf9TD$7UU}Kzgp0aSz6GJSz6GJfx0e>-VA5ao0%94Wnx|%
zHnrZYb@ZmAHyyp{=uJ#T&i$ryzv<j>I`^B-{pO>2znRu}cFjmljc3O!EojFqEojF;
zeIbk9+>}LcW@0duiFs|<)OvHXqc<JB>F7;IZ(=HP^roXX9lh!3O-FA&ntC&>@$8zB
Rni|iJSz6GJSz3@|{{p#ZAFu!b

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_enable.bin
deleted file mode 100644
index 7cc1818bce4d6ce1226fc5fda519967a4842b99e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 104
zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>))
B1+xGE

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/div_kernel_0_aie_cdo_init.bin
deleted file mode 100644
index d4549ba181167c6e6d7475f23335e34fea2c3376..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6032
zcmb7{&rak<5XQ@%hINw3u1Uy>EJ*QcI7AT#M?)kINHH2BaX`vM<iIUF0*^qXJ?)8e
zF5wYI$_vaRaLt7vMY;Fvc_s37+iiD!;}I|tf4=&CUGA#tw!nzU<oIWiROFvEk-z_%
zJ!~W*FA@WSPcd@He{sy=EE%{TBKEES5p+(vAYGEKNKZ*$kiH~+Mf#fb4e1%_Iq6%{
z_oQv3yPk}D`c%hUV?tlVeouYUx*%SXu1HTwUy!~eeMS13^bP45={f0J()Xloqq{y$
zdiqqybf1o>Pg)mN>ymUudP@3&^d;#l($}PKNY6;mN#BybCv6+u_35mqPjyW9=@;sg
z)`iu&Bwdl7lD;5)N&1TPHR&7DGtzU?x1{e$Thh7gr0z}C-Y@95Kj1=K_e*vRasT2A
z`KGw%NtC~$_drR!Af6K!;#1rYonOBXO5z3Gubj9LpW<BU{LA+y<^`S0oYo8RDdrKc
zuV_9c@q&0xTpGTS_%ZrfttFZNDYrPwi8P`Px$pNsn)N<^;r&%-Y<~OqH}kU3U)A~j
z{NqLc{b^o@^RRhW-=}a-^G3%#Fo-!FJksMCBIYU#o-Fco^&UO5Ugxjkt^40s*X#2?
zQZGFCgnj0BwN4NVJu!$m9X!(G86u|Z?ecW>;_N?bzRq9a$@;UpUZ4Mwdf~w*)RN!T
zTGw0O+2D~L&k!+PZ<nX5_w6(5b^gllFaEZ=UZ4Mwdf~w*)RN!TIzg<PZ~YDo9_jH6
z5!3Z{dDfCo{WC<~82>Vspqp^>qxyLCZUYYv{=)g)M&yk)HtOi1`aSl`pz$Z~{VHC>
z{wm~8zhkS{>@Pz8!5=I0ho0@PV=wrGo|TPu3xmfX=5+8#k7tP3R^-WgJe!Ecz0Z3*
zJBWQ9<=Kinq2IpGzOohbc{t*JyYq{${(e{F$2@C)+CR@F|Df@&=r{W@PpQw>ig|vl
z-i6*{KG=BId(6k{cs_@S#l1i1@f;%-&#>z8oFW$Y{-no)v!Lz|&O*DNXg+CuAD%p)
z+W++V)c&W>r}jU6KIt>&<BoKDH!vTsX+9~<XUT*4q%@x;59X87e3m?zPwIlvU0w1V
z+JWF-=&NwXHW&VX<5p+jKZ(o(mzsC_!$5vWzIoMh*>-QrcKx@>wrfeNOYm3=@d_NZ
zw|+x>4;)3bJR^PpjtW{nB))G4Db?$G()v+vO7%KEZ|nOoNU7cwf84d}i)35nL%m9)
zUZqj5n{~S$^(u~f-3(ifdKE{#Db=fZZU-ih)>n&xd&hVAz^Pux=TW^Tu6I|q)q2#U
zHTbJ2{~>V${xaeR#53^EBfd|32(IcOZtO_bqxD;h5z6W6t-#g!Bz^*}&xc-r46f!)
z>of4z(euuUpMtA-6E}8D`NwMi``~*2njeCH5<P!+p?L=WRm6WKegghw#7~Kzg6sY3
z{KihGe-qmOL|#PscNUswSg-5T`~+O@U-MINy+6&*!PWVs=No%Q`#)3r&%pKmG~Wc*
z&)2-Pqm<64x~~V|I{!%VLvT8uitmHd`BZ!g{#n%jy@k$y4*o^NPl)f?G3|e>^6!G{
z`D%{yOZ(S+3a<C3c>%8O|3dRUJE8m&wZAQJonLdDk5s*%7Mkzakr_t)DHkL4{@Vhl
z{wcl*uJh~l1-L$+%*XWnG36iY_v1oy)KAZ!`1S4I2o|Ro1qz~3EajN7i78>jEaZQL
C^4EL-

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.elf
deleted file mode 100755
index 33f5143237b2608b0fcdae4c633076f2bc60cb03..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4132
zcma)9eN0=|6+h3<_&Hgquld?7gz`d2a2f+PI0>esHH=kJ=AdL-BDO^}#x}tU7={gV
zHf4DYv}u||B~c^Ys*TDjb<?CtGt<Tpe|am3(yHAOl}M@9NnXmdO<Of0wUSFoymS5D
zGY&~1N4mb}ch0%zo^$TUyFU3X@3Vp+fKCQ5=a379n=Ak#_AZdY05*`o2;5r1rqkpG
z0Nx5vK(HPBM=Yr8@_SQ0(0HQ&%m^Kj+92h$+cyK;-Y3eiXop!3z>B~+YJ8r|;nZee
ztx1e(p=~m8%KUeI)DHz2WGPM65Hu+T8xIJHk4$F(e%8Gc9z!f-y|c@rg;h!x)@rk`
z(rp&DYm0>)s<g0?$1UtqsinmVjY$C#yKj#YNPH+LHw&dUK;H3*pnSeTGMMe?E0v+<
z>M~@IsJUI^?8I0o-m^cY+_3|bfuv=H(%(Tdb-^>LE#{8Vv5}<3a6MTdBu=HVZxg!9
zGPLP4%R<|eQwKblp9iq!!1Kc@sKiRIp?>GoclJ)J0vU#+414M`t%#~;fSEC|d}eH{
zoRz-6j=$@V=I;gc_kvi#O6SquzD9hvP+zxxe5cXh=`r;8G1}L!;V*^yhV}iOL4Rj>
zz6)r-R{j2k`bVhW*|GAOvm(a#U$nPReeb}-*CQ=Yl=FNKcdfG?=f#Sd^O$cz8R7YM
zb$^vVQo-~6LEk$5UKy*Hc||N|1?A|^Hg#m^EB#@7JEoqC&uaUn=nHz`eTiPUj%^;>
zEo>iSOG)#c2KnML{#F;=o$su!xp+5TyQLyt>)af#ZL5se-k|fHMOz<B$7^3JkJtXH
zBL2(S&GCP}hnl}uF8q0VXBVBFDgLy)(Imf=Ug%7`F4A$qGo)=?pv1$^4?k5O8cc0a
z1fcC}3Id*<Zte6?^CWWP0Ns2fHG?L{{0Nv0AZzDgntMiwZtk+eX9+7*$PxMXQ@g)7
zBD)i_X;!9?C1WWlhi*!8$}YTW>PRuK*V^I-@@v?Xcskv3A@jfGDo9Anc-n1-R8``w
zFOl18kRh@C%On3jm#AM_ioFcVr4J+vOvm3AjIdd~J-!WO9B&5$4^g9Rd@l*9Dm^|A
zMla|&vTg5@SpqgeMss1d3;OzDBiI&ayYBVX+4v1=-|;D)a{C|8o2u`!y16p8c$F|2
zoAMrE-@jIbwY-Iq<p}3Pl+N7`ctfn*5?7QW)K6sovYgT5xftB5#ok$hzNpx&yXrtk
z9$3>~frn680Q>6`uzi^gY$7aVw%TimVGFRwy;iJ6x9Cv)v9fW(>WpM|A~T59#h5x^
zQUry$Ojfq@8rpg$@P-EpUk6s`v9bcp@$Dp8Ykn#L;q|X6HmW&PczF)Z`kZ{`wZ^jo
zB|Co3X3mbsu?Z1xV3m0O$&;8vN;K=Tb^Mh+Uv%-HM+Pt%*|Ckd7d*E_Tt$`Ce5q|x
zvj6;?U;-v6R@VA5VXat!!f`?u_gUGtIoz>`4`H>kRRJ@vib`5NFypzcMr}Xg9I2I<
zAB<1`{;`rj5k`R5laIF<R>p%dmouI)o^Ni(2mRf*pD<tUZ(<|Hr}^XG(=5p#Kb@vG
zu1M6KlIYaCc$a)2(I4HA=)s#3b$%pK>))gYzsESg2Ujykhz}U=uRq4G2qtYlX5VAA
zU#);@0qaTV(zul_J%P1r<JBKBXns0CHYq1r-BmJ;zTe<y1a}~Hc&zjjkuV`y%qsU;
zY3&nh?zgmDN-<yb@bL4c>3Mtw`ufm>!D@f=oFLs}C*C6LM2(eMyo5D$SXtFK39~3A
ztm;e&J27dciE~!wz;!2v66V4FL^q*{cEX^Y@H*LrjY6@3wYH%Kr6EAeQ#@V|@E43$
zk6HEp!|d4QcPxgX#ovmF(`j~eS+X099+961j1R|X_WJ(dg`)1cl}8uQ*WP)^`gp^U
zW1-;|+3W3)o$h)^J)ZACzdv$R9t{mg!-Ek`D%<*Ev7u;7L&MQ<tZ$^JJ}@}Y&=u|v
zN8%0r{i6eQ{%}a{X$k}#?qE|>$RBKSxOcmngQ38lpexwq>}l$81)T0s(CzAV@8Hqh
zPeL9v!ktVp;$%)HKn3!tJiIl>^Vidz41YOw^9O2{1bok;JwB^c65t!iB=Ynfh?BYM
z2(TZqBlmz6;4s(cVIN{S&%O^a$<y;?8p^}R5qIa=pF*6-!)FoWQ&%GaUPVsH!<P|H
z=HXxHrr7g*f3phz0kH$$=qd?t1sNIJ0c=Gez(?3SupSfGIKGXT$D9;Z{2%1Os0K*D
z1dAMdR`BN>D=T;jVG`rxKDa+oTR>UEU(4bZ`x?YNKHa_nG3Pn)@h-%27V!sw0Bx)A
zcUR%(R^iAh{L@wVMZ{oK7l`L~mSfKfp5Rzn!B8Iy#bdy?`9LUwO!W9b%-@53Uo<wX
za=G3gIOdCne-erUc)f@BcZB`@p+Ic^U~r^A1ohtbcBjv&^^Gtv(jV<@st;rZ4&N@V
zZ_@fbTHmM(oDNOm)J>hbja%0=>WnUF)Fq8B-CwiLxOoV{Fb^Rx9COwOcqqLik$|r!
zbTk}sg3pJCxpBxB4fy;0!@k~Ncob!7+z=QX4*49uM(7#o&Hgw!=THsMD>BiTe>eud
zD0<+WZ`9vE67qHI=P+;#d;|V)#21c)W57A^1!9B4QGFDb`~1Vh{^PntRfi%$$PUt!
zSsctCsRAy?fOhd87yoI>?mG`I@1qjm3cRVi0k;M6{Qs{`L}U2w=5`JZ%lO^=vBvXG
z_rvWCd2)t!lPKWV5r6jLk<9+D#=9HvIOeFi5MWOZss1>u#+Gk4h<2CwWE$!0lZZL5
PXmTU|tLAg+dcFTY7d$5=

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script
deleted file mode 100644
index fc4f0cf..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ld.script
+++ /dev/null
@@ -1,72 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-/* No tile with memory exists to the south. */
-. = 0x40000;
-. += 0x10000;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-. = 0x60400;
-buf5 = .;
-. += 0x400;
-. = 0x64000;
-buf4 = .;
-. += 0x400;
-. = 0x68000;
-buf3 = .;
-. += 0x400;
-. = 0x70400;
-buf2 = .;
-. += 0x400;
-. = 0x74000;
-buf1 = .;
-. += 0x400;
-. = 0x78000;
-buf0 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_2);
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll
deleted file mode 100644
index bf98238..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.ll
+++ /dev/null
@@ -1,158 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-; Function Attrs: noinline
-define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
-  %3 = call float @llvm.aie2p.inv(float %1)
-  %4 = fmul float %0, %3
-  ret float %4
-}
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_2() {
-  br label %1
-
-1:                                                ; preds = %76, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %76
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf2, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = getelementptr float, ptr @buf1, i32 %3
-  %9 = load <16 x float>, ptr %8, align 4
-  %10 = extractelement <16 x float> %7, i64 0
-  %11 = extractelement <16 x float> %9, i64 0
-  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
-  %13 = insertelement <16 x float> poison, float %12, i64 0
-  %14 = extractelement <16 x float> %7, i64 1
-  %15 = extractelement <16 x float> %9, i64 1
-  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
-  %17 = insertelement <16 x float> %13, float %16, i64 1
-  %18 = extractelement <16 x float> %7, i64 2
-  %19 = extractelement <16 x float> %9, i64 2
-  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
-  %21 = insertelement <16 x float> %17, float %20, i64 2
-  %22 = extractelement <16 x float> %7, i64 3
-  %23 = extractelement <16 x float> %9, i64 3
-  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
-  %25 = insertelement <16 x float> %21, float %24, i64 3
-  %26 = extractelement <16 x float> %7, i64 4
-  %27 = extractelement <16 x float> %9, i64 4
-  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
-  %29 = insertelement <16 x float> %25, float %28, i64 4
-  %30 = extractelement <16 x float> %7, i64 5
-  %31 = extractelement <16 x float> %9, i64 5
-  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
-  %33 = insertelement <16 x float> %29, float %32, i64 5
-  %34 = extractelement <16 x float> %7, i64 6
-  %35 = extractelement <16 x float> %9, i64 6
-  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
-  %37 = insertelement <16 x float> %33, float %36, i64 6
-  %38 = extractelement <16 x float> %7, i64 7
-  %39 = extractelement <16 x float> %9, i64 7
-  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
-  %41 = insertelement <16 x float> %37, float %40, i64 7
-  %42 = extractelement <16 x float> %7, i64 8
-  %43 = extractelement <16 x float> %9, i64 8
-  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
-  %45 = insertelement <16 x float> %41, float %44, i64 8
-  %46 = extractelement <16 x float> %7, i64 9
-  %47 = extractelement <16 x float> %9, i64 9
-  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
-  %49 = insertelement <16 x float> %45, float %48, i64 9
-  %50 = extractelement <16 x float> %7, i64 10
-  %51 = extractelement <16 x float> %9, i64 10
-  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
-  %53 = insertelement <16 x float> %49, float %52, i64 10
-  %54 = extractelement <16 x float> %7, i64 11
-  %55 = extractelement <16 x float> %9, i64 11
-  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
-  %57 = insertelement <16 x float> %53, float %56, i64 11
-  %58 = extractelement <16 x float> %7, i64 12
-  %59 = extractelement <16 x float> %9, i64 12
-  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
-  %61 = insertelement <16 x float> %57, float %60, i64 12
-  %62 = extractelement <16 x float> %7, i64 13
-  %63 = extractelement <16 x float> %9, i64 13
-  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
-  %65 = insertelement <16 x float> %61, float %64, i64 13
-  %66 = extractelement <16 x float> %7, i64 14
-  %67 = extractelement <16 x float> %9, i64 14
-  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
-  %69 = insertelement <16 x float> %65, float %68, i64 14
-  %70 = extractelement <16 x float> %7, i64 15
-  %71 = extractelement <16 x float> %9, i64 15
-  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
-  %73 = insertelement <16 x float> %69, float %72, i64 15
-  %74 = getelementptr float, ptr @buf0, i32 %3
-  store <16 x float> %73, ptr %74, align 4
-  %75 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-76:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare float @llvm.aie2p.inv(float)
-
-attributes #0 = { noinline }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.o
deleted file mode 100644
index 728d41f24858dcddcc40661dd9b21afbd5fbd4eb..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2048
zcma)7VQ3R)7=AC8G*_738A<t;SU9`QRGZ$~>WV00Qk?Tg4MC=qDW<WetSs8nB<1|G
zbxpO3lUCW_M1=ku`!f;{3H{X{V~inESvCayQ6;E96e=i0&}Hv?_g#CG+4P0)-uHc;
z=brn%yYKG4`@Hv{AP8WG0Ft8vFd6_vBzKvDR%nD3EkSb;lUND^bAPY=hNJ|>zIhp-
z5}hqfNRLb!561-+RBx35C^iJZpp|!YX0-<JZGRhVL;<SN)3_<hMzdEo_O;4}{}b8x
zY^!V>ZI+GPhqCdLU*59`I#68bemtf?L4@(YH+Z9fbqCh38@(<mjPzoSNi9uSU@)nN
zS7F7yLHswocl;$@M+K#-jx%-(cC%fu-_p9xr5d?;*|jk55egGk%u%7gsm6A`k}J_2
z#drkQ7cteeKAf=?hW`)b+logH-L?P~M!Yp+?<=p>TFwBaycj6u^8v$ur;gvk`uxtL
z-+3`;_{*pduQ9*-$hX&@-xT^y<<ajU>a{ie7Lot7zTYMEyF}})pnk3U_YC=U+`r5D
zK<ToG^Lvi^aPiCd|38naIuoGv9_y>~Jg$hr(iL2<r%9pp_VvHdPYKd`zxuk4-*i4$
znid0wCxCvf#S<g%^~3pXFP_xr_<r%dVOQ^X+0_LcWgPc$JjAi+EyrBywYPX|@x3d@
zT0Xw^P7iGj>Y>gpdMMhghwia*%=gj8svf!?&_llj^`Gaq=uiJd=0S7i?(McdHdosG
zBGA>KUaVGPg&QK96e8AD@yL{j`Tk(}m#ZNPJ^;S28NT^U>gR6{lV-7+<mWL7IKFJe
zi*5(thL^P|?BmOr2L`boar7VXnlwKA^7Jip)`#O9@8|KO<O2^RSMd|8`sr0YwJwhZ
zYM9#i8@U_ncFDpIP~B+NKVTBvR$ZXP>n9G}{3LR`TUgP@v9|RW9QdpQ|HpwZIq+?+
z6`u&!w$GRYpK;*7JMcvZp2hnl&<7l25_3CW@2`ZDjzSwvrIT7VrDwH7A~}@m8cAdZ
zlIi4VV(|3P7?=!bXNR*X?Tgsa_DnW8@D23#9y@$sD49+TWDgIY&ZSeJWwN8$<SESO
z&QMMy&g9aW!JTj_H`oPQZ%<EW0`tnn28Ksdi4Me2(8&dz)O1+$TF3DaYWVU@wHyCe
zKRV8%TS}qgpc-j2B0Sqk;RW%lb+i}luyv%NM;_tC5;nAVX$#SOh@(D(!|6AJJ?&jP
uY`?PzIQ`yYBjJhUN88o*+lDpy*%2qg5jID2wjVFiq?^S|Je@*=uKgdN+&{(u

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.opt.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.opt.ll
deleted file mode 100644
index 9cfe6d1..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.opt.ll
+++ /dev/null
@@ -1,129 +0,0 @@
-; ModuleID = 'air_project/div_kernel_0_core_0_2.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf0 = external local_unnamed_addr global [256 x float]
-@buf1 = external local_unnamed_addr global [256 x float]
-@buf2 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nofree noinline nosync nounwind memory(none)
-define float @__aie2p_scalar_fdiv(float %0, float %1) local_unnamed_addr #0 {
-  %3 = tail call float @llvm.aie2p.inv(float %1)
-  %4 = fmul float %3, %0
-  ret float %4
-}
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #1
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #1
-
-; Function Attrs: noreturn nounwind
-define void @core_0_2() local_unnamed_addr #2 {
-  br label %1
-
-1:                                                ; preds = %76, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %74, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf2, i20 %4
-  %6 = load <16 x float>, ptr %5, align 64
-  %7 = getelementptr float, ptr @buf1, i20 %4
-  %8 = load <16 x float>, ptr %7, align 64
-  %9 = extractelement <16 x float> %6, i64 0
-  %10 = extractelement <16 x float> %8, i64 0
-  %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10)
-  %12 = insertelement <16 x float> poison, float %11, i64 0
-  %13 = extractelement <16 x float> %6, i64 1
-  %14 = extractelement <16 x float> %8, i64 1
-  %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14)
-  %16 = insertelement <16 x float> %12, float %15, i64 1
-  %17 = extractelement <16 x float> %6, i64 2
-  %18 = extractelement <16 x float> %8, i64 2
-  %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18)
-  %20 = insertelement <16 x float> %16, float %19, i64 2
-  %21 = extractelement <16 x float> %6, i64 3
-  %22 = extractelement <16 x float> %8, i64 3
-  %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22)
-  %24 = insertelement <16 x float> %20, float %23, i64 3
-  %25 = extractelement <16 x float> %6, i64 4
-  %26 = extractelement <16 x float> %8, i64 4
-  %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26)
-  %28 = insertelement <16 x float> %24, float %27, i64 4
-  %29 = extractelement <16 x float> %6, i64 5
-  %30 = extractelement <16 x float> %8, i64 5
-  %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30)
-  %32 = insertelement <16 x float> %28, float %31, i64 5
-  %33 = extractelement <16 x float> %6, i64 6
-  %34 = extractelement <16 x float> %8, i64 6
-  %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34)
-  %36 = insertelement <16 x float> %32, float %35, i64 6
-  %37 = extractelement <16 x float> %6, i64 7
-  %38 = extractelement <16 x float> %8, i64 7
-  %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38)
-  %40 = insertelement <16 x float> %36, float %39, i64 7
-  %41 = extractelement <16 x float> %6, i64 8
-  %42 = extractelement <16 x float> %8, i64 8
-  %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42)
-  %44 = insertelement <16 x float> %40, float %43, i64 8
-  %45 = extractelement <16 x float> %6, i64 9
-  %46 = extractelement <16 x float> %8, i64 9
-  %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46)
-  %48 = insertelement <16 x float> %44, float %47, i64 9
-  %49 = extractelement <16 x float> %6, i64 10
-  %50 = extractelement <16 x float> %8, i64 10
-  %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50)
-  %52 = insertelement <16 x float> %48, float %51, i64 10
-  %53 = extractelement <16 x float> %6, i64 11
-  %54 = extractelement <16 x float> %8, i64 11
-  %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54)
-  %56 = insertelement <16 x float> %52, float %55, i64 11
-  %57 = extractelement <16 x float> %6, i64 12
-  %58 = extractelement <16 x float> %8, i64 12
-  %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58)
-  %60 = insertelement <16 x float> %56, float %59, i64 12
-  %61 = extractelement <16 x float> %6, i64 13
-  %62 = extractelement <16 x float> %8, i64 13
-  %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62)
-  %64 = insertelement <16 x float> %60, float %63, i64 13
-  %65 = extractelement <16 x float> %6, i64 14
-  %66 = extractelement <16 x float> %8, i64 14
-  %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66)
-  %68 = insertelement <16 x float> %64, float %67, i64 14
-  %69 = extractelement <16 x float> %6, i64 15
-  %70 = extractelement <16 x float> %8, i64 15
-  %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70)
-  %72 = insertelement <16 x float> %68, float %71, i64 15
-  %73 = getelementptr float, ptr @buf0, i20 %4
-  store <16 x float> %72, ptr %73, align 64
-  %74 = add nuw nsw i32 %3, 16
-  %75 = icmp ult i32 %3, 240
-  br i1 %75, label %2, label %76, !llvm.loop !1
-
-76:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 53, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: nofree nosync nounwind memory(none)
-declare float @llvm.aie2p.inv(float) #3
-
-attributes #0 = { nofree noinline nosync nounwind memory(none) }
-attributes #1 = { nounwind }
-attributes #2 = { noreturn nounwind }
-attributes #3 = { nofree nosync nounwind memory(none) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll
deleted file mode 100644
index 61bace1..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_2.peanohack.ll
+++ /dev/null
@@ -1,158 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-; Function Attrs: noinline
-define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
-  %3 = call float @llvm.aie2p.inv(float %1)
-  %4 = fmul float %0, %3
-  ret float %4
-}
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_2() {
-  br label %1
-
-1:                                                ; preds = %76, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %76
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf2, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = getelementptr float, ptr @buf1, i32 %3
-  %9 = load <16 x float>, ptr %8
-  %10 = extractelement <16 x float> %7, i64 0
-  %11 = extractelement <16 x float> %9, i64 0
-  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
-  %13 = insertelement <16 x float> poison, float %12, i64 0
-  %14 = extractelement <16 x float> %7, i64 1
-  %15 = extractelement <16 x float> %9, i64 1
-  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
-  %17 = insertelement <16 x float> %13, float %16, i64 1
-  %18 = extractelement <16 x float> %7, i64 2
-  %19 = extractelement <16 x float> %9, i64 2
-  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
-  %21 = insertelement <16 x float> %17, float %20, i64 2
-  %22 = extractelement <16 x float> %7, i64 3
-  %23 = extractelement <16 x float> %9, i64 3
-  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
-  %25 = insertelement <16 x float> %21, float %24, i64 3
-  %26 = extractelement <16 x float> %7, i64 4
-  %27 = extractelement <16 x float> %9, i64 4
-  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
-  %29 = insertelement <16 x float> %25, float %28, i64 4
-  %30 = extractelement <16 x float> %7, i64 5
-  %31 = extractelement <16 x float> %9, i64 5
-  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
-  %33 = insertelement <16 x float> %29, float %32, i64 5
-  %34 = extractelement <16 x float> %7, i64 6
-  %35 = extractelement <16 x float> %9, i64 6
-  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
-  %37 = insertelement <16 x float> %33, float %36, i64 6
-  %38 = extractelement <16 x float> %7, i64 7
-  %39 = extractelement <16 x float> %9, i64 7
-  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
-  %41 = insertelement <16 x float> %37, float %40, i64 7
-  %42 = extractelement <16 x float> %7, i64 8
-  %43 = extractelement <16 x float> %9, i64 8
-  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
-  %45 = insertelement <16 x float> %41, float %44, i64 8
-  %46 = extractelement <16 x float> %7, i64 9
-  %47 = extractelement <16 x float> %9, i64 9
-  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
-  %49 = insertelement <16 x float> %45, float %48, i64 9
-  %50 = extractelement <16 x float> %7, i64 10
-  %51 = extractelement <16 x float> %9, i64 10
-  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
-  %53 = insertelement <16 x float> %49, float %52, i64 10
-  %54 = extractelement <16 x float> %7, i64 11
-  %55 = extractelement <16 x float> %9, i64 11
-  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
-  %57 = insertelement <16 x float> %53, float %56, i64 11
-  %58 = extractelement <16 x float> %7, i64 12
-  %59 = extractelement <16 x float> %9, i64 12
-  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
-  %61 = insertelement <16 x float> %57, float %60, i64 12
-  %62 = extractelement <16 x float> %7, i64 13
-  %63 = extractelement <16 x float> %9, i64 13
-  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
-  %65 = insertelement <16 x float> %61, float %64, i64 13
-  %66 = extractelement <16 x float> %7, i64 14
-  %67 = extractelement <16 x float> %9, i64 14
-  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
-  %69 = insertelement <16 x float> %65, float %68, i64 14
-  %70 = extractelement <16 x float> %7, i64 15
-  %71 = extractelement <16 x float> %9, i64 15
-  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
-  %73 = insertelement <16 x float> %69, float %72, i64 15
-  %74 = getelementptr float, ptr @buf0, i32 %3
-  store <16 x float> %73, ptr %74
-  %75 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-76:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare float @llvm.aie2p.inv(float)
-
-attributes #0 = { noinline }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.elf
deleted file mode 100755
index a92e4ae85366d8708c15091ef67871349c6409af..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4192
zcma)9eQZ<L6+h2*{0wXA>p-&G1j-|P#1tp7V*@cAtwXa4F@s<mDBDmS$2P<wakAK9
zW)tfxNlPi922`Zmb%?B5H%*!}XWCelzq~adTD2RfAW^N8Jj%39TeTpylA8vybNt@3
zO`r)!y6>LfIp>~x&bc4He*WvhXC+AjgG^v6A|DF3*#XSh`#=E`I6(#rkhO}<pwUGD
z)GAQGu<iawDr)G;duut+d5Z+X0_~7nBp0=7mjT?|ZdPE*1rq^)mw-5Kd4Wyh)HV=X
zvKZB^w$aQ{+uw~*-;)$j<UH3x(4`!l6c7@P%+CV+xN|)`idZUm7w61&v0S!`El#_r
zUSk&<R@%j$<#sXnm|dK$wr_SoOICu+#+!#2WImA8>y>gFAn*QIQa@WHn`|!hm7C?Z
z`Wj@AxNWP>xtOKWeAo4)dfNp;0hU)(&VL83+y_r<wwOCcCkC^2)0J$6lsTHmzD??^
zndRP3?YG*V7~2)V`~rYA2g(ntppmG)jQaIs-`sXV6DZIx&x$8M)r+Wk1{fbQ*NzVj
z)r#u(7VvlF;rzXb{$4cKiRx=;@0usR8>nwwIKE@(@7NIf`v~ou=JA(9ee=Tpj-$Wh
zl<zIHpRa!ZLj6P3@5E5;_z5${_g}PkjeUF9gV!UkjMP%Tdpj0bk5lHl@l%*@MGd2T
zJ37D2AFHE$zuUclzgLIq#$Pqpii%qF=N#MD|E2ygzI9{Ir6=_LQuR4M{hrKEU%_?_
z+YM|VVav(acAAtkGx)8qx^r!3{pvG!(hV!?(hZ(v>4vuD>4vNP+Rmy~kLJ@2uh*s<
zepQ$L<;1e|Ki@^oUzgwd^M&;td}4gbC$%kB<>maXotZbxd{_$f>l+t13()zYQ}uz#
z+6Gks`o88M;pyqrPY<_^A~y`sNh7%pbUEh7z-9tPKMxnkGeUH8M^rw`M5W61DZd}v
z`1wA?pP9&u8kJ33szKdzT~=}~={0M6P6UIF&0%1_hDS0_<u{+6{ol+A$jCEz+MTA{
zip-l|Ah*q=KxXY1`~H10(=<JudIi+8@5^?$kbX<Dz%uRj_y(|Hyd6vwqE3bQUY2qz
zjQ9c=y=3Gl*1W?e7&s*b&83MB=<b2V;GCN1xZB<6q#M+=?h`!au0Ni#*5470lQm-M
z5)%qG^<5^ubGZs@c>^OW63&MxU%VgihA7<<7u71%kIepMX4Z)3Omv$b`_854i<?^w
zSMBI10Q35*2r#Zl;Cf>O*3O9DN0^A&9In;Ov=YQ)K?l~N)9lv#iJD<18ZB&MWVR2h
zi!rsss2Nn@vpU53%V--I!5bbpd=*4xz#%Fy$G5U<zWKQfjOxF7iABqyE=W1F8gtUj
z^Nkl3YGM4O(^eRdV>4#Ffi<H1v(IA=IkU}>E#R;Eg{m{V0}6oEA`ULby%4xz##Piv
z%a=PxW!KM6N>&h(>JVFAVPXqbpmLb;sqGH2W)gQS;yqZcLRG*<RnbVV2WC9E%A)Tl
zoFi9?`TqEf-#@YRPfReNdeV5OsWcue#hmekQNG0)@Ar4le!_go-^gN&PxnW^rv*|#
zc`DDZUX-~%C-bp)@Gf~@=HI_6^WE2F?)gyWj=#zG{~qJ~0bI=@A>L)VxBeKvDp~dU
zglm(-b*T>OC9Ego)58uv{W#XHjjG>o(*3l9VpX3PjhEO3^!-yhBe(;(JK*3So0*WZ
zOT_Z+4&Lziy!$P$RCCN1Jv{h)>3WK<!dM@kF*#i4PfGG#apX-Vj;wYFdyt9dc86H;
z3=?*BsaSD*sW>v~;F*&S;l_1mrllf){gF<_Gh3N}t&HmAk`_x#OkztLYET*jv^+}j
z27q2LdOaqz`wz1d=iauP`lo(t&K%2&12eMAWC@t*JYak{Mz=Ti2NjC@Crghm;IF^^
zfb|K+69;1ho0VX&UGeyv+)a4CBR%270p(C^AQ|sV=u*Ymol5m5H#avQh^M*-yP6_>
zz0Do*o_HeN+|zTYw=o=#DP7)3)a{RYy|HlA>-KN-wMJu+O;KOe>*?}#`63>FEb8~|
z_phVq?j@-VTHyBV62#e}OoBS(b7go{k*C+w?OA#`b<zX1KoY)Z(H@^w8cFaqWHM#?
zcEs6YbtKq<*j;?UO0bvoWjKUbDYNfJ%*yn%O#Nl}FyhWK`=f|6W%vYQeCp~X!E4B=
zW%wN8(K7rC!xVeU_cwF!9}v6ojjoXd7m-o0?ZQ?C5`2ig8|yKGjqpvx6m!<B;r}2H
z7A-&qR+u6jDB;ftt0g>*FpKe#5AtW$7f{jhR|>dfzZx;cXV^C*CZ34K`w%MyL=ON7
z+UDSI&B4#j!HGHeM|1E?h{2+52+HpS;XnzG5LQe0Y{6bzP_q9?!M=pg7wk(Ino_ZJ
z3g}q$#uCUR5BH|RUD$UgQv(`T?8A|Rp=A7ru_Qn+xOYc;JlqqDq;~X02YX`B6x_Pi
z6Y}VN3-k{5B=>upA_ajvv_bE^dcR5UTMU86txG(HsmHMK8=4k_F(fU9q{V0WYc&`@
zg%FKX2$6x5rzt|A>>o@-LS3-~@q`CLAw2vo{h?$e+!G!M?T^L}p-hV#B7Fm~kUQjs
zuEG5q=;Nc0S2IAb$RtzYffR(2=z+M<p>WS&EY!Y(VB{c#dc*NVD4vL?fH(+6QhftS
zV-(3l;emnhVMC&+V~Hpf2I<NIj+T$q0Ld|+7W#PT<EBq5eKyf&V`19{aCbZu(YBzb
z?F`&f$kTskgP6|H$sjwojurgP{ejL?XZRs|Q<+?#-4ztjRYp%{+OcGV_deiJE>d?T
nz@{S7{1L3jR&IA5?S4*^>141^ASPbb<<H<ha+=f78~y(Q$lEV}

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ld.script
deleted file mode 100644
index 6120a88..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ld.script
+++ /dev/null
@@ -1,78 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-. = 0x40400;
-buf2 = .;
-. += 0x400;
-. = 0x44000;
-buf1 = .;
-. += 0x400;
-. = 0x48000;
-buf0 = .;
-. += 0x400;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-. = 0x60400;
-buf8 = .;
-. += 0x400;
-. = 0x64000;
-buf7 = .;
-. += 0x400;
-. = 0x68000;
-buf6 = .;
-. += 0x400;
-. = 0x70400;
-buf5 = .;
-. += 0x400;
-. = 0x74000;
-buf4 = .;
-. += 0x400;
-. = 0x78000;
-buf3 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_3);
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll
deleted file mode 100644
index 666390f..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.ll
+++ /dev/null
@@ -1,158 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-; Function Attrs: noinline
-define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
-  %3 = call float @llvm.aie2p.inv(float %1)
-  %4 = fmul float %0, %3
-  ret float %4
-}
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_3() {
-  br label %1
-
-1:                                                ; preds = %76, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %76
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf5, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = getelementptr float, ptr @buf4, i32 %3
-  %9 = load <16 x float>, ptr %8, align 4
-  %10 = extractelement <16 x float> %7, i64 0
-  %11 = extractelement <16 x float> %9, i64 0
-  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
-  %13 = insertelement <16 x float> poison, float %12, i64 0
-  %14 = extractelement <16 x float> %7, i64 1
-  %15 = extractelement <16 x float> %9, i64 1
-  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
-  %17 = insertelement <16 x float> %13, float %16, i64 1
-  %18 = extractelement <16 x float> %7, i64 2
-  %19 = extractelement <16 x float> %9, i64 2
-  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
-  %21 = insertelement <16 x float> %17, float %20, i64 2
-  %22 = extractelement <16 x float> %7, i64 3
-  %23 = extractelement <16 x float> %9, i64 3
-  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
-  %25 = insertelement <16 x float> %21, float %24, i64 3
-  %26 = extractelement <16 x float> %7, i64 4
-  %27 = extractelement <16 x float> %9, i64 4
-  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
-  %29 = insertelement <16 x float> %25, float %28, i64 4
-  %30 = extractelement <16 x float> %7, i64 5
-  %31 = extractelement <16 x float> %9, i64 5
-  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
-  %33 = insertelement <16 x float> %29, float %32, i64 5
-  %34 = extractelement <16 x float> %7, i64 6
-  %35 = extractelement <16 x float> %9, i64 6
-  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
-  %37 = insertelement <16 x float> %33, float %36, i64 6
-  %38 = extractelement <16 x float> %7, i64 7
-  %39 = extractelement <16 x float> %9, i64 7
-  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
-  %41 = insertelement <16 x float> %37, float %40, i64 7
-  %42 = extractelement <16 x float> %7, i64 8
-  %43 = extractelement <16 x float> %9, i64 8
-  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
-  %45 = insertelement <16 x float> %41, float %44, i64 8
-  %46 = extractelement <16 x float> %7, i64 9
-  %47 = extractelement <16 x float> %9, i64 9
-  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
-  %49 = insertelement <16 x float> %45, float %48, i64 9
-  %50 = extractelement <16 x float> %7, i64 10
-  %51 = extractelement <16 x float> %9, i64 10
-  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
-  %53 = insertelement <16 x float> %49, float %52, i64 10
-  %54 = extractelement <16 x float> %7, i64 11
-  %55 = extractelement <16 x float> %9, i64 11
-  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
-  %57 = insertelement <16 x float> %53, float %56, i64 11
-  %58 = extractelement <16 x float> %7, i64 12
-  %59 = extractelement <16 x float> %9, i64 12
-  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
-  %61 = insertelement <16 x float> %57, float %60, i64 12
-  %62 = extractelement <16 x float> %7, i64 13
-  %63 = extractelement <16 x float> %9, i64 13
-  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
-  %65 = insertelement <16 x float> %61, float %64, i64 13
-  %66 = extractelement <16 x float> %7, i64 14
-  %67 = extractelement <16 x float> %9, i64 14
-  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
-  %69 = insertelement <16 x float> %65, float %68, i64 14
-  %70 = extractelement <16 x float> %7, i64 15
-  %71 = extractelement <16 x float> %9, i64 15
-  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
-  %73 = insertelement <16 x float> %69, float %72, i64 15
-  %74 = getelementptr float, ptr @buf3, i32 %3
-  store <16 x float> %73, ptr %74, align 4
-  %75 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-76:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare float @llvm.aie2p.inv(float)
-
-attributes #0 = { noinline }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.o
deleted file mode 100644
index 0cbf9dffbcb51eefa1ba59e76899a9a17c429aac..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2048
zcma)7VQ3R)7=AC8G*_73EhFVyV&Uwj>DDxBYb`R7CB<$2s38oMGB>Yjjayk<Lz|Qv
zgRL`lPMla}gBxSef3iO#0g=#O{UKuv869Rrm_Mon^@l<Q2N~$H_r3eB{V22P3*Yy>
z@AEwO-1psm_uc1j4!tEw61X9OB2)lY3xJGdzb)v5R#?*#v=Fz6l_)Up7uq!>6|nX#
zssOdbeCd>O+orKQD(Rs6>kL41Api!QyrL`X4S>%^x?n2`P>r6}53*{t`&DaSr)mXv
zsMhZ7s&%+swF)n)*0(|R_2;1n#ifBe(;AdynE7LiKLMEcVE%Wj-=jp4UT!d@qYV=b
zrVR2ds(H7_|3>%DJi`0vpw;ym#;(8~)(>wuTCcrStFWkgmKJ?d=~NxZgf!CDV7nfx
zwZyCCR2<hAx7D;hoUs#D@E7E}%I_Sw>Hs?A{0(dGW4}{6&H$C79I6zHAuD*jiQm%Z
z{7$3aX*p~KtEi7|Fu$9~cQ>Ek8T30-M88|8k8R+$jQnfO{m!D_Sz2!m^&36E`^azN
z`JF3<D(7UJ-viV~%ZF0`|32#aT!_|tWVp%uI4_4Q=W)HhHjUOhJn}3*Ellfu_roTB
z7mDG^1vzB-Lg?37elPb-Kb+qy<zwao-!FlG*v0F9c5w+?728d0x3Deyt4WW3=?V57
zfv44E$IF+Vnvv~cGt#@wj3nC4$nUJ047{|pZbrTfnUNpE=H-QL=DlB$`K!J5+tsdN
zworNDVW_V~|EgX~mcEzStQ2?diapcf_V<J74Npq~0swqpGyL+IGQz(erp#kD%dcY!
zaD3H>Cj$cDg-4CAu}-aGKHDKD)X(vpGyyz$K3L_PgX2?PFXEvT03Qsm<KJIbkFKk!
z^!OYwhC>(sBKKh4tvL98%3GbB<|}!fyhH=f?+UnmByxN}TGOX6clBQi_`HDsDc~yt
zzSFbj6UW^3nHKOl0smRRmjyhJ&q<;K9Ag&8Zob~12qzsyY%-Hg$MP97A2W>fc&0CB
zOpT?p=}F_n@$qS}8Hjy6k<Y~5P9E-_%BROZf}x=!2ltPsvzf8{!HMIAYzAUe`N@2G
z6vt0a(l}Z;v75Sn>UM!)oGfIgFcEGyHZhqodJM#d1_yhMK0fw3^V&pz05yDhw%UvT
zt8aws=#f(B*eFNZjyNCPLE{PW>vgmj-EeiJp;sQ~#0nO)cWDdJe2AmEfKBw9!<zQ4
v8?Ijt0nzUX77AWSezaX(znz$qpBpJ6gmAeOV(#lDnsoCx5>J;<qkI1cP#r(5

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.opt.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.opt.ll
deleted file mode 100644
index 7485372..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.opt.ll
+++ /dev/null
@@ -1,129 +0,0 @@
-; ModuleID = 'air_project/div_kernel_0_core_0_3.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf3 = external local_unnamed_addr global [256 x float]
-@buf4 = external local_unnamed_addr global [256 x float]
-@buf5 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nofree noinline nosync nounwind memory(none)
-define float @__aie2p_scalar_fdiv(float %0, float %1) local_unnamed_addr #0 {
-  %3 = tail call float @llvm.aie2p.inv(float %1)
-  %4 = fmul float %3, %0
-  ret float %4
-}
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #1
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #1
-
-; Function Attrs: noreturn nounwind
-define void @core_0_3() local_unnamed_addr #2 {
-  br label %1
-
-1:                                                ; preds = %76, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %74, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf5, i20 %4
-  %6 = load <16 x float>, ptr %5, align 64
-  %7 = getelementptr float, ptr @buf4, i20 %4
-  %8 = load <16 x float>, ptr %7, align 64
-  %9 = extractelement <16 x float> %6, i64 0
-  %10 = extractelement <16 x float> %8, i64 0
-  %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10)
-  %12 = insertelement <16 x float> poison, float %11, i64 0
-  %13 = extractelement <16 x float> %6, i64 1
-  %14 = extractelement <16 x float> %8, i64 1
-  %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14)
-  %16 = insertelement <16 x float> %12, float %15, i64 1
-  %17 = extractelement <16 x float> %6, i64 2
-  %18 = extractelement <16 x float> %8, i64 2
-  %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18)
-  %20 = insertelement <16 x float> %16, float %19, i64 2
-  %21 = extractelement <16 x float> %6, i64 3
-  %22 = extractelement <16 x float> %8, i64 3
-  %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22)
-  %24 = insertelement <16 x float> %20, float %23, i64 3
-  %25 = extractelement <16 x float> %6, i64 4
-  %26 = extractelement <16 x float> %8, i64 4
-  %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26)
-  %28 = insertelement <16 x float> %24, float %27, i64 4
-  %29 = extractelement <16 x float> %6, i64 5
-  %30 = extractelement <16 x float> %8, i64 5
-  %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30)
-  %32 = insertelement <16 x float> %28, float %31, i64 5
-  %33 = extractelement <16 x float> %6, i64 6
-  %34 = extractelement <16 x float> %8, i64 6
-  %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34)
-  %36 = insertelement <16 x float> %32, float %35, i64 6
-  %37 = extractelement <16 x float> %6, i64 7
-  %38 = extractelement <16 x float> %8, i64 7
-  %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38)
-  %40 = insertelement <16 x float> %36, float %39, i64 7
-  %41 = extractelement <16 x float> %6, i64 8
-  %42 = extractelement <16 x float> %8, i64 8
-  %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42)
-  %44 = insertelement <16 x float> %40, float %43, i64 8
-  %45 = extractelement <16 x float> %6, i64 9
-  %46 = extractelement <16 x float> %8, i64 9
-  %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46)
-  %48 = insertelement <16 x float> %44, float %47, i64 9
-  %49 = extractelement <16 x float> %6, i64 10
-  %50 = extractelement <16 x float> %8, i64 10
-  %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50)
-  %52 = insertelement <16 x float> %48, float %51, i64 10
-  %53 = extractelement <16 x float> %6, i64 11
-  %54 = extractelement <16 x float> %8, i64 11
-  %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54)
-  %56 = insertelement <16 x float> %52, float %55, i64 11
-  %57 = extractelement <16 x float> %6, i64 12
-  %58 = extractelement <16 x float> %8, i64 12
-  %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58)
-  %60 = insertelement <16 x float> %56, float %59, i64 12
-  %61 = extractelement <16 x float> %6, i64 13
-  %62 = extractelement <16 x float> %8, i64 13
-  %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62)
-  %64 = insertelement <16 x float> %60, float %63, i64 13
-  %65 = extractelement <16 x float> %6, i64 14
-  %66 = extractelement <16 x float> %8, i64 14
-  %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66)
-  %68 = insertelement <16 x float> %64, float %67, i64 14
-  %69 = extractelement <16 x float> %6, i64 15
-  %70 = extractelement <16 x float> %8, i64 15
-  %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70)
-  %72 = insertelement <16 x float> %68, float %71, i64 15
-  %73 = getelementptr float, ptr @buf3, i20 %4
-  store <16 x float> %72, ptr %73, align 64
-  %74 = add nuw nsw i32 %3, 16
-  %75 = icmp ult i32 %3, 240
-  br i1 %75, label %2, label %76, !llvm.loop !1
-
-76:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 53, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: nofree nosync nounwind memory(none)
-declare float @llvm.aie2p.inv(float) #3
-
-attributes #0 = { nofree noinline nosync nounwind memory(none) }
-attributes #1 = { nounwind }
-attributes #2 = { noreturn nounwind }
-attributes #3 = { nofree nosync nounwind memory(none) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll
deleted file mode 100644
index 0c167b0..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_3.peanohack.ll
+++ /dev/null
@@ -1,158 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-; Function Attrs: noinline
-define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
-  %3 = call float @llvm.aie2p.inv(float %1)
-  %4 = fmul float %0, %3
-  ret float %4
-}
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_3() {
-  br label %1
-
-1:                                                ; preds = %76, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %76
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf5, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = getelementptr float, ptr @buf4, i32 %3
-  %9 = load <16 x float>, ptr %8
-  %10 = extractelement <16 x float> %7, i64 0
-  %11 = extractelement <16 x float> %9, i64 0
-  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
-  %13 = insertelement <16 x float> poison, float %12, i64 0
-  %14 = extractelement <16 x float> %7, i64 1
-  %15 = extractelement <16 x float> %9, i64 1
-  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
-  %17 = insertelement <16 x float> %13, float %16, i64 1
-  %18 = extractelement <16 x float> %7, i64 2
-  %19 = extractelement <16 x float> %9, i64 2
-  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
-  %21 = insertelement <16 x float> %17, float %20, i64 2
-  %22 = extractelement <16 x float> %7, i64 3
-  %23 = extractelement <16 x float> %9, i64 3
-  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
-  %25 = insertelement <16 x float> %21, float %24, i64 3
-  %26 = extractelement <16 x float> %7, i64 4
-  %27 = extractelement <16 x float> %9, i64 4
-  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
-  %29 = insertelement <16 x float> %25, float %28, i64 4
-  %30 = extractelement <16 x float> %7, i64 5
-  %31 = extractelement <16 x float> %9, i64 5
-  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
-  %33 = insertelement <16 x float> %29, float %32, i64 5
-  %34 = extractelement <16 x float> %7, i64 6
-  %35 = extractelement <16 x float> %9, i64 6
-  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
-  %37 = insertelement <16 x float> %33, float %36, i64 6
-  %38 = extractelement <16 x float> %7, i64 7
-  %39 = extractelement <16 x float> %9, i64 7
-  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
-  %41 = insertelement <16 x float> %37, float %40, i64 7
-  %42 = extractelement <16 x float> %7, i64 8
-  %43 = extractelement <16 x float> %9, i64 8
-  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
-  %45 = insertelement <16 x float> %41, float %44, i64 8
-  %46 = extractelement <16 x float> %7, i64 9
-  %47 = extractelement <16 x float> %9, i64 9
-  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
-  %49 = insertelement <16 x float> %45, float %48, i64 9
-  %50 = extractelement <16 x float> %7, i64 10
-  %51 = extractelement <16 x float> %9, i64 10
-  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
-  %53 = insertelement <16 x float> %49, float %52, i64 10
-  %54 = extractelement <16 x float> %7, i64 11
-  %55 = extractelement <16 x float> %9, i64 11
-  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
-  %57 = insertelement <16 x float> %53, float %56, i64 11
-  %58 = extractelement <16 x float> %7, i64 12
-  %59 = extractelement <16 x float> %9, i64 12
-  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
-  %61 = insertelement <16 x float> %57, float %60, i64 12
-  %62 = extractelement <16 x float> %7, i64 13
-  %63 = extractelement <16 x float> %9, i64 13
-  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
-  %65 = insertelement <16 x float> %61, float %64, i64 13
-  %66 = extractelement <16 x float> %7, i64 14
-  %67 = extractelement <16 x float> %9, i64 14
-  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
-  %69 = insertelement <16 x float> %65, float %68, i64 14
-  %70 = extractelement <16 x float> %7, i64 15
-  %71 = extractelement <16 x float> %9, i64 15
-  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
-  %73 = insertelement <16 x float> %69, float %72, i64 15
-  %74 = getelementptr float, ptr @buf3, i32 %3
-  store <16 x float> %73, ptr %74
-  %75 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-76:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare float @llvm.aie2p.inv(float)
-
-attributes #0 = { noinline }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.elf
deleted file mode 100755
index b9ef327d81371a030c01aa887037ee689da34b56..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4196
zcma)9eQZ<L6+h2*{0wXA>p-&G1e!<qh$&8DCkA3VT8E_-q6Wb>P`05uj%|oV;$*SI
z%qGSwNlPi922`Y5G1RnDTTPlYXWCelzq~adTD2RfAW^N8Jj%39TeTpylA8vybNt@3
zO`r)!y6>LfIp>~x&bc4HKJrZ9SxJ(>AQRXM$cw^&9l(se7Zfmo6J)ReS*zF#8eIfH
ztqcVW+pd2kBZjWDx0V8(w@4r?&<@!}azVRh8NkhLW(B5PFyRMy35esC7uY0DZ3D46
zgHhdT8%-az{oNS#V@Ux;&T%aSUCO~p0U^=I+$_K^I@iIYh^4%Dao%hf%VoRR?6ix@
z)poIdg<b4kZWn`(*~QsP`z8l8XCz2(xOtF4`eRAGUM{x*^6t+h^~*)F$>u^|*;#I@
zt3n2e+qUSOi&@IecU@1aw_P9<U^zwQ{CCjGz3{YVi@9TTVlZPjUCES5>7zO9+oaB_
zS>E!6{Z`u(V>|trpC7R1K>1-6G!m7UQNM2Nds{DP0tNczS@Gl-dJ#3x0OLdE>hYnW
zYEk*&0{*T%oWB>*-;3rNQF#sRUGv0u1N99H$9D|<9UDS_pQ3%^JpQt%Z(7*jarAeb
z^1X%j^VRQPsDFt1ofxVfKVioB{)_gmu^;Sw@OtEwk!s3!Psal5amrjXehTv~t74RI
zN9Q;BV>OiTkGdD|_v%p1_^ak>QC5xqoMU_YztJDYw|4CL)P%lYD!$^UKa~0DE7-1K
zyMgUfY+3o*4wG_b2ETO`cdqTITXp76s(wXHs@}6KRo}KeRezOV+flLd(Ojzj_3Bjp
zuWC~7PAp6P^8?iUb@{D7Us%_{C&rh2Ufpa}Ue4Xxkv?bU!;-&W-?+e;pUw}Rs*g?9
zHmCs5_caFzPfw?Qdbn*AxnY1#8p&;-%P~I&HWMiNdALBH5u%$rqWoDV%2l>k`Tf|2
zul6dw^h8cnschO(3F_|avXXU4uUXr(A`ozF3IY2yJd%DYx9Rlk|7L0-EzjU-cbc-b
z>9@W{ZmUUw^qQ~t{`+LQae6xW3aDp4lI?II^|oYzW!mlW9I#=$9ZVFWPWkv=ma?@*
ze142xGIA8F-)9pHoRWg((nJSz_rPLsPEB;&?QU?=4eDC^Ii7OYA5U59?udrTDlv75
z2?d+_0TVyGT!FQ`fsqvm=R=e$+z)s|6mN-(Y6a>?X8$rXYs7OVvQ>|L$5Qmg&8>#3
zc68*2dHt378CN85og0BQGotqqCZaZnYZWuC0P$GBfwkx~yET8JYM6-z3!503?ZfI~
zOzkjg232^i4zcbs+WJTEh6fJc22t*Jh%(Ia?F^f5el7!}`mb7I(Q>E>P!6reoHX-%
z<3*X8A3y1|<;UaLv>9(;jVS-j5zHZLwi&Vo{8heCab}la0kB%cfyKBN{5Q<FiW+J8
za_6Y*dh?`Y1tF;pvH2AyHe&_KhZ&#R<`AnVamOOwjn&Fm1#DCmjr4k8#*-^8`hLPW
zvc;J1kI(r16HEWZ1Ouukjdz-g<H1tM8BZAHTbS{FfA{Ps%$NL)EXMeBfAo8rCk2$J
za{TH=nftObAA29~l8<EmldCe{bzSD3Ph{@+n|%N8G0yMD)hrO=otAs+kMXOLRi96|
zHac9FYM@TSdNMve?BLUnW9{0g`u!%|Pdg}9^@wP=#4e!kU(y-D9mw5&2mj2>gp^q#
zmTz<L`p4(pZ#kusW4`F&!RJfYQ+#E{`tY>L;d<kwB;OT>-(uqMDu=KKm}qKuh}!Qm
zVON)m+T%;b;ZX-qpL7T}t~)a=6@KgwcQT&d!USw#R413TSXyEdo7+%>(kP(iQHs|O
z^n%grF`?amn4LKPp54?x^;>iLSWfJpkzFQ>-%RHL<HIq!y|F*2P~10Jd~^YS{k;dQ
zPaqaQ5FOa01On}f$Jgj?#Pc2Q3B~s-2crXtSYKS1D$ed?vOlq@scC;K**)0R81CzB
z>WKBk;;E*do`by&p;%PuY6(Z&zDP?;G!$uZ`!;x6Bhm22h&R&W>1yfnhCRM$#OK}T
zTT9X1OHv86!0p*3h%*J51U1NKOYq79Pp_xjv-EQ6qz7uABz(`JJwB^6lHfbYq)YVe
zh%<%iNU$BTyYPUOU=Qg_a1gOlV&9FJmFQ`i`b+R3#GNJfM-it>@Cn5D)YVCX*N{_7
z@Oi|eCHNh~6no0|H*@eG5WDe>u8{;6kx{Vi#8v?ke1g3j>oJ0j@J+-NbH=RU{~!+*
zEkFiVm?G>i;x7rSMLdl#gYl6M@@Li;P|@+X^0;Wf3NgiJ*f$|2o`}YK5i5B_4*&_;
z=HTzo!OzdZ@j3YCbMQ-u!J=&l%I^eWe-V!mR*U#--d<Z!vOkx%FXA`z_C*Yh$!IDG
zbS!$KabyyQdXu3p?7I`m0gWs4q40rVBKEUr0{?#n_H1vDg?ggl<o3SEU{4ep16#Iu
zf*!qZhTg%R#J-lsa9-dJuGjk(z2B(!&4$3^)+HXp)MMEA3{A7a7?Nf~((E<-wHl0%
zLWsmDgz!Mp(-@{u_6^3v!LI23Slk1_ARhka{$L^;>In@5_eEj{QKrQW;l6=r&>i$b
z*WkWZ`fQ}n2F(DyB9lml29gj=pa<fD2SYuB(O~;_g5d)Y><z`@!B{+&1mYkVPWBBX
zj8P;Hh6V;ghYX3Pj>aR9AEYbuI8r)N10=_Q*3%~+WD9W~PyXwsuP?uG{kT6Kl4xsC
z({=}LE9B|_vq4N}=yZ^sTgM9i?*3HgsWbeLy{SYl(C!Kf=sKfkGwoWk!Mh*uI2Wk9
n9AIMsY5oY-VJo$J1MS|S$#gQ<ClC{_>hfpsUpdWb=#BpWGCVKz

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script
deleted file mode 100644
index ddda3c2..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ld.script
+++ /dev/null
@@ -1,78 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-. = 0x40400;
-buf5 = .;
-. += 0x400;
-. = 0x44000;
-buf4 = .;
-. += 0x400;
-. = 0x48000;
-buf3 = .;
-. += 0x400;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-. = 0x60400;
-buf11 = .;
-. += 0x400;
-. = 0x64000;
-buf10 = .;
-. += 0x400;
-. = 0x68000;
-buf9 = .;
-. += 0x400;
-. = 0x70400;
-buf8 = .;
-. += 0x400;
-. = 0x74000;
-buf7 = .;
-. += 0x400;
-. = 0x78000;
-buf6 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_4);
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll
deleted file mode 100644
index 678847a..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.ll
+++ /dev/null
@@ -1,158 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-; Function Attrs: noinline
-define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
-  %3 = call float @llvm.aie2p.inv(float %1)
-  %4 = fmul float %0, %3
-  ret float %4
-}
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_4() {
-  br label %1
-
-1:                                                ; preds = %76, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %76
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf8, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = getelementptr float, ptr @buf7, i32 %3
-  %9 = load <16 x float>, ptr %8, align 4
-  %10 = extractelement <16 x float> %7, i64 0
-  %11 = extractelement <16 x float> %9, i64 0
-  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
-  %13 = insertelement <16 x float> poison, float %12, i64 0
-  %14 = extractelement <16 x float> %7, i64 1
-  %15 = extractelement <16 x float> %9, i64 1
-  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
-  %17 = insertelement <16 x float> %13, float %16, i64 1
-  %18 = extractelement <16 x float> %7, i64 2
-  %19 = extractelement <16 x float> %9, i64 2
-  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
-  %21 = insertelement <16 x float> %17, float %20, i64 2
-  %22 = extractelement <16 x float> %7, i64 3
-  %23 = extractelement <16 x float> %9, i64 3
-  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
-  %25 = insertelement <16 x float> %21, float %24, i64 3
-  %26 = extractelement <16 x float> %7, i64 4
-  %27 = extractelement <16 x float> %9, i64 4
-  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
-  %29 = insertelement <16 x float> %25, float %28, i64 4
-  %30 = extractelement <16 x float> %7, i64 5
-  %31 = extractelement <16 x float> %9, i64 5
-  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
-  %33 = insertelement <16 x float> %29, float %32, i64 5
-  %34 = extractelement <16 x float> %7, i64 6
-  %35 = extractelement <16 x float> %9, i64 6
-  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
-  %37 = insertelement <16 x float> %33, float %36, i64 6
-  %38 = extractelement <16 x float> %7, i64 7
-  %39 = extractelement <16 x float> %9, i64 7
-  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
-  %41 = insertelement <16 x float> %37, float %40, i64 7
-  %42 = extractelement <16 x float> %7, i64 8
-  %43 = extractelement <16 x float> %9, i64 8
-  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
-  %45 = insertelement <16 x float> %41, float %44, i64 8
-  %46 = extractelement <16 x float> %7, i64 9
-  %47 = extractelement <16 x float> %9, i64 9
-  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
-  %49 = insertelement <16 x float> %45, float %48, i64 9
-  %50 = extractelement <16 x float> %7, i64 10
-  %51 = extractelement <16 x float> %9, i64 10
-  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
-  %53 = insertelement <16 x float> %49, float %52, i64 10
-  %54 = extractelement <16 x float> %7, i64 11
-  %55 = extractelement <16 x float> %9, i64 11
-  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
-  %57 = insertelement <16 x float> %53, float %56, i64 11
-  %58 = extractelement <16 x float> %7, i64 12
-  %59 = extractelement <16 x float> %9, i64 12
-  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
-  %61 = insertelement <16 x float> %57, float %60, i64 12
-  %62 = extractelement <16 x float> %7, i64 13
-  %63 = extractelement <16 x float> %9, i64 13
-  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
-  %65 = insertelement <16 x float> %61, float %64, i64 13
-  %66 = extractelement <16 x float> %7, i64 14
-  %67 = extractelement <16 x float> %9, i64 14
-  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
-  %69 = insertelement <16 x float> %65, float %68, i64 14
-  %70 = extractelement <16 x float> %7, i64 15
-  %71 = extractelement <16 x float> %9, i64 15
-  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
-  %73 = insertelement <16 x float> %69, float %72, i64 15
-  %74 = getelementptr float, ptr @buf6, i32 %3
-  store <16 x float> %73, ptr %74, align 4
-  %75 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-76:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare float @llvm.aie2p.inv(float)
-
-attributes #0 = { noinline }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.o
deleted file mode 100644
index 75208a26da516d5325930c6a9cff6520d51ab84d..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2048
zcma)7Z)h837=JF8G*@uDGm`R}tZ=KkRGXe{S8H($lQMN5x)5X=#edhdOSPEUk|yOu
z(7JTaF~_>0a3Vs#4ZlbNGD1J~!w``n6{8H?hblq+pwPjD2%GVF?%r!(ifj77`#yhu
zzx&<$+`aeSd(VwNFG&)3CV?Vk0CpRIjNqU{=!SOK;!<fP;Q$+PVE*^D>j)}fA6``f
z>dEEu8RZuT#_45A2R+ze0Gd|<V9?Doy0Xy(cz3)9_96k<sA+#Lt9EBlwGVfzcK9*X
zermsJpXgNW;=`)_Nmza6LFhwrdFa=94azbs{J1BW1dRJI{<}TsQ{o7(H<{AafdLv*
zMtBt0{Cnho<A)Y*<9&3{8u|ibU%@aNglAo@-)Yq@uByJZ)qqq!(?Fk;#ygtq;2pJ|
ze4>(0V1EgRP5t4DU9-dAAl_4X@#r<Tpu=piX&<^1bW>Lupjwh6)lw;9hi|mdTic!9
zS=2i#N9}M8`SBgrcN6j6*6TZmdgn^0cMJK(4tnc|KiOLEJnEgNe(T8J>G}PE_%5E`
zg;Jz?LB{p{iTrrwc>4d}M?+tVP`@X~TD*^oa<qC8`wetx)bH5%{q(db_50ErE%Yvx
zqSZ@s#12GIue<Wf?0xlceUDdOvsU<i3H`$^-w3kHYdC5+ZsNFwV?9_)`SdGyaqbG;
ztEIXgy>icr?T=cq{(V*~*=fbTXSG!5k-ZHo_G!e5eG#>;uI#gZ{}z#-JL}(F>ltG!
z)rbCy47BMVHtMPJXEIxq67F4bW?I7eelYyO*Or730N>XPzkH^Q^RI^~%NQ;4>zD$(
zzA40mA)(-h+s%(KPjA)&j>4>vKfq(+gz)5fXA^S|jy%s-@lXmu0LHfAuWrjHw&j%i
z0&dYj*TcWaVT^kfw|<1;b~mQ=N`5ykljHGkLfttMd3{LQ;-@k8_#X)MWug9)P~Q;h
z2Yg#P35-3Rd7-`})V~($>q5PN&q<;K9BmPOFYotDsuPdQ$Y*mIqmZ=<hG}M|vjel{
z++-$~$(vKBrsu&yz<6t>kTqUNo#>q_WG3H)(b1E~j!b8A*~!AOnN!7F7L2(<zL1$f
z|MqF}6UC`vnx3X<NW9DBOg?M&nS)@Or;E9{se@pQj*Rr11Kjt!>)OS96gj*-hwaDz
z)yKkh^hn8c926t$Sb}@cX*?i&JCF9FXC99@^vV+)*uaGLE^Q%N4`Gz&aEN+KnA6_%
v%+q@v3!>g#OccD3^k}<!dIvBjJ<rkv2<Bn5X7BY9OuS|EgwrK7=-&SUdbvNa

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.opt.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.opt.ll
deleted file mode 100644
index 849c352..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.opt.ll
+++ /dev/null
@@ -1,129 +0,0 @@
-; ModuleID = 'air_project/div_kernel_0_core_0_4.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf6 = external local_unnamed_addr global [256 x float]
-@buf7 = external local_unnamed_addr global [256 x float]
-@buf8 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nofree noinline nosync nounwind memory(none)
-define float @__aie2p_scalar_fdiv(float %0, float %1) local_unnamed_addr #0 {
-  %3 = tail call float @llvm.aie2p.inv(float %1)
-  %4 = fmul float %3, %0
-  ret float %4
-}
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #1
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #1
-
-; Function Attrs: noreturn nounwind
-define void @core_0_4() local_unnamed_addr #2 {
-  br label %1
-
-1:                                                ; preds = %76, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %74, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf8, i20 %4
-  %6 = load <16 x float>, ptr %5, align 64
-  %7 = getelementptr float, ptr @buf7, i20 %4
-  %8 = load <16 x float>, ptr %7, align 64
-  %9 = extractelement <16 x float> %6, i64 0
-  %10 = extractelement <16 x float> %8, i64 0
-  %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10)
-  %12 = insertelement <16 x float> poison, float %11, i64 0
-  %13 = extractelement <16 x float> %6, i64 1
-  %14 = extractelement <16 x float> %8, i64 1
-  %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14)
-  %16 = insertelement <16 x float> %12, float %15, i64 1
-  %17 = extractelement <16 x float> %6, i64 2
-  %18 = extractelement <16 x float> %8, i64 2
-  %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18)
-  %20 = insertelement <16 x float> %16, float %19, i64 2
-  %21 = extractelement <16 x float> %6, i64 3
-  %22 = extractelement <16 x float> %8, i64 3
-  %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22)
-  %24 = insertelement <16 x float> %20, float %23, i64 3
-  %25 = extractelement <16 x float> %6, i64 4
-  %26 = extractelement <16 x float> %8, i64 4
-  %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26)
-  %28 = insertelement <16 x float> %24, float %27, i64 4
-  %29 = extractelement <16 x float> %6, i64 5
-  %30 = extractelement <16 x float> %8, i64 5
-  %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30)
-  %32 = insertelement <16 x float> %28, float %31, i64 5
-  %33 = extractelement <16 x float> %6, i64 6
-  %34 = extractelement <16 x float> %8, i64 6
-  %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34)
-  %36 = insertelement <16 x float> %32, float %35, i64 6
-  %37 = extractelement <16 x float> %6, i64 7
-  %38 = extractelement <16 x float> %8, i64 7
-  %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38)
-  %40 = insertelement <16 x float> %36, float %39, i64 7
-  %41 = extractelement <16 x float> %6, i64 8
-  %42 = extractelement <16 x float> %8, i64 8
-  %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42)
-  %44 = insertelement <16 x float> %40, float %43, i64 8
-  %45 = extractelement <16 x float> %6, i64 9
-  %46 = extractelement <16 x float> %8, i64 9
-  %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46)
-  %48 = insertelement <16 x float> %44, float %47, i64 9
-  %49 = extractelement <16 x float> %6, i64 10
-  %50 = extractelement <16 x float> %8, i64 10
-  %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50)
-  %52 = insertelement <16 x float> %48, float %51, i64 10
-  %53 = extractelement <16 x float> %6, i64 11
-  %54 = extractelement <16 x float> %8, i64 11
-  %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54)
-  %56 = insertelement <16 x float> %52, float %55, i64 11
-  %57 = extractelement <16 x float> %6, i64 12
-  %58 = extractelement <16 x float> %8, i64 12
-  %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58)
-  %60 = insertelement <16 x float> %56, float %59, i64 12
-  %61 = extractelement <16 x float> %6, i64 13
-  %62 = extractelement <16 x float> %8, i64 13
-  %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62)
-  %64 = insertelement <16 x float> %60, float %63, i64 13
-  %65 = extractelement <16 x float> %6, i64 14
-  %66 = extractelement <16 x float> %8, i64 14
-  %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66)
-  %68 = insertelement <16 x float> %64, float %67, i64 14
-  %69 = extractelement <16 x float> %6, i64 15
-  %70 = extractelement <16 x float> %8, i64 15
-  %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70)
-  %72 = insertelement <16 x float> %68, float %71, i64 15
-  %73 = getelementptr float, ptr @buf6, i20 %4
-  store <16 x float> %72, ptr %73, align 64
-  %74 = add nuw nsw i32 %3, 16
-  %75 = icmp ult i32 %3, 240
-  br i1 %75, label %2, label %76, !llvm.loop !1
-
-76:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 53, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: nofree nosync nounwind memory(none)
-declare float @llvm.aie2p.inv(float) #3
-
-attributes #0 = { nofree noinline nosync nounwind memory(none) }
-attributes #1 = { nounwind }
-attributes #2 = { noreturn nounwind }
-attributes #3 = { nofree nosync nounwind memory(none) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll
deleted file mode 100644
index 9a0f789..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_4.peanohack.ll
+++ /dev/null
@@ -1,158 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-; Function Attrs: noinline
-define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
-  %3 = call float @llvm.aie2p.inv(float %1)
-  %4 = fmul float %0, %3
-  ret float %4
-}
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_4() {
-  br label %1
-
-1:                                                ; preds = %76, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %76
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf8, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = getelementptr float, ptr @buf7, i32 %3
-  %9 = load <16 x float>, ptr %8
-  %10 = extractelement <16 x float> %7, i64 0
-  %11 = extractelement <16 x float> %9, i64 0
-  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
-  %13 = insertelement <16 x float> poison, float %12, i64 0
-  %14 = extractelement <16 x float> %7, i64 1
-  %15 = extractelement <16 x float> %9, i64 1
-  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
-  %17 = insertelement <16 x float> %13, float %16, i64 1
-  %18 = extractelement <16 x float> %7, i64 2
-  %19 = extractelement <16 x float> %9, i64 2
-  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
-  %21 = insertelement <16 x float> %17, float %20, i64 2
-  %22 = extractelement <16 x float> %7, i64 3
-  %23 = extractelement <16 x float> %9, i64 3
-  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
-  %25 = insertelement <16 x float> %21, float %24, i64 3
-  %26 = extractelement <16 x float> %7, i64 4
-  %27 = extractelement <16 x float> %9, i64 4
-  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
-  %29 = insertelement <16 x float> %25, float %28, i64 4
-  %30 = extractelement <16 x float> %7, i64 5
-  %31 = extractelement <16 x float> %9, i64 5
-  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
-  %33 = insertelement <16 x float> %29, float %32, i64 5
-  %34 = extractelement <16 x float> %7, i64 6
-  %35 = extractelement <16 x float> %9, i64 6
-  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
-  %37 = insertelement <16 x float> %33, float %36, i64 6
-  %38 = extractelement <16 x float> %7, i64 7
-  %39 = extractelement <16 x float> %9, i64 7
-  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
-  %41 = insertelement <16 x float> %37, float %40, i64 7
-  %42 = extractelement <16 x float> %7, i64 8
-  %43 = extractelement <16 x float> %9, i64 8
-  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
-  %45 = insertelement <16 x float> %41, float %44, i64 8
-  %46 = extractelement <16 x float> %7, i64 9
-  %47 = extractelement <16 x float> %9, i64 9
-  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
-  %49 = insertelement <16 x float> %45, float %48, i64 9
-  %50 = extractelement <16 x float> %7, i64 10
-  %51 = extractelement <16 x float> %9, i64 10
-  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
-  %53 = insertelement <16 x float> %49, float %52, i64 10
-  %54 = extractelement <16 x float> %7, i64 11
-  %55 = extractelement <16 x float> %9, i64 11
-  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
-  %57 = insertelement <16 x float> %53, float %56, i64 11
-  %58 = extractelement <16 x float> %7, i64 12
-  %59 = extractelement <16 x float> %9, i64 12
-  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
-  %61 = insertelement <16 x float> %57, float %60, i64 12
-  %62 = extractelement <16 x float> %7, i64 13
-  %63 = extractelement <16 x float> %9, i64 13
-  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
-  %65 = insertelement <16 x float> %61, float %64, i64 13
-  %66 = extractelement <16 x float> %7, i64 14
-  %67 = extractelement <16 x float> %9, i64 14
-  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
-  %69 = insertelement <16 x float> %65, float %68, i64 14
-  %70 = extractelement <16 x float> %7, i64 15
-  %71 = extractelement <16 x float> %9, i64 15
-  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
-  %73 = insertelement <16 x float> %69, float %72, i64 15
-  %74 = getelementptr float, ptr @buf6, i32 %3
-  store <16 x float> %73, ptr %74
-  %75 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-76:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare float @llvm.aie2p.inv(float)
-
-attributes #0 = { noinline }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.elf
deleted file mode 100755
index 8162c282e6181e1320623e660451ca081a79c550..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4132
zcma)9eQZ<L6+h2*{0t)Xb-s3+KzW3ZnBpXMk|r@7okOz<F%!WwP`05tj_nYOd@Xh|
zGb!Vhqyq}50kzt#I<%}(H%*!}XWFtTe|c*_v}!j{gG9AX@+i|bZPkL*X>J<G&hdNC
zF@Yu=>Arh@=bU@)Ip==(`s7#rFG!LE2ARN?Lk%e0U<WW`-vA1jzzH%~fUH$)22HF3
zpjLnahV9Vb;$cIV-&^y6&RZl97HEa^IytA^u@T_*ezOAeE|~EF{1AxamKWJ9PHh9R
zCxubnX_-i#vi;Q<^=(N3Mb2<71YOF(NdY0z$jlPJ4?1?i6Nsg(cX8Ql7n@|e*yFT|
z((QKf%oe*iyvZ(xp0tbCO6|KHP?wS**?4=HLGlAhy;&%?0P^mSCH24SWRuN>zS2wF
zR#}D&61VNuITy1On(w=wRqwh$D8Mp`%K2}gnK!^!G+WFaqZ30ZyXktWKuVs<VBaEj
zlr8c4PwjVFo}N7D!~A@JH3!NMtDuo6y@vW-lV9IAr3n-mke9@>pXx=_JOfORn#-q0
zN6SU&du#Z+{&@agLVqurD@5rW+PhYX?-uH-*N*Qr`a3;}{yswcnpOOzQD3{Zzq9D?
zEaiI#?N_VcKT!V|^*c9OK7G!N@%<a^U6bE9_~`Y>DC6an?~(R3*5iV?V)_E+TTsR*
z-}a6#^2aJD-*0uV;qR5vis@I(<)WY*{W&L(4t$|MjBn@UcM>!DekuBlUwlvI7q4TR
z!*&bXN7&NxT$@R`w20rzqI+{~mD?`eOH^&CNK|<?CaPLCC8}=lxwfLMPh=8Rua_sP
zeo>M5`OL<|-`_>epEuq4<J7KpJ~LhXNqL=BIg`25mVCp^$0Xl?zHxywADtgMRUeqF
zEl>oY?`sYco}Lc<^l;k*a$^7;G?Lpumt%emY$j0j^Dsr85u%%WqVNSK3RQMg`R!!m
zXGaxpawa3nRJLF#1@-VvSxLL3SFNpS;rBas2Z8+(HYA_R?7q14pT*6Plo#=|J5A}$
z$u~boZl6hk<c`me{_}jYW?>=zGN`Y;FWX@%@s?zPjoR(;HDF_SJD4a$owD(rk<y!u
z_<R_>WaKEezr$u2I3)$mrI~i<?t%5-oS$jG-(Bsb8`QP)6FlXv-(Rp+-V@ccWn%s+
z6ACu<T_(PHtq5y*3nR-B&W9+IyC3j|SiU8$s70tBU;5MHk`d3P@IF2Ewi5Kk&CQ0Z
zR&?ZpRs9wC7*`~4y)h0u7Dev{CL%V6Ya26d0r8~Yfwkx`yET8JY>bI&3!52V>c{G0
zOsz0s230gz9b(rtwDpbS4G$c?1ftOA5CxdyTPe2M{9Fb`_1{)((Q>HpQx46>oHX-l
z<3)j*9Y5=|Wyj;#q#18ujVS-rNz5T_wi&WD{FS~~bm@>!0kB%c@%6YDe7DTFiW+J8
za_5BX`ssPe3PMsHV$aJ=?7<2Yjxj#J-yyco;*Ldp7^{`73fQPB8tL`GjAyr6^!<c$
zq?coUI6mX|Pn7(D2?kV88t*hMj|WRGXFOq)Z*Ini{XMXsFkkXFz8>S#{n77fmK0E)
z%kUdlWbRGNeDWQ<OWv3HcW=o2&`p_pK9srRFY?2`$2h+aS2IV54_Y3qKgO?0R((F<
zYI3-)RzRhM^<;cu%)u9)!rHY^^#@G4pH@(;>Pb<3l}(}VAJZAZ9mw522mjd2gp?{4
zoAx_+)l;kPw~SKCF<<oX=<}uPDZT<@eR$I3aQ)=GB;OY!Z!$5m%^~c5CTd$9V)OG%
z*wqrT`D}?8nQ-vrd53W0x-(OW@L@mF!FX~n6R?+2om|p-sn{g;w4es15kSjR6t55H
z1*6wvM!WwoJ8}7KyJ=wl*XHEuj5xL^yG$0Jna%^ohhubmV}DSgxOaB>(FOeVw;!=S
z{%GIv$lz|p?{8H+-Wqoep6^gku<w{M92ty7`}=gM;_Qyc2V%QxYmY_a-9w!<q5j_5
z_GnMEFHzgmGu&Gpj7F5s`cT;I4cFI4g5i3%x3Qr)90@gr8^ZOT&ic-VkjEPddmFmE
zJ1M#cNy>v3xVu!0IF*x0P=S0p4{y!!^m@9xL@%cfdZ1=W!uKrN<FiU53BHU>GEd)%
zIF+l81P2hia}QVvj*vbN2M{ZH_T7kCo}QLzAP=8F+>vL03UM+IpF@mKU7aL&6*)Bz
zUq(EUhks_6Vo&-0Y6bosVmH3gHIm>8G77eX*or`c53zS+J;t#SzKxh-PMJ0QFXX|Z
z1<1e(cM1EJ@&5>`%XkrC3gaUm<gY+qKt;!eS-fn&12M&C*n1EYPekJz5i40l4*&_C
zUx5#=z#S{_zzRIF0-r$)7HvaNeisP)mhmOR>N19!cq9=A+UC8HK4fAidgH-P?7L&}
zL5<7x!O-zQEc*RO48ZR{a-cOD?1_Zp2l~T9JrSt!@7?POc=WywdWU*qUG+7gtiT<3
zM(^wOzDe)v41ve3OFV|D$FT7lnmU6qBz1<QuEFrvY%pF5AsnR;LW6NnO^8D28tMxL
zIwQxTeI5t|@G#d61Y)6JPjE2M6^;(0Op6;r{ezJJh21&S<?&>{Zu)vP6ZDHrEFK(;
zLm-Aehzkq{dxj!`)&m4X$05)gjP?bhebG1&2Z2z$e=uf@B6%P<I2b%(NHle%FAUj1
zx-yHy`6D$zatx@MK27v#%<eoNF7V?KZ3Sw^aaNJ1|Gx$?ouS=Lc5WRj_`CT%ou|(5
zL-wXTxj?%K6wq}<&tBSfWP^7%;Bm}Rb0I)e4r%@fR$|My>qomE(PTOq?9+&eS9Q4+
M|5ekRhTiD^55T1-w*UYD

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script
deleted file mode 100644
index 51c13db..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ld.script
+++ /dev/null
@@ -1,72 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-. = 0x40400;
-buf8 = .;
-. += 0x400;
-. = 0x44000;
-buf7 = .;
-. += 0x400;
-. = 0x48000;
-buf6 = .;
-. += 0x400;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-/* No tile with memory exists to the north. */
-. = 0x60000;
-. += 0x10000;
-. = 0x70400;
-buf11 = .;
-. += 0x400;
-. = 0x74000;
-buf10 = .;
-. += 0x400;
-. = 0x78000;
-buf9 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_5);
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll
deleted file mode 100644
index e652b65..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.ll
+++ /dev/null
@@ -1,158 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-; Function Attrs: noinline
-define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
-  %3 = call float @llvm.aie2p.inv(float %1)
-  %4 = fmul float %0, %3
-  ret float %4
-}
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_5() {
-  br label %1
-
-1:                                                ; preds = %76, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %76
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf11, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = getelementptr float, ptr @buf10, i32 %3
-  %9 = load <16 x float>, ptr %8, align 4
-  %10 = extractelement <16 x float> %7, i64 0
-  %11 = extractelement <16 x float> %9, i64 0
-  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
-  %13 = insertelement <16 x float> poison, float %12, i64 0
-  %14 = extractelement <16 x float> %7, i64 1
-  %15 = extractelement <16 x float> %9, i64 1
-  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
-  %17 = insertelement <16 x float> %13, float %16, i64 1
-  %18 = extractelement <16 x float> %7, i64 2
-  %19 = extractelement <16 x float> %9, i64 2
-  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
-  %21 = insertelement <16 x float> %17, float %20, i64 2
-  %22 = extractelement <16 x float> %7, i64 3
-  %23 = extractelement <16 x float> %9, i64 3
-  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
-  %25 = insertelement <16 x float> %21, float %24, i64 3
-  %26 = extractelement <16 x float> %7, i64 4
-  %27 = extractelement <16 x float> %9, i64 4
-  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
-  %29 = insertelement <16 x float> %25, float %28, i64 4
-  %30 = extractelement <16 x float> %7, i64 5
-  %31 = extractelement <16 x float> %9, i64 5
-  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
-  %33 = insertelement <16 x float> %29, float %32, i64 5
-  %34 = extractelement <16 x float> %7, i64 6
-  %35 = extractelement <16 x float> %9, i64 6
-  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
-  %37 = insertelement <16 x float> %33, float %36, i64 6
-  %38 = extractelement <16 x float> %7, i64 7
-  %39 = extractelement <16 x float> %9, i64 7
-  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
-  %41 = insertelement <16 x float> %37, float %40, i64 7
-  %42 = extractelement <16 x float> %7, i64 8
-  %43 = extractelement <16 x float> %9, i64 8
-  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
-  %45 = insertelement <16 x float> %41, float %44, i64 8
-  %46 = extractelement <16 x float> %7, i64 9
-  %47 = extractelement <16 x float> %9, i64 9
-  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
-  %49 = insertelement <16 x float> %45, float %48, i64 9
-  %50 = extractelement <16 x float> %7, i64 10
-  %51 = extractelement <16 x float> %9, i64 10
-  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
-  %53 = insertelement <16 x float> %49, float %52, i64 10
-  %54 = extractelement <16 x float> %7, i64 11
-  %55 = extractelement <16 x float> %9, i64 11
-  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
-  %57 = insertelement <16 x float> %53, float %56, i64 11
-  %58 = extractelement <16 x float> %7, i64 12
-  %59 = extractelement <16 x float> %9, i64 12
-  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
-  %61 = insertelement <16 x float> %57, float %60, i64 12
-  %62 = extractelement <16 x float> %7, i64 13
-  %63 = extractelement <16 x float> %9, i64 13
-  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
-  %65 = insertelement <16 x float> %61, float %64, i64 13
-  %66 = extractelement <16 x float> %7, i64 14
-  %67 = extractelement <16 x float> %9, i64 14
-  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
-  %69 = insertelement <16 x float> %65, float %68, i64 14
-  %70 = extractelement <16 x float> %7, i64 15
-  %71 = extractelement <16 x float> %9, i64 15
-  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
-  %73 = insertelement <16 x float> %69, float %72, i64 15
-  %74 = getelementptr float, ptr @buf9, i32 %3
-  store <16 x float> %73, ptr %74, align 4
-  %75 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-76:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare float @llvm.aie2p.inv(float)
-
-attributes #0 = { noinline }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.o
deleted file mode 100644
index 78cc76657161c0618a24de8080e832261028cff6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2052
zcma)7VQ5=b6h1F6X<otYJtHaCWQDitmTJ>y+ihh!hDn*aA6*FIMsezE+NG?R+1e)M
z=?|?<=bUq_8z@hl(0{T&l7NiRU;QB>GNfWA1NWm!P=6?NFd@QbJm<ap+Dmaw58Qjs
z`ObI0-1FXjH}5_-_`D=Z;F$!9kO5q60g$n{-yw8DD{OEHT1q&DwKy>Ud)n7nRKPyG
ztOC@Mi={c`7pIKVNl6DiSZ4s52LUkX<Qc83Hvrxl?t-02KsIXHpUbM<9#rkaovIz)
zquK{{tM>7B)h;}$+8>A2XC8(gB$xVsozb8q!|cr+!6abZgYn<3L7x)G@@j)A9c>t(
zF=c>9am~L&{x^PT_747z4q9EGW$Y_B$o9jtF4yn0Y8RGO-^y}8D$Ui=C#B)G2HSU6
zttFo<rxVy;!eLW?xMSDs@V8j+D!+L2x(nzq8En{x?grh|RR*XO<w&JijM(8DP4rf_
zr*{_h&dO0cTt$9-i~ZfgdUx~vokP8IMbx{Ed}9l}RjfbNT<<*Uou__l$lvPu{eksu
zJiiOYNaccz`}-65@$yUQ|9>BKeIY{qo)~KKJ}%19%0=uq(56wpL&Fc!)1uVx%cq;@
zT`EQ^m*j{Yh@f6)`IX5B>f!#LD8FVc@$(Y;hh4rAWS3WPRB_zGaT~{Ku$uDeSMK55
z5xQSZbv%CMz7^XYwPL-ytXQ($iv7T<snBCP>sIX3h!wjQwZ2%|W&QpgR(@`;eSf`c
zh%Hqf`76@bqJL1YrAnX4Y+g#Zzlt-{63+L7;g7zSB!mF?xn}s~Gi8{6Jxp1|Xr8ZQ
z3UGW~hzI=w;D<YnIn2}RSODI}cv8q8;xTbTc=NouUUM&w(>!0sODO~a_;6EwVN*V|
zDW}vIa6toI7ylv;V%)8`_z{X*-S{JPlHZL><aqp>fIC+r$NQxXei~zs|Gt1P3iwX~
zz9!&%eH%Inj6Izh0bdaCZv=c*!1MT=B)Y)S=FxZae!nD~cx1*@HkUE-Su1atW@bFw
zH)&3fW^$P+bL`ak3^)}q-k8W|jTchKyQlM+(br*c@Winr<C$D`G=FU3R3VoIV>&;T
z&y1jd>kRpk!r0TI_UOb^*6cA4fN7p7<fg~=fiXBR&};T_-%Bfc-MPoz*|z4R$l@(J
zd_VqYeIl%+$4aK-pcu=JCAjyT#sjKv=A}IndXGmOdhH1uWNVnv`K4n-`=J`;BRE98
z1<dLEdgkf9ih!th4-*9!k{%sdPj4^Aq~}?h3W9kx+OzkHsZ6{@^i-!MH0Zbg1Bs+S
AEdT%j

diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.opt.ll
deleted file mode 100644
index 6e22dce..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.opt.ll
+++ /dev/null
@@ -1,129 +0,0 @@
-; ModuleID = 'air_project/div_kernel_0_core_0_5.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf9 = external local_unnamed_addr global [256 x float]
-@buf10 = external local_unnamed_addr global [256 x float]
-@buf11 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nofree noinline nosync nounwind memory(none)
-define float @__aie2p_scalar_fdiv(float %0, float %1) local_unnamed_addr #0 {
-  %3 = tail call float @llvm.aie2p.inv(float %1)
-  %4 = fmul float %3, %0
-  ret float %4
-}
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #1
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #1
-
-; Function Attrs: noreturn nounwind
-define void @core_0_5() local_unnamed_addr #2 {
-  br label %1
-
-1:                                                ; preds = %76, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %74, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf11, i20 %4
-  %6 = load <16 x float>, ptr %5, align 64
-  %7 = getelementptr float, ptr @buf10, i20 %4
-  %8 = load <16 x float>, ptr %7, align 64
-  %9 = extractelement <16 x float> %6, i64 0
-  %10 = extractelement <16 x float> %8, i64 0
-  %11 = tail call float @__aie2p_scalar_fdiv(float %9, float %10)
-  %12 = insertelement <16 x float> poison, float %11, i64 0
-  %13 = extractelement <16 x float> %6, i64 1
-  %14 = extractelement <16 x float> %8, i64 1
-  %15 = tail call float @__aie2p_scalar_fdiv(float %13, float %14)
-  %16 = insertelement <16 x float> %12, float %15, i64 1
-  %17 = extractelement <16 x float> %6, i64 2
-  %18 = extractelement <16 x float> %8, i64 2
-  %19 = tail call float @__aie2p_scalar_fdiv(float %17, float %18)
-  %20 = insertelement <16 x float> %16, float %19, i64 2
-  %21 = extractelement <16 x float> %6, i64 3
-  %22 = extractelement <16 x float> %8, i64 3
-  %23 = tail call float @__aie2p_scalar_fdiv(float %21, float %22)
-  %24 = insertelement <16 x float> %20, float %23, i64 3
-  %25 = extractelement <16 x float> %6, i64 4
-  %26 = extractelement <16 x float> %8, i64 4
-  %27 = tail call float @__aie2p_scalar_fdiv(float %25, float %26)
-  %28 = insertelement <16 x float> %24, float %27, i64 4
-  %29 = extractelement <16 x float> %6, i64 5
-  %30 = extractelement <16 x float> %8, i64 5
-  %31 = tail call float @__aie2p_scalar_fdiv(float %29, float %30)
-  %32 = insertelement <16 x float> %28, float %31, i64 5
-  %33 = extractelement <16 x float> %6, i64 6
-  %34 = extractelement <16 x float> %8, i64 6
-  %35 = tail call float @__aie2p_scalar_fdiv(float %33, float %34)
-  %36 = insertelement <16 x float> %32, float %35, i64 6
-  %37 = extractelement <16 x float> %6, i64 7
-  %38 = extractelement <16 x float> %8, i64 7
-  %39 = tail call float @__aie2p_scalar_fdiv(float %37, float %38)
-  %40 = insertelement <16 x float> %36, float %39, i64 7
-  %41 = extractelement <16 x float> %6, i64 8
-  %42 = extractelement <16 x float> %8, i64 8
-  %43 = tail call float @__aie2p_scalar_fdiv(float %41, float %42)
-  %44 = insertelement <16 x float> %40, float %43, i64 8
-  %45 = extractelement <16 x float> %6, i64 9
-  %46 = extractelement <16 x float> %8, i64 9
-  %47 = tail call float @__aie2p_scalar_fdiv(float %45, float %46)
-  %48 = insertelement <16 x float> %44, float %47, i64 9
-  %49 = extractelement <16 x float> %6, i64 10
-  %50 = extractelement <16 x float> %8, i64 10
-  %51 = tail call float @__aie2p_scalar_fdiv(float %49, float %50)
-  %52 = insertelement <16 x float> %48, float %51, i64 10
-  %53 = extractelement <16 x float> %6, i64 11
-  %54 = extractelement <16 x float> %8, i64 11
-  %55 = tail call float @__aie2p_scalar_fdiv(float %53, float %54)
-  %56 = insertelement <16 x float> %52, float %55, i64 11
-  %57 = extractelement <16 x float> %6, i64 12
-  %58 = extractelement <16 x float> %8, i64 12
-  %59 = tail call float @__aie2p_scalar_fdiv(float %57, float %58)
-  %60 = insertelement <16 x float> %56, float %59, i64 12
-  %61 = extractelement <16 x float> %6, i64 13
-  %62 = extractelement <16 x float> %8, i64 13
-  %63 = tail call float @__aie2p_scalar_fdiv(float %61, float %62)
-  %64 = insertelement <16 x float> %60, float %63, i64 13
-  %65 = extractelement <16 x float> %6, i64 14
-  %66 = extractelement <16 x float> %8, i64 14
-  %67 = tail call float @__aie2p_scalar_fdiv(float %65, float %66)
-  %68 = insertelement <16 x float> %64, float %67, i64 14
-  %69 = extractelement <16 x float> %6, i64 15
-  %70 = extractelement <16 x float> %8, i64 15
-  %71 = tail call float @__aie2p_scalar_fdiv(float %69, float %70)
-  %72 = insertelement <16 x float> %68, float %71, i64 15
-  %73 = getelementptr float, ptr @buf9, i20 %4
-  store <16 x float> %72, ptr %73, align 64
-  %74 = add nuw nsw i32 %3, 16
-  %75 = icmp ult i32 %3, 240
-  br i1 %75, label %2, label %76, !llvm.loop !1
-
-76:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 53, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: nofree nosync nounwind memory(none)
-declare float @llvm.aie2p.inv(float) #3
-
-attributes #0 = { nofree noinline nosync nounwind memory(none) }
-attributes #1 = { nounwind }
-attributes #2 = { noreturn nounwind }
-attributes #3 = { nofree nosync nounwind memory(none) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll
deleted file mode 100644
index 5ef9373..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_core_0_5.peanohack.ll
+++ /dev/null
@@ -1,158 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-; Function Attrs: noinline
-define float @__aie2p_scalar_fdiv(float %0, float %1) #0 {
-  %3 = call float @llvm.aie2p.inv(float %1)
-  %4 = fmul float %0, %3
-  ret float %4
-}
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_5() {
-  br label %1
-
-1:                                                ; preds = %76, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %75, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %76
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf11, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = getelementptr float, ptr @buf10, i32 %3
-  %9 = load <16 x float>, ptr %8
-  %10 = extractelement <16 x float> %7, i64 0
-  %11 = extractelement <16 x float> %9, i64 0
-  %12 = call float @__aie2p_scalar_fdiv(float %10, float %11)
-  %13 = insertelement <16 x float> poison, float %12, i64 0
-  %14 = extractelement <16 x float> %7, i64 1
-  %15 = extractelement <16 x float> %9, i64 1
-  %16 = call float @__aie2p_scalar_fdiv(float %14, float %15)
-  %17 = insertelement <16 x float> %13, float %16, i64 1
-  %18 = extractelement <16 x float> %7, i64 2
-  %19 = extractelement <16 x float> %9, i64 2
-  %20 = call float @__aie2p_scalar_fdiv(float %18, float %19)
-  %21 = insertelement <16 x float> %17, float %20, i64 2
-  %22 = extractelement <16 x float> %7, i64 3
-  %23 = extractelement <16 x float> %9, i64 3
-  %24 = call float @__aie2p_scalar_fdiv(float %22, float %23)
-  %25 = insertelement <16 x float> %21, float %24, i64 3
-  %26 = extractelement <16 x float> %7, i64 4
-  %27 = extractelement <16 x float> %9, i64 4
-  %28 = call float @__aie2p_scalar_fdiv(float %26, float %27)
-  %29 = insertelement <16 x float> %25, float %28, i64 4
-  %30 = extractelement <16 x float> %7, i64 5
-  %31 = extractelement <16 x float> %9, i64 5
-  %32 = call float @__aie2p_scalar_fdiv(float %30, float %31)
-  %33 = insertelement <16 x float> %29, float %32, i64 5
-  %34 = extractelement <16 x float> %7, i64 6
-  %35 = extractelement <16 x float> %9, i64 6
-  %36 = call float @__aie2p_scalar_fdiv(float %34, float %35)
-  %37 = insertelement <16 x float> %33, float %36, i64 6
-  %38 = extractelement <16 x float> %7, i64 7
-  %39 = extractelement <16 x float> %9, i64 7
-  %40 = call float @__aie2p_scalar_fdiv(float %38, float %39)
-  %41 = insertelement <16 x float> %37, float %40, i64 7
-  %42 = extractelement <16 x float> %7, i64 8
-  %43 = extractelement <16 x float> %9, i64 8
-  %44 = call float @__aie2p_scalar_fdiv(float %42, float %43)
-  %45 = insertelement <16 x float> %41, float %44, i64 8
-  %46 = extractelement <16 x float> %7, i64 9
-  %47 = extractelement <16 x float> %9, i64 9
-  %48 = call float @__aie2p_scalar_fdiv(float %46, float %47)
-  %49 = insertelement <16 x float> %45, float %48, i64 9
-  %50 = extractelement <16 x float> %7, i64 10
-  %51 = extractelement <16 x float> %9, i64 10
-  %52 = call float @__aie2p_scalar_fdiv(float %50, float %51)
-  %53 = insertelement <16 x float> %49, float %52, i64 10
-  %54 = extractelement <16 x float> %7, i64 11
-  %55 = extractelement <16 x float> %9, i64 11
-  %56 = call float @__aie2p_scalar_fdiv(float %54, float %55)
-  %57 = insertelement <16 x float> %53, float %56, i64 11
-  %58 = extractelement <16 x float> %7, i64 12
-  %59 = extractelement <16 x float> %9, i64 12
-  %60 = call float @__aie2p_scalar_fdiv(float %58, float %59)
-  %61 = insertelement <16 x float> %57, float %60, i64 12
-  %62 = extractelement <16 x float> %7, i64 13
-  %63 = extractelement <16 x float> %9, i64 13
-  %64 = call float @__aie2p_scalar_fdiv(float %62, float %63)
-  %65 = insertelement <16 x float> %61, float %64, i64 13
-  %66 = extractelement <16 x float> %7, i64 14
-  %67 = extractelement <16 x float> %9, i64 14
-  %68 = call float @__aie2p_scalar_fdiv(float %66, float %67)
-  %69 = insertelement <16 x float> %65, float %68, i64 14
-  %70 = extractelement <16 x float> %7, i64 15
-  %71 = extractelement <16 x float> %9, i64 15
-  %72 = call float @__aie2p_scalar_fdiv(float %70, float %71)
-  %73 = insertelement <16 x float> %69, float %72, i64 15
-  %74 = getelementptr float, ptr @buf9, i32 %3
-  store <16 x float> %73, ptr %74
-  %75 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-76:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare float @llvm.aie2p.inv(float)
-
-attributes #0 = { noinline }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_design.bif b/examples/elementwise_arith/air_project/div_kernel_0_design.bif
deleted file mode 100644
index 11c5e21..0000000
--- a/examples/elementwise_arith/air_project/div_kernel_0_design.bif
+++ /dev/null
@@ -1,10 +0,0 @@
-all:
-{
-  id_code = 0x14ca8093
-  extended_id_code = 0x01
-  image
-  {
-    name=aie_image, id=0x1c000000
-    { type=cdo file=air_project/div_kernel_0_aie_cdo_elfs.bin file=air_project/div_kernel_0_aie_cdo_init.bin file=air_project/div_kernel_0_aie_cdo_enable.bin }
-  }
-}
diff --git a/examples/elementwise_arith/air_project/div_kernel_0_div_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/div_kernel_0_div_kernel_0_sequence.bin
deleted file mode 100644
index f2eb383b7eb903e8ad8809b7d44fb0a4d660bfb6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3248
zcmcJQ-Aw~A5QNu<g?K=M2OfApq6JF0GAPG%L<ue-N=1yF_0OCH8j;via_gU++3a_@
zvk>neg4w5;eLtEZyo^(@j}krmd|WcZ&8OQJf_lyu+9e|Yukmso`>#0IXQ!}T>04{-
zdWRz5+iJIiZ@khs7OO7qzIV<49O}M3Vd35Et*zr4+n!k}H6?s5ZHHP2b8iNd9f!uq
z2%ELeWFG=0l9+H#Mw;SV&vYiNQGA=m?@TyH^<^fmv5ZNwd1Or5@6ws1SwdvOIT;Fy
zFEe2cjS1(dzRbimmN6++jgb*5y%`xL$~=|+C!AAeiZ3%^4UGxssJ_g^HI^|MPc*|&
zYb<A{(Vr%DO=9vtUuMD@8WYaRC|J1{n2BpFW3p2})f#K>`xk4J-t?`*yLP<eIY;$n
zCaj?`;T+YMnYhL>CTa8!jZ|yI(P^-T#)NY+)Rg;`nXrb&gmYA1X5t#lnB*HUVGWH5
W=cv75Caj?`;T+YMnYhL+nEU|x-i@&U

diff --git a/examples/elementwise_arith/air_project/empty_0.pdi b/examples/elementwise_arith/air_project/empty_0.pdi
deleted file mode 100644
index a2347424a644d017f5e8ac814673b9061a6becd0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 368
zcmcc1z`)>QtR&b`o)&s$_o|r;3@i+cKrs~{2GJ%!3<8rIPKiKR5Yp2nAV5Jtfx!S-
z5XuJWBL`Gy9h?QS4_SX=W@>z9Zen^W!ax}W6O}bv!%~(5C<MdEvj3qV1SrVHz`%rT
k2eLX!*nR*1|78Ih;OycLw1I(v8OYwcbHjf@pk*)&03)a&DF6Tf

diff --git a/examples/elementwise_arith/air_project/empty_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/empty_0_aie_cdo_elfs.bin
deleted file mode 100644
index cba6b8778c42200ab6ec35c68cb3586f8fb4e055..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ

diff --git a/examples/elementwise_arith/air_project/empty_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/empty_0_aie_cdo_enable.bin
deleted file mode 100644
index cba6b8778c42200ab6ec35c68cb3586f8fb4e055..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ

diff --git a/examples/elementwise_arith/air_project/empty_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/empty_0_aie_cdo_init.bin
deleted file mode 100644
index cba6b8778c42200ab6ec35c68cb3586f8fb4e055..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ

diff --git a/examples/elementwise_arith/air_project/empty_0_design.bif b/examples/elementwise_arith/air_project/empty_0_design.bif
deleted file mode 100644
index b22ae3c..0000000
--- a/examples/elementwise_arith/air_project/empty_0_design.bif
+++ /dev/null
@@ -1,10 +0,0 @@
-all:
-{
-  id_code = 0x14ca8093
-  extended_id_code = 0x01
-  image
-  {
-    name=aie_image, id=0x1c000000
-    { type=cdo file=air_project/empty_0_aie_cdo_elfs.bin file=air_project/empty_0_aie_cdo_init.bin file=air_project/empty_0_aie_cdo_enable.bin }
-  }
-}
diff --git a/examples/elementwise_arith/air_project/full_elf_config.json b/examples/elementwise_arith/air_project/full_elf_config.json
deleted file mode 100644
index eab4fdb..0000000
--- a/examples/elementwise_arith/air_project/full_elf_config.json
+++ /dev/null
@@ -1,134 +0,0 @@
-{
-  "xrt-kernels": [
-    {
-      "PDIs": [
-        {
-          "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/empty_0.pdi",
-          "id": 1
-        },
-        {
-          "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/square_kernel_0.pdi",
-          "id": 2
-        },
-        {
-          "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/main.pdi",
-          "id": 3
-        }
-      ],
-      "arguments": [
-        {
-          "name": "arg_0",
-          "offset": "0x0",
-          "type": "char *"
-        },
-        {
-          "name": "arg_1",
-          "offset": "0x8",
-          "type": "char *"
-        },
-        {
-          "name": "arg_2",
-          "offset": "0x10",
-          "type": "char *"
-        },
-        {
-          "name": "arg_3",
-          "offset": "0x18",
-          "type": "char *"
-        },
-        {
-          "name": "arg_4",
-          "offset": "0x20",
-          "type": "char *"
-        },
-        {
-          "name": "arg_5",
-          "offset": "0x28",
-          "type": "char *"
-        },
-        {
-          "name": "arg_6",
-          "offset": "0x30",
-          "type": "char *"
-        },
-        {
-          "name": "arg_7",
-          "offset": "0x38",
-          "type": "char *"
-        }
-      ],
-      "instance": [
-        {
-          "TXN_ctrl_code_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin",
-          "id": "square_kernel_0_sequence"
-        }
-      ],
-      "name": "square_kernel_0"
-    },
-    {
-      "PDIs": [
-        {
-          "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/empty_0.pdi",
-          "id": 1
-        },
-        {
-          "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/square_kernel_0.pdi",
-          "id": 2
-        },
-        {
-          "PDI_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/main.pdi",
-          "id": 3
-        }
-      ],
-      "arguments": [
-        {
-          "name": "arg_0",
-          "offset": "0x0",
-          "type": "char *"
-        },
-        {
-          "name": "arg_1",
-          "offset": "0x8",
-          "type": "char *"
-        },
-        {
-          "name": "arg_2",
-          "offset": "0x10",
-          "type": "char *"
-        },
-        {
-          "name": "arg_3",
-          "offset": "0x18",
-          "type": "char *"
-        },
-        {
-          "name": "arg_4",
-          "offset": "0x20",
-          "type": "char *"
-        },
-        {
-          "name": "arg_5",
-          "offset": "0x28",
-          "type": "char *"
-        },
-        {
-          "name": "arg_6",
-          "offset": "0x30",
-          "type": "char *"
-        },
-        {
-          "name": "arg_7",
-          "offset": "0x38",
-          "type": "char *"
-        }
-      ],
-      "instance": [
-        {
-          "TXN_ctrl_code_file": "/home/strixminipc/Triton-XDNA/examples/elementwise_arith/air_project/main_square_kernel.bin",
-          "id": "square_kernel"
-        }
-      ],
-      "name": "main"
-    }
-  ]
-}
diff --git a/examples/elementwise_arith/air_project/input_with_addresses.mlir b/examples/elementwise_arith/air_project/input_with_addresses.mlir
deleted file mode 100644
index f2c48f0..0000000
--- a/examples/elementwise_arith/air_project/input_with_addresses.mlir
+++ /dev/null
@@ -1,328 +0,0 @@
-#loop_annotation = #llvm.loop_annotation<mustProgress = true>
-module {
-  aie.device(npu2) @square_kernel_0 {
-    %shim_noc_tile_0_0 = aie.tile(0, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
-    %shim_noc_tile_1_0 = aie.tile(1, 0) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 15>}
-    %mem_tile_0_1 = aie.tile(0, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
-    %mem_tile_1_1 = aie.tile(1, 1) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 26>}
-    %tile_0_2 = aie.tile(0, 2) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 27>}
-    %tile_0_3 = aie.tile(0, 3) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 29>}
-    %tile_0_4 = aie.tile(0, 4) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 30>}
-    %tile_0_5 = aie.tile(0, 5) {controller_id = #aie.packet_info<pkt_type = 0, pkt_id = 31>}
-    %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32}
-    %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32}
-    %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32}
-    %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32}
-    %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32}
-    %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32}
-    %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32}
-    %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32}
-    %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32}
-    %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32}
-    %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32}
-    %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32}
-    %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32}
-    %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32}
-    %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32}
-    %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32}
-    %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32}
-    %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32}
-    %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32}
-    %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32}
-    %buf9 = aie.buffer(%mem_tile_0_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf9"} : memref<1024xi16, 1 : i32> 
-    %buf8 = aie.buffer(%mem_tile_1_1) {address = 0 : i32, mem_bank = 0 : i32, sym_name = "buf8"} : memref<1024xi16, 1> 
-    %buf7 = aie.buffer(%tile_0_5) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf7"} : memref<256xi16, 2> 
-    %buf6 = aie.buffer(%tile_0_5) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf6"} : memref<256xi16, 2> 
-    %buf5 = aie.buffer(%tile_0_4) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf5"} : memref<256xi16, 2> 
-    %buf4 = aie.buffer(%tile_0_4) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf4"} : memref<256xi16, 2> 
-    %buf3 = aie.buffer(%tile_0_3) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf3"} : memref<256xi16, 2> 
-    %buf2 = aie.buffer(%tile_0_3) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf2"} : memref<256xi16, 2> 
-    %buf1 = aie.buffer(%tile_0_2) {address = 1024 : i32, mem_bank = 0 : i32, sym_name = "buf1"} : memref<256xi16, 2> 
-    %buf0 = aie.buffer(%tile_0_2) {address = 16384 : i32, mem_bank = 1 : i32, sym_name = "buf0"} : memref<256xi16, 2> 
-    %mem_0_5 = aie.mem(%tile_0_5) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_12, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_11, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_5 = aie.core(%tile_0_5) {
-      %c0_i32 = arith.constant 0 : i32
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb4
-      aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1)
-      cf.br ^bb2(%c0 : index)
-    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
-      %1 = arith.cmpi slt, %0, %c256 : index
-      cf.cond_br %1, ^bb3, ^bb4
-    ^bb3:  // pred: ^bb2
-      %2 = vector.load %buf7[%0] : memref<256xi16, 2>, vector<32xi16>
-      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
-      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
-      vector.store %4, %buf6[%0] : memref<256xi16, 2>, vector<32xi16>
-      %5 = arith.addi %0, %c32 : index
-      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
-    ^bb4:  // pred: ^bb2
-      aie.use_lock(%lock_0_5, Release, 1)
-      aie.use_lock(%lock_0_5_13, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_4 = aie.mem(%tile_0_4) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_9, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_8, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_4 = aie.core(%tile_0_4) {
-      %c0_i32 = arith.constant 0 : i32
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb4
-      aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1)
-      cf.br ^bb2(%c0 : index)
-    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
-      %1 = arith.cmpi slt, %0, %c256 : index
-      cf.cond_br %1, ^bb3, ^bb4
-    ^bb3:  // pred: ^bb2
-      %2 = vector.load %buf5[%0] : memref<256xi16, 2>, vector<32xi16>
-      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
-      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
-      vector.store %4, %buf4[%0] : memref<256xi16, 2>, vector<32xi16>
-      %5 = arith.addi %0, %c32 : index
-      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
-    ^bb4:  // pred: ^bb2
-      aie.use_lock(%lock_0_4, Release, 1)
-      aie.use_lock(%lock_0_4_10, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_3 = aie.mem(%tile_0_3) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_6, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_5, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_3 = aie.core(%tile_0_3) {
-      %c0_i32 = arith.constant 0 : i32
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb4
-      aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1)
-      cf.br ^bb2(%c0 : index)
-    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
-      %1 = arith.cmpi slt, %0, %c256 : index
-      cf.cond_br %1, ^bb3, ^bb4
-    ^bb3:  // pred: ^bb2
-      %2 = vector.load %buf3[%0] : memref<256xi16, 2>, vector<32xi16>
-      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
-      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
-      vector.store %4, %buf2[%0] : memref<256xi16, 2>, vector<32xi16>
-      %5 = arith.addi %0, %c32 : index
-      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
-    ^bb4:  // pred: ^bb2
-      aie.use_lock(%lock_0_3, Release, 1)
-      aie.use_lock(%lock_0_3_7, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_2 = aie.mem(%tile_0_2) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_3, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_2, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_2 = aie.core(%tile_0_2) {
-      %c0_i32 = arith.constant 0 : i32
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb4
-      aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1)
-      cf.br ^bb2(%c0 : index)
-    ^bb2(%0: index):  // 2 preds: ^bb1, ^bb3
-      %1 = arith.cmpi slt, %0, %c256 : index
-      cf.cond_br %1, ^bb3, ^bb4
-    ^bb3:  // pred: ^bb2
-      %2 = vector.load %buf1[%0] : memref<256xi16, 2>, vector<32xi16>
-      %3 = aievec.mul_elem %2, %2 : vector<32xi16>, vector<32xi16>, vector<32xi32>
-      %4 = aievec.srs %3, %c0_i32 : vector<32xi32>, i32, vector<32xi16>
-      vector.store %4, %buf0[%0] : memref<256xi16, 2>, vector<32xi16>
-      %5 = arith.addi %0, %c32 : index
-      cf.br ^bb2(%5 : index) {loop_annotation = #loop_annotation}
-    ^bb4:  // pred: ^bb2
-      aie.use_lock(%lock_0_2, Release, 1)
-      aie.use_lock(%lock_0_2_4, Release, 1)
-      cf.br ^bb1
-    }
-    aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0)
-    aie.flow(%mem_tile_1_1, DMA : 0, %shim_noc_tile_1_0, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0)
-    aie.flow(%tile_0_2, DMA : 0, %mem_tile_1_1, DMA : 0)
-    aie.flow(%tile_0_3, DMA : 0, %mem_tile_1_1, DMA : 1)
-    aie.flow(%tile_0_4, DMA : 0, %mem_tile_1_1, DMA : 2)
-    aie.flow(%tile_0_5, DMA : 0, %mem_tile_1_1, DMA : 3)
-    %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1, Release, 4)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb10
-    }
-    %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {bd_id = 0 : i32, next_bd_id = 0 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {bd_id = 24 : i32, next_bd_id = 24 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {bd_id = 1 : i32, next_bd_id = 1 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {bd_id = 25 : i32, next_bd_id = 25 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {bd_id = 2 : i32, next_bd_id = 2 : i32, task_id = 0 : i32}
-      aie.use_lock(%lock_0_1_0, Release, 4)
-      aie.next_bd ^bb10
-    }
-    aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0)
-    aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0)
-    aie.runtime_sequence @square_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
-      %0 = aiex.dma_configure_task_for @air_channel_0 {
-        aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
-        aie.end
-      }
-      aiex.dma_start_task(%0)
-      %1 = aiex.dma_configure_task_for @air_channel_3 {
-        aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
-        aie.end
-      } {issue_token = true}
-      aiex.dma_start_task(%1)
-      aiex.dma_free_task(%0)
-      aiex.dma_await_task(%1)
-    }
-    aie.packet_flow(15) {
-      aie.packet_source<%shim_noc_tile_0_0, TileControl : 0>
-      aie.packet_dest<%shim_noc_tile_0_0, South : 0>
-    } {keep_pkt_header = true, priority_route = true}
-    aie.packet_flow(15) {
-      aie.packet_source<%shim_noc_tile_1_0, TileControl : 0>
-      aie.packet_dest<%shim_noc_tile_1_0, South : 0>
-    } {keep_pkt_header = true, priority_route = true}
-  } {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>}
-  aie.device(npu2) {
-    aie.runtime_sequence @square_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
-      aiex.configure @square_kernel_0 {
-        aiex.run @square_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32)
-      }
-    }
-  }
-}
diff --git a/examples/elementwise_arith/air_project/main.pdi b/examples/elementwise_arith/air_project/main.pdi
deleted file mode 100644
index a2347424a644d017f5e8ac814673b9061a6becd0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 368
zcmcc1z`)>QtR&b`o)&s$_o|r;3@i+cKrs~{2GJ%!3<8rIPKiKR5Yp2nAV5Jtfx!S-
z5XuJWBL`Gy9h?QS4_SX=W@>z9Zen^W!ax}W6O}bv!%~(5C<MdEvj3qV1SrVHz`%rT
k2eLX!*nR*1|78Ih;OycLw1I(v8OYwcbHjf@pk*)&03)a&DF6Tf

diff --git a/examples/elementwise_arith/air_project/main_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/main_aie_cdo_elfs.bin
deleted file mode 100644
index cba6b8778c42200ab6ec35c68cb3586f8fb4e055..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ

diff --git a/examples/elementwise_arith/air_project/main_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/main_aie_cdo_enable.bin
deleted file mode 100644
index cba6b8778c42200ab6ec35c68cb3586f8fb4e055..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ

diff --git a/examples/elementwise_arith/air_project/main_aie_cdo_init.bin b/examples/elementwise_arith/air_project/main_aie_cdo_init.bin
deleted file mode 100644
index cba6b8778c42200ab6ec35c68cb3586f8fb4e055..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 24
dcmZQ!U|?`|@n>LQVqjndvbXQt@Lv$f001PR1P=fJ

diff --git a/examples/elementwise_arith/air_project/main_design.bif b/examples/elementwise_arith/air_project/main_design.bif
deleted file mode 100644
index 27149ca..0000000
--- a/examples/elementwise_arith/air_project/main_design.bif
+++ /dev/null
@@ -1,10 +0,0 @@
-all:
-{
-  id_code = 0x14ca8093
-  extended_id_code = 0x01
-  image
-  {
-    name=aie_image, id=0x1c000000
-    { type=cdo file=air_project/main_aie_cdo_elfs.bin file=air_project/main_aie_cdo_init.bin file=air_project/main_aie_cdo_enable.bin }
-  }
-}
diff --git a/examples/elementwise_arith/air_project/main_div_kernel.bin b/examples/elementwise_arith/air_project/main_div_kernel.bin
deleted file mode 100644
index e44b65c166f6fc2f297dc1df988f5b8b540f6252..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 22460
zcmeHPZ)_aJ6@PoT_8qCs?KyV3*bZJN1{-pL3o+C{0(*&DS=5Rg{<LlzwXRfDq_(On
zmC9dIF2t07L6``np%mdFp{i0poU1m9tbVwbfB->jAV{f7qXSa4m5RDXR62r#`(}3L
zc4v3S-XbFU;YPZ<-QWD?&71d|nSHn3GZ<!T)&QvD>vvxU$O0aON?IH{Tg4G$^x~PQ
zvrhN3x)i?CSGdkzqYl=n%PQ&u&$dA20szwhV!|lYF37O{>D#5)`P(<89q<aU=mPu%
z=#gz6vQA+0ERfyR9Ke~<eC6)!TcXw13<sP{-3G7(1YpAuKcg+{s{lWq+6EUQ0oiDi
zJekgs&6yn8J(MF|m*mLR{W)^m<{Y{6q8xdwD|cNUuEMBRu6g@M7F1q0%5Su1N`S}3
zcZ~9XF36;^1+-OLwX<6`;Q<yqyN4eYEVDiRQQ`XXGWO$uRd>p^{Tmpuhv7z_i(^Oc
z$eq<(YN^^{RPL_hXUUk_v}*5sKX;~d`ND*Y<8v`e0Carlv0q5n3s}Ex;pVYpKH$J~
zW|dt3J`bXA8Q}QgbocSYhr3DF%j?)%N@njKw0BRshjg7peqjy&PGSA__5HgS?cIAA
z?Y)WoE7q`A!}=@Nw|76<yPuBt4D#2C-#@TELHr&(+<pAPH2U{1<QEoholxshcjmh3
zc)v8cjvf!CdyYSZ<89ew(eX}B>FilObi7}^V;y@BAMQE+aJrkcbfdkYg#*)Cd+6U~
z3tyjE<nyKD1N)JeGxj4(_&SNNQ}}ulU$xB1eJSVY3chdYICpa2mVu+^W(NCvW(Ifk
z&J30|&kVk5pWN56^}_nh;N#shgTL&V`Nd-I%->(an!jv5^T%V`Chf)J8{h4|s@1u#
zer8|giL^ayxYK;%f^E5U{m@nQdaAVq9RPe@+hE}8nc}O*&d%e}EWi|PX=j1sI6e!q
zDRB6DI7ThwDVjM)+P`3t_Of-r`R&3r9~^M5tt{5brm}U`?1J)bZ)BWW!FaTFv_{6q
z^VfY7tY1M}<%arokF5TCr4K5Z6<qB@sajv<=@0Q}EagDu(hm>(^GM~2vuA(!0F)nl
zC6j|=GtU?%^!mHU7r~mv-GT0xqA>sNGirUpKNr0>#2CpXFItNh3>gmcjm1g0<GZi{
zhP=hekM7t$1keJ7%ihIRUike(ty|8K?I$)7?|F+j_$t3-k*~bafw4S=o;3>ghiCQ1
z{(w6~uuD8w?!fxF)jzGQ3V)7%YmEE8uM=(A=@DUS6b-qsro9%|vK<2oPt3ukE984^
z7P&o}FAP|zejpc(=P?#j>7s9sY?`&mcGFs%Tm3#p7kwIq`81TtaBH4ydjYxb9PaSA
zLw^=XyPGF1IL2qH)>{4B8L(*l2R54i7<$I(7)C^&wCA<jla?}Te_|-h+GE>F8h2p7
z(D7IA!ZFm+S%Ix%uj^39(fzIi&}x$JY{0qTo=T&Ne(}d^56x!^KR;r$0x`;Yvikvx
z?8XSR&sw&3Q=VLM0%t6q--gj*QGqOtieGp<aKtCJntVQCAGN^exPRjMM>_vt5esNM
zY5Sp6&>qajm~n;C@iuyl+mq%Kj+fe-+kpOYd-QtBiUZCKb^F!lGWNB#jJ@z8?j^5e
z>>s?EvG>1`v3LAAW9R>xiC>Sge;3tk6g;0W<@lrjWuuk%M+%?I7oP8dEe6KZvd_-u
z?XwqS>`FBH(<yFe6r9%bU1a<7)-kmGGrC4_2HHh8Z@-hah*8~0Hs6%D2QOZ8zSW&B
z8^?<l)a#4aQ~z3oK6WLQFFbX`$b3W&K5dbM19_4gx5$;FdD6GnBDr!W>ASy^9GuVF
zl_PmlMBS}aCvoxf;FM)o_E-e=STs%rV}r3VMRu342GQFAnRiovU7#5ZkH@0F|KaG!
zlh5W-)84Psm3!;t+bfwu%5>9oJ)nQshV#Yzpn<ZlJrN{bU_bt>LZ4`UNZZjUtMWrt
zoyZSWbs|4h)q%5<!mJqh9`4UNy@TP@kGL=TI#{D!R#8X$^~+VwT(z02Hgi?B_e8&o
z)SYIo+RRm(xhiJu5aiy?T(y9?WiwZ8=Bl4YuIlH2G<VbGhpIY}AFAp^eyFMgXD@|W
ziRY>%hO2W`eI1<E*JTxTv|qpMY38cUT(z02Hgi>W&x77!Yu;CF-dBBVjr&;5T(z02
zHt(x~`{~?Q_47QMyTx-=eVxb;RqaH6sHy|!77DXMb4K~T>VAf+b5(sEoYvQ66?L><
zzuer+RhzkLGgocqs?A)r`3ykw8Gz<904yzNJ_GRo`V4@d=TVX3xvIWS<cF$uB0p5s
z0e@aqm5++La65X;iq|iW@a{)YEDi2x@S+BHHTZ-EpVHvd8oaE*=QQ}d2Cr!Fss>-u
z;57~IMR4(~h<J8Gty}P1hnPbRg}fb&Ls>U9?r8W$4eo022@O7_!KXENS%c4M@Oceh
z(co1LzNEox8r+NEfo}faG1R&Rf8&Vj7JN^YbyMSxr^bsK+|}R{8hlEFPiye92A|X5
z^BTOO!K)g4NrTrkxEH|#-Td$LYTbhG`*GcZJzLgIjXRziFKTdCgHLGiDGffY!OI$a
zPJ_>D@QMbnYVaiuUen;72DhM>N<yFRm#$NISBBxVPSS<ltx@tRm6ZsenpW9oIIRow
zC-{DDL~u8<&(n6a6g7ND!?!ei(C{btJ{9*zeLW~@_)cWM^S#Fh$u)e?@F)2Ci~F<o
zeir3Bk#!vS<J*byLBpTm^D4@BHS?{g;X4|>rQt(_Py1(OX+N;XpAGzxPNdXsG+PDF
zY4F?#3*en^m=_A3+u*S&EOGz#l*xY)0MN<rF;YT!K3g1*#>dp)B2SI-+d}s+!}dk)
zD)_>0SSlrqgW$Q+*yxEI2f<q}4yA^ju%8xxUO@OMNTvoC_m-plw$O72Vf*5Kv*6QD
zR+e@ojDz60ykYPm$3gJci-UT95YG(@KSeJlfA%+ui|0Y2cw6XMVL1+dzk*LcSy}2&
z7ze>~dBfmCj)UN>7l;0afv}(Aw|>G;K{EO8^`f|VhC7P4g?`@`wlALR7JT~Ymu{2&
zJcM0+=#L2E#a$`k*MGk_!1)a5j*}57t-6N<AJQSd@jwF(Hw^RLu>I8KVOrxj%y+`}
z(?8YP7mbHoyOr13BFCwLaNJA{4jR0T<J3mjPFBP3<Ty<(!hB1^@8$SK5xyPa2Z87Y
zrY{3D`mUD3dIfxqUT%nw^C94;)bl~&r$c;=USU4Y2j4zZO*0=>+*u3<E2ytHl;Vl{
zGCZp9E{;d_8qx4aHMpz6$2lI=YfQt(`NsT=&Ig=tlK&xnQ~#sB0Y90(0Y90(0Y90(
zK0mp>$~+wPWw=6LhAZ?{@=;$Uj`}Kb)R*B3eU*IFm*Mm+yxvh?C4ZOaf1_*xD2Mr-
z%Z+cE5}xJ!&*M+e!+Z$%GX9=#=W3%bC-I?&%{SaJB>O{#k7K2T7dbzQgNE;NKI$de
zF*W=G=VOp0epbUD;d~sO#BbB^hrKk5zaq{IkB$rD%y31V1-~lA^F5!hh%@73axVGl
z7(K#{gkzkQIL29tW1JbTh_jN9ab`HaN&RA+m3)jd!xeE>@+~jTR=O3j?=JheBF>Dj
zh%>{HAB{7^qj4U99>JH!g>h5koR9z1K;rMx@J-IgC#xm?h=!l#{PqZcSi^7Q{8-#H
zeAC;Cp}}9lM(o?mK8{tApDyRe*1Lv3#`*E}qP9QE`LTJfv7hC9{6Z=DIjP}~bAD`|
zYxt(;-}8?9)3_Hd^*hY@@%X6uyEq^JorYv*U|G%2a(-8Yf2)Q+#`zl~{BaF`ob%)H
zQTt<hd)fSn>+9pu`7_M<_~TV+yuHh6ewOp&@l^B2I6odAHGiD*<KtEHCpka1-ZlQ4
z-YuH=_;@rvS<a7-SIzI_{J8&We$m^?_FKhzSMI|joFDhcw-fb$7w5<4gPK3g`SJCo
z=1*|`XCm{bu&m}!az6gA3TeKLY4`<irzSo=9v$xh=f~%>nveSfvmcL-nm@t$@$stp
z4(G@A|7A75;O*7;<Kxls+MFNvN6p9e!tBSccgt#iueX(5XWJrr6_<TH?vIZv^i^K3
zJGuS1KWh69=f~H(j=xjmzmG@#@AQ0Jp_kHsj6cKU{`<JFpT+}LJlq**p{+2e;b%s{
zGnh82I@7~{m4XHRV&AK!cKmPq)G~d^bvqI9@pzrTr>fie`&GD(9<#D!_}}fRGjgfo
zsXqq%JcK7jxt^Vw*nX|j`C3p<bMeO6lHq^n{P_Jc-x~HJPGG;ncZl@s-)sB${W9$u
z#(BSuIq{mWE#RbvK3_lKnknk|`YU6F?*>{kjBsoMO<d8MeOB>Zbe&uim$hxYCDKH0
zQ^m$f`emGkCbA7kq>0=nX}_{2+{Ss+qzvIw#ginl3Ae!+AfG1EL~c_>3rYHwHIZ#3
z(?o8Qv|m{hZsWXZLcIz~6%UidCTKyAs`6<fP2@K8EJ?qzCbErWn#gUE_A6_`ZJakv
zf*lR8B=wPUL#45sFf5TKAFp3o6WK;GP2@IJbU4XekTu~p&YLE#6wjnSve-A?WE(U#
zjjG7CHn_&iZIbpYYa-i7rit7pX}_{2+{Ss+q#VQ#1DVuEq=t%YBbg?0n<`pMGGAp)
yWE;sek=rEgSJs5vIB%L%KLJf-8_6`0+a!&Rtch$RnI>|Zr2Wd8a2ub5CjSM>GL3!!

diff --git a/examples/elementwise_arith/air_project/main_mul_kernel.bin b/examples/elementwise_arith/air_project/main_mul_kernel.bin
deleted file mode 100644
index 48ac55f27234b0be60abf7239927bde8dd3fe95a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 14460
zcmeHNO>87r5w7-h>{)PjCy9v$uTgq`gjSIcV{9ZNS)!dak_ASZ5Du0&V7PGMU~<X<
z2}fB8gb>;b$RHtn;XdM!!=6S;4xG>^r<_(!fO}6)IjmUfy;raM)q7Rm$11+8_Q{^^
z{_3lG)%8{X&OlKv_6vY1{{M0tpbxkbjGkYwZuvFKSi>!E8$7&WZ43Bm{zKdF0vlXl
z>kGCnvzu^^8)X3#fIT?BvSjBsA797GKmBg6f@U2p|9~ZUe*lwN0Ttlig8_^_LC;Sf
zKR7HtzH$@by`=|-@bUL*fEW7jzw@_ZI@qwjCKkYO_{OVlZGBX}xcJ}G@BjX#)A2uF
z`uFbAPkys;`W1lFKZFGvzFPN(f2^+n?C+khp8o0=U)wt}^X=82F5;{JIK$81Uj9*G
z=WyHi-o6@a&vu7D{oP`5)8vnKH;jF<5N*F)t%G6t!RIz8^LO!Z{u(xEKFofx_!91A
z{|xu?9>5*rHS7cJGy2z9WdPzoI7ah=ne+57A6#2FZvNhz{QC3LpFLVyUnuJy?H~FO
ze+V!z5zQEW>%&eY82;&lr6JrQ>xTuv=a18VGJXsAFZGjT^Zg{*d_PGxs5T932d2X2
z8$8^wxXB~UMYq8Petp4a=5^y6&wG3FEcdqS8Kbq<Pm;~|lVtP#B-x<4YhXLl-rlvi
z+}pYh9_qHfU^DZ&@y7Gsex`fd^@Y(o?QPxW`$_Wi{Uq6-ddt9em>vtS6N86uT3qgJ
z-3Ax<^#xnk+dp{T+s|}wyPh#kmG-u7^Zg|G`F@gYK=<2H|9V^KXV&d#Zum98`>%Ek
z6ueUKk%HF>K34D}1wU5s69vDo;AaXxQShmP&lLPz!J7#8&qDrQQr4}z&-6Xi3e;@0
zt6Vo3uN3}B!D|H{EBKLuA1nBYg5Ov0GX<Y0_*B7X3VyEOO@w#4x%(tpx9%P))vdd(
zaouFRYGiz*;I)E}75qrSj}`nx!S5^hnSxIge5&9x1wU8tCc-=2+&zG-TX%nu>eijJ
zxo$FEH8MU@@LIvg3Vx*E#|nO;;P(~$Ou;7#K2`9Uf}bmRqu>KrGm@CM3+y_DcX1Xs
zHl~ZcyA%0_YDdI3p{4j1H@1*Jrt^6b;dOMLH-2V}6n>@f2MQk){+P~FX+HAxV5IP?
z=zQn0#}_jS9~AzWj=waYi_d2<zlx6IE*}>s=7Yi?Q(wjWTIsiu!mkwmK;c8gH}l&u
zd(*u>EASO+G|s<WE_z<o^6H2M@UI`2S3R$8dA5Za_dkCv=?xvg3O$n;LfFs#`6sSV
zN&Fc1_o*?y96a-c@%_E4=bNP+v)#-(cwSA8-GbM_^PW_P-8Mj&=YZZv_&hzSRB-<+
z8uOQf_Zne*|9tBCW@*Q4BeM>kSCeD6;C1l4C)GhdKlt~UK2JYON$(<K+`n&%@#Wy%
zC9i{<*YnNNj@gaOI(S}9j@^RS!SkL}hZ}8xFi-zoh0oKIO8UMb#{E0x7+(&)p9tgo
z_spJemTs(;BeP<!KKRmI81<vwd!Anb@hz|_umN@-*YA2h^a6hI+ZGNTLw+5`UwA#F
zwd*0j3gh?wtj71wTZ^mWb#_F!2@uw;RB%x6Wx`F2Fiv0LuMlo-E<*l5;ja<?)rdcg
z_+24Z!Rl)PMc>D}p<W$d(W?&l*bg0lLGA~}?*)8CuaJ-Z;Nn}=lzw<r-@|g)f%^JY
zR!`K|;<3IvgvWYqEBrkLuNC}&@K~>Xg^&Ga^Njle`;Fxv=)3TL>f7=2^zHb0`gZ(0
zeH}kvU-3R1^|iR5uf+v@MLy~);;65PqrMgw^cDH2uf@$rc)X*&B7diGPn0hJoP_+9
zdHbZv@ILXsgI}G8eCYTb|IEetX7SoJ#)s>+-|&pV;@>G8jz)%$h#%vi@N43uUMx<j
z@YjitMPmHE!rvx7?j7SVEBvjdXX`K2+2ZkjVVx~5)Y<c=te&4azEEe&$D4DOXH{JH
zaTtzu7ICbzh+~~CF4S4%W1TIIA8cN%v&hFfTU@BK$R9L4d!*x6>@0pgceqez%NOcw
zapX^<I$J!h^KH29`D|XSn~W3xT9oIG!Y_$`HR5k8{66ttjQCp$f0_76-4uS=+``fT
z;LkhQ{to93M-#(q;wQ(u!rv!;dc4T-_lTeLxe~ume0-o}c^)eK1L7xruJFsoz4K1<
zY2SsjdAEq4)<@>=5FdYX!{XeY%ltm^@uxeC|3ihpPyAOR{(-_jAbwgOIiIq*Yx^hF
z*Wq#hY!M%SOvCbDo6Gz@@zZ+B{C(o5^^y4p#83BE<{uJ2Io_50%jPYmJ`RuT(<gqq
zzcPP?_-Xz!f7INv=Ud@;7th1n#82~aapL@Uh@bX@%-<q@dVI<JG4a0~_0RfT<{uLO
z<%qwp@YkD7r9KXi_xCpO(|(rucz&?))B4E#G4a#=mH8F%lk@*v=C3z*m3$l?@9&WK
zX+AO^j~5$1x!%oX{#tX(UT2phy+(6~r};Qs&{w=(uTcCnA31(S{PcL&_?t@p4v+I+
zX&f%-CFYOyw|JVr!+rc7uGrDwnZXR>g+&d2V**WKwUKOPQ`mVhFZ*20{OG%V6WRQ8
z+a@9oPi^j;D*L(fRcJHocHqxnM`pz&!^adi-9F&*UEHud&VdsBFB&T!b#`-)*lx`V
z_sse9^V;0ncu^8Kug@K#dEL9Vr=Qnq*9NC~+n)HyX$#oY@S4-lr&f9!9e*ciaA{z)
zq6ll#X;OP{yPV>upE_TY+WNL1S(<R4DLUuOYdNhZJO*TG!hLe*<(g28OVi{8!ZF2-
zoZ5sw*a3VqOB3!h#Rxg`a!q)QJWaSy?z~(Rig9V0n5?=n#nqhJbRK4H$~Ut#;XY<7
zXI`!ekCCSd_sN}?YeF$DO_T171~_taG~8gcXA_HMY4UXQa!q)QJWaUI6cf(r1+EFj
zxHL^_R?plVZSC6+9>dh8HAUCj?i$a1a_8lm@ECcTaG%_Hxh53j(lj~g>W78Q%@L`=
z;4$(v;XYH0mea3X6CNW^6Yi5cFV}=(T$(1+FF+F>BTp0VlUp0E36GJd3HQmJmuo^X
Iz6eeJ2joH%+5i9m

diff --git a/examples/elementwise_arith/air_project/main_square_kernel.bin b/examples/elementwise_arith/air_project/main_square_kernel.bin
deleted file mode 100644
index 8ba56366c72a88bc1322e4d03447a2fe49afc7c7..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 11048
zcmeHNzi%8x6n=ZRu{U6n4LYKYW0Z9gp$J@@6C(vF5_`l*78v0qO(Y5o4Go1!MaM;0
z0U?B38YCn)^e2iGsa8sAbV`-Vx&;1!b*dB*=FQBT+xhj(#)^wb8OgV^-}~mxoA<t(
zo7*!)@k9~<e2RbP-T_DeJ4S2kett&xBx45ev@PA9m9`eXia*nq?O}sGY)QaYM|K)Y
zyon>&0$7C6;Ym5aczFvafAmp1hiVQU0;F&rnV&^DKn}N47z`uqz{A};*P{oAPXl~E
zdFMJj__zQ#l-&OIkEl%N<yeqyuDd=p@$cp*zZ~0K`Rmv}y~!_roZoyy+4O66yhs3+
zqS1-X@4k6+v3Q7)E#{9Va25dc`1Oq`F~5oj@7He}G46-G?8~2T!7SO8Q(+&p#Q6Fi
z$KO$R*wd|^TOck!OID8qY~F&y09gXZ*RjNUB=HN(BFOv*|7(c0L?oNP-#yw|tNv^c
zet5F^)zitjR$R1cJy2+VCh|)yd+%Od1jv59JDI^bvfjgtJlXa2B8GB$3E8w>LN=|J
zkPY$+0+tO!vGkd4&r4jy;m5+;V2^Q0z$WHZ?fgsa%0#@V+EtnmOW^bpvT419Y+5fN
z8|0S-ESs<0p2Wl2<!!Ld+me7y%&Xc5FSRQZ@uF&1)m<#X*Di0<dI^owdI{Mezb0VW
zgoey>LArfK;$iLbHrQia60p_Uz4}tSGUG3*c9kZ?5`69QHm#S?IIWkE4d}cZ(x<kW
z<3wM!YF76E&qw2yGI-A5T?Q{0e1*YR8GMbw`wYIp;9CqnVDOT`hYUVq@XEpUcM*No
z3)iha2WkzC3slXIyLR2ec+T*<3|=t!3WKjQ_!@)v8GM7mw-|iD;3b0(8GOXxm4nxH
zQ|Fv;-Rd)tU$^>m%dT4(&#N%rW$=Q*R~USi!PgkP&)^#jzQy1J1}_<W$lxOeuN=It
zoBE6o*RB5i_v=>g*>>H+cwU9^E`t{gzQW+E48F$TeFooP@GS-(FnGz}Lk1r)c*WqU
zJdMe3|JLV}I;E+t(5ef}h!TpY1ukFW!e;Wjv_D4<o;&-t7$;iF@PXlXX&>?ZhxhB0
z;e)dusb~PYq<7zcc>hQlJ~;22+P~7X??0^bl<Mg6!KtsX%Qn2C9`;1CWEz#%(fjd)
z=H+8vVR6LG-=D`vG}xiDwYWFytk3aoTru%oT%X4@uH%4VuVK8JSLo2jS3BO`7`Nk9
zxSn5=V*1|e;`+Of#>G{(;e5k*HLsw7^~H{t(Jeho((~)lF@x*p44PlZJKHc`&HKN2
zXLTr!9u%(U*QA)9=eoFlrl)anRc*S}6&<nRz$@a#YsJ&vy5{FVd<jf@TmkfUi<_Dc
zZG#{EG=|NACciNKx862s<G#tyP5<rRSpRzP!o+lVA0=EkFm)U=I57AW;lhXMCt>&<
z!bMOfKV|qcgr9KunZvIWF%7c55@71P+cWD`^O<@T1|RiM^IKs(Sp2rZXX<70Q4h+$
z)F-2dr^Py^LpH3h?ppcm67JUb65(#WE;9T@1}_+VnQ*sWOAH_NCgXSYfO?a-7k{Nr
zh=&0BrhYm*<CCL>C&d2{pfveV^KJaG^7C$V>ZrwslQJGul;!_iq;ND^c$fGt4h+8_
zKCajD6Epld;$z+{e!}oC5+7G*@uwJmTD9fAfj_la{9;Gp?z%9~k{`&k#F1ZG`FyPS
zZl2G=NzJ$B#k_@a;^TqB;$LL=G4Z{1G5m!1uQ~n~82%LTk2?HI3_q^UU}^yHXFV(a
zY)9c}vhafVo(>uQ67l`_OW6M+@$sieYu=pkzf62jw+#O}@xA;o{J2ujR{VIz&s?lH
z7l`lYC(OS@eEccX@^f}4%wHzHzuqvvLwtX|VSZd)mU{5(rEpgd3&h7CNv-&2cEbE+
z;`{N3`5of>@rU_wb&ZK%;comL;`{N0`B`;F?gOtn>z&(Cxa&VBzW07-_#NWE?)YC~
z_%p=!_8*3yRTr4}74FBca6kSqKdUY?{uQqMx3MEzg_##koHwTI-%{c#lD=?mc_uM`
zXVK59#c1(o+myD$Hl>@eP1W4AiN0+34;Ed~5y9B4#9JAsxZcDU$-}GJVjRC$>)fyH
z;&W<zH;mNxh5gSfV;lRUAjbK>uW}6d27^=PQxYtRwviV5NKs*+w<t1WtaDQ!sct29
zpFKB4?LhPz<;EUU;;<(3+PNVg2bP;YMpd^GyG>%OM~GhEzHO8ndrXNQn#^nGhI|}Y
zZbVddE3w-o#(IS4mG*6;+}LBpU6Xn3+>nn0%T4`G!|>!Po5U!%MzeQ-M!DJFymoHL
z$ARUhu+~-Lf}cG%wvUGU&A#Tfb3^kUSZ@0JiqX&vc5NTca$}EaS`#}r<m13{Q$B~>
T*gl%&#vZf3+>npwk(>Vj59vP!

diff --git a/examples/elementwise_arith/air_project/main_sub_kernel.bin b/examples/elementwise_arith/air_project/main_sub_kernel.bin
deleted file mode 100644
index 32bcee0ccb77c74fa42e783e1359dfb1a7806953..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 14396
zcmeHN&ub*t5w4zTOXFm`J9d;X*;R0_cL|HpVPq+wfWR%U0wO3dj&oSZ!JAy{Lk_|x
zALX!+5JK2XKnaBC639{Hki(uD>~k3W2Q0Fa+*>~Rutcf%UcK&D?^S7^7wfX@gPHFB
z>Z^Ly^;Q3zfudX*6#xx>wzmLAfGfcm`t|CjU$cx=-14@`qZ`&Xz;E*jZPN>EaDi<k
z*!s*~g9UDs1^gXg7fxSXw)2~>@8jg3|9DtIy9S>DOyCXNv!53g!0GuvCvbv(xLyn%
zPGI&K#`yK}!@I@%do{odqlX{<t*9sK=;_9E^wxLW+S;UiY3YAQKl#JUNArKZ{GXlW
zU;K9c=-U8CfAr<rd$WAy==|V&yS0hBTYbI+W`2zLs}IIzewPn!-~ZrBu)Wxs{_^)r
z#cO1D(;558!1xE-uT*Rs<6)jQS2MRE#&$P<Q*6+DnD0{Y4S?y^G49uWfZN7vx&^eC
z$gi=gVC#jEA0D82VCFdb%g0v-2kk$GC%^gf=)K3wYlE^L(*B_j#h;pb)0qD7lU@{<
z{^{f8DcmOOCk4Ql=V|{KSFL{}o9`dV=KDvoLA7CEJ1`a2uaigX7B_jMx#%{yz;7hj
z%)D;A@w}Ixsa|$HVzkxzN3!|;k!-$yBpXzB3~Wc*%R3gAds(-^Bi%L<Y-V0J-g@54
z&r~nFJ}}y*y{y}O|44qme<T}J?-<w)(^KJn#N^T27MFWjx4{K|Bf-}9@=u@l@-x-T
zu18E0rM;}%eE&#(zJDYe(EYO1uih5=nRPqb8-5M&eyST21+Nr*rr@=L&lUWhf*&aO
zp@Kh9@M8r(QSe5=7Ycr=;BAEaXBmHQDC^eWH~Jpx1Zvm2Rj!+iR|<co;I)F!75tup
zA1L^tf<I93V+B7^@J7KG3Vy2KZG`u_x%(hlxBea})vdpdaouFRYGr(;;I)F!75tup
zA1L^tf<I93V+B7^@J7KG3Vy2KZG`u_xof_xTYo>0>eipLxo$FEwK6_a@LIv=3Vu()
z4;1`R!5=92v4Wo{c%$G81wU2rR>3E*Y9ujl3)poE@7gSGY)lt>Hz)E9)sBd7LQC;2
zZfqfcPUrI?!t3ZfZ~V-dDf~*|PZT~V{5hSc(tPCW!A#*-(fQ70k1tvjJ}CS-9e-&)
z7oX2!eia?ZeLgNu%m;-(r@o5$wbE}hg<mQBiNc47Z|1jS@wR&%R^ThpXq>)TE_q(n
z@#=^L@b7oaE1p+(Jln#I``^Eo^u`Tfg`PnSA?#=Wyc5@_Bz}zh`_vd82hTWRe1Gri
z`DSUyVkff>o>!A&zu<N7yeHLRrwb6~IidFrK2J|772H32#{6;c-Xe_epGQ64EbUmV
zXV$^<YI5usybhlCq&mpw2mfBu=jmrD>D^+C`}avPJ`UbR@;bPAJ>M+tSlr00gXh)c
z*e`e;Jnu<$xX}d&^Yp(p_&hzSq~8l-+`l7^@p16`LKxq_H}-t9bmLk%Gb{G$gD=sA
zQ9s_f@A(xF-vZYJHo(q#{gLOxFyI%z>)_Ba<kw;R!RsNdTMzkF7=QR@HNJP=T)HM+
zXJ>?)0Abxq1qTHm6K-OJaYhP%g>Z9o5%MPrf0gj>M*L~S?+bAatiBdd^gZ7R_3HVG
zUUk67e(3oFxgQvR81NOnLO%9`i*Hp^`r&bXAIo6}>g!ioJyBna$NFv)9_zKG@OKrw
zR`7koW4-niKK7f<GwuiMH<o{(@8B!y+w=4E?fH57_WV439Y0@R@je{&wYZ?K#RYvu
zKI$vtsIQ2lz7`ks75S*I#m!fEyraG%f4g;0lrI1rhWwSY?n#s3BjW!EfBFvj(DOO|
zg^Tlp;?=8+57%wK;TeO)zg;*SjSQa=KgL1f*ThG?Se#PfuMr=M#P}nHzeRl9JH{U?
z{LOY~>o3&V;_-fAoh>fZ+4CD#&o3NbsI%qc%{j}nDz5uD497Z)IM!LjvCbA3>MZiH
z&KAcnHZRs$<YS#JF4S4%Puih9((xyA7Qa4oxKL-y7wT+r<Tp{BEgskT7F_pyHZRsq
z#)*G5%5z)cm&Csk@wXKIi1;r>{7r>FCVo;kg<rNeu`~eqcMZ0`(=&&oiQzTzljB|C
z?-4&eUgY?@#83KMi9aGfK2Wke?<)L#;wOEs@XOY{^G@^W-i5PyH;JFtN9J!6|HUZ(
zTW2zVM11`74V(983V)CI--`JA3V)yYX?^5;%Jz=!pHyFm$NjTOeEf3^%YXGu=8uS<
z)>G#15kIYu%-<(|y1z32F7cD&UCF;}-%;x0@VGuB;-~v7^H+$U<}dSS?M-{W6^?iD
zJiJBxG#?iy&VQTuX+OyPP2#7=m&~6N|C>?&tewgHyTr#oh_Qa#Q}}D`hEgAg$NPJW
z_-Q}Od^|tc_-TD){+#&f{>uD{_{sVIOy;k(ca(e_9`Emz_-Q^eACDIsKe^tW$^6y!
zroGONBfVy44o~xOxS+3iy<VaCX+CoNiumdAuJJdN{2d<WztTEf&`Zo8>u>Qie~0_{
zLtL?=#WRB$#tVxY{-*@m!fGSg%C@lcU|#mQn)%Uh`zEsaaN7nV4o_|FoGSad^Hpdw
z>vrIuR%d3#B*RC8n{FR)`4Mhd9_K)5{w#aJSoyTKn|s7=Yf-po&ZnQ(=GMiFa=>|g
z?hwuE-nBjbyjHs|IL+Jj#HUVMz@~;*oqj&G(%b0xJ4Ay^1EUp1Sessx+I!n&gWrDY
zd`)WW+kIte!hITa&Y9P8I!$;C$kK%S<j%`Ap%|B@$svTJ!Ht~Sgg)2-d^1ZE?$cm|
zoO!t>JVu@-+$VQlt_j7sG)+ua{b+DCr#8KZS!?)amL}ZCY~{?$HQ_PxG~qtE^Kwln
z#-(Y}pV0tEZjOc<jP7h=u`ErVZeFelkCCSd_h~TUoL=CXP>f5{q-OQZ&C%Ar`{FT7
zZ8}qQt?jSz+$VQlt_hEkrwRATotJAuF)mG$!@ho4$lM%}8VnvIPZRFbV6>cm<(lvq
td75ya+<Cbs6ywq~X}$(cc#J$vxKD0vxF$SCo+jKUcV4au#rQfj`9BCz25tZV

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0.pdi b/examples/elementwise_arith/air_project/mul_kernel_0.pdi
deleted file mode 100644
index fe255396317cd13639d937b59f0b9075ebf0e6a4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 7856
zcmeHMPiP!v6n`^2Nv67OH#L?qNpYG42_=V(8zF^&v)u?$BW_KVfQP0+Ed_<xgZ5^`
zfJH<CqNEiu7cZWc9(stENP7?w$<d1;t$Oe_+fxtK_<P@<`QA*D+UTwHgU$Zl@4fGR
z@83*zXMZK4$=X2orT5-^^{4N?y-XC42W|*2ejme&*Jleq_GqO4K0n!P#+8_=k~En2
z{g-v|*!NfXJh}hO`Ln0bzkTN1S?93d!94nM`1q08E~3});&-{W%sK_KlgO86NW1@o
zSN`1kD}aN><jXjL=p<q|ceS<Ejl+DeL=+EZxv^Z?ILc=F1vx6nRFFd<cY8Vm_TAyV
zYr-C&z@rvAN}Ky!0iHs|ZoN>7DIJ042kN406>6<}6eA>Btx)wQyxiPco%Md%H$e1Z
z*Xk_&@?rwd;kAq3cx#nWwO1b!MJEnxG*S)rcmBQn(x=_a&7Zq}uXmlfG`jpa(eme6
znlOwGCt>tOvX5x0zS+C{{yRq|=lHzI<Q5unlL#lgb)o#ar_Rvw;)Mf3Uav=QeA4L+
zaQx8g#rl?CkPpQp#8!6Y&t^8aU;D%P3DwMbh`!T%1W|`8$a0Zrn0G}XnVhjd;cJA9
z{pvi_KA*Y#)%An^e0sIi`smK`xw~B>evp(*e%NE|Bccj><P{yi;hIFz<?CG$4IAwZ
zkLb>po*O$Syocw;&WV-noLJe;iIpj?Wew4DH>zhY^Ou)tBQJ+S&d=M^59jTHoHyNn
zJ3o=>zm@HLSlP~pl__o%=B-|sx4cXnc{vnve%?+zoVN#Z-gN)%{6wbzR<`qDWjh~M
zrg)|>Zzl`$mX~QGFNZ?T&)X{x=k0-<H{E|bKauIbmF;|3+0F;cJRkP6oDW&O@IRS9
z+cD-<@_^E#8LkK%3tSaA5x6PvoWSz}F9^IO@QT1Kf!73H7kE?P)L@-oEq4v^XNY`$
zqar_t<J94*z=^<3f#(FC7kEM7C4pB2ZV9|5@Vda80;dM+{2GP)wCws<7x_6Hrw&&I
zP6TcWJSXtHzzYH|3A`e3OW-wu*9G1bI5k-3H&e(@%Pzl@B0q=Y)ZwbYiNH;P=LDV?
zctPMLfmZ}>3A`roy1<(Prvg`~-{<dI>RJP~d4cCo=azRlQhy+LhRUXinfDA%#I;Zr
zd@T5i;7RaJ%$D<$zXmKHi&?A)dlGyTJ+$_T=xtT-vEVC$C&4#SE9WQQQ_IJ)p2D6C
z-_7?iv+VcH4lmTqI(&L$1ZzdK`E1ZB_gVW`*(c7>px;gh2X?Ygw13qnx#^FugTvhO
z9^A8e@vK)2ws{4R8LR7{pesY>tXRkQbzW?{J-_-M^JDvj-R0Kq`6>VR$Paofou9@Y
zWX?Un=Oe~Czr3z&KRc`ZoNjx5qkGJc?Gv1uU%Th0{NE!#=&^Ku8oT`TwWRaQ>&o`K
zsLIdjw&!<rkNL5EqGFPNcF#}wzej%1W9j@f_8{y2>3fdOFRv@xXFMuDr*rw?;e=Yf
zfx6L=l?`p)lDqZAZXDRb4>Pc0f<0uSUjDOf)whxlq&=04z4x)&RoqatPpp6cSkY9;
zMf=$LFJ0NyKi6uk6LDa1t*Qp|231GMj8z>Ax-w)uYf-wepzDEb?*Wy9t{?IfW?v+9
zJJg@&3u~A9Z`F(S)b_j7Gbz}kU)tW^xnJAtOU8bedKT@`FZCFUXqSH7O%_f4IZ0c8
zg>C&OI(y01cpR9Q)lEXSy-5nXX~?$5Qw3cB+4hEnZio8&d#t~<zyJDc`}?oIw!i=S
z<3`RE``7xbf5K+|6}B}-{e4mYoDTI@&$xLX>Yvl0{=TSxPKWyYOqgpbr<+Ji)Iqes
zbJ5ercOHX7@XrygF;AM;_;!h&@t!=Wc<SRTMeQB-`m()M@YW9rJ^_zBm3<)i5%4%g
z@uA?y!6PEYmjyqTmV6O2=cD(67cn!xrsBL^@<q%(KHw<(*z3#oBIXPu<_sfduDP;D
z%o&fExu*JD!6Rm0#GKhz(o%rOz{);pmv~C_UhpDj=GRQjfsJ{H`Z9aO<}mn!#?OS{
z1Mmk7KQ8zX{C>la3BC+Ii%sxBI>^7Ps5qncAVM?ogU|Xd_$lzNFV21vd{%Q|AA-jt
zc;$aq@YCS4nhQQiYr;=0i)ReHi^uU3;PDfM@;}sed<Y&tX(;}b;HSVpX838rPlI>y
zI6pz!5cxJlJPrIFqy6`{9UsEp#p(Dd@Gc(5PlI>oI(`;>)_37QNM}SmGg&+#cz3Sj
zd%!#Yj<2SJzUX^4!{gwcpTUfu05AHU@nhgc-!r}m{!vq(k+$<Q3;vMdrvyKe)<itD
z%+C;bS7*m#9z;BjZ-RH{Iz9%U&41hRBWXkUX=LX{;GG}Gqc47T-L)OxpAH6PlUKDp
zm|edSc#&_$_keePoP7-5^<DBc;lC#Q)ZBI5c0A%2{u|0({clJy4G5IL7}=L{ka|HC
dYSb$C%nZADj`SY$yOfjl?*>lR&vjY0>mMR!jRXJy

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_elfs.bin
deleted file mode 100644
index 9f902fee90c41225fe0fdf079a639fbbc0bb5606..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2656
zcmeH_KT8}z7>9qex8ClBnCuCWK@!-UBv@SGX=S&kSBMglm_|sG*w~~n79m}V2wH?R
zlCTznjs23sz*PY)ti;mlP{{|dv<Nxh*_m<QbN3S*H?Xkty#HpH#|1FgdI>ndHhs^2
zC*7(5JmFG=RyYpYrpvOR^Him$pJlRup@8{}RfX<6BtDr5HMjx^(877eGYokC7`R+0
zg}ACy<QN_f0wkvbAsr9x0FxKg=%U+R7H5?j@YQQC<7_bkhWvxi`{GZ~NNaUNDQ8FZ
zUwzUYF8_~Te6Pl@PO5*K-utaa{0NAD++LXy<!I!~pHT&vZ=R3EUp_u*y;JjA(Pf#Y
z0yN>n`u(>eouT&|>kruev?*7<l|@aRpW=F{c;ejIkA!uoEW3JVE80`9`_cRe(`Fu>
zzAWz1sr@~=<pwaJs-zEdXR40?dRvp<UZb$1X2!dR51qBEU!`R8I$r(f)g3n~nfp<V
z$(KN&I;tgSj<QFB+&T0lCQS550N35jBkPQ+yXy>R))~&MGn^ruzI`OyFX%K<C!L-d
zd4_^KpRvrxsezAu>!bF9J0*<+oLN^mv#xN4aIV0|W`U1+hEAUOjAcH~4t(reAGHJA
zDQO(w%(}vvb%is8Zwh>TUEpJ$p_6AmW0{Xj10Va=N9_Q2N*V_^v#xMvT~X`??RlZC

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_enable.bin
deleted file mode 100644
index 7cc1818bce4d6ce1226fc5fda519967a4842b99e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 104
zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>))
B1+xGE

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/mul_kernel_0_aie_cdo_init.bin
deleted file mode 100644
index d4549ba181167c6e6d7475f23335e34fea2c3376..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6032
zcmb7{&rak<5XQ@%hINw3u1Uy>EJ*QcI7AT#M?)kINHH2BaX`vM<iIUF0*^qXJ?)8e
zF5wYI$_vaRaLt7vMY;Fvc_s37+iiD!;}I|tf4=&CUGA#tw!nzU<oIWiROFvEk-z_%
zJ!~W*FA@WSPcd@He{sy=EE%{TBKEES5p+(vAYGEKNKZ*$kiH~+Mf#fb4e1%_Iq6%{
z_oQv3yPk}D`c%hUV?tlVeouYUx*%SXu1HTwUy!~eeMS13^bP45={f0J()Xloqq{y$
zdiqqybf1o>Pg)mN>ymUudP@3&^d;#l($}PKNY6;mN#BybCv6+u_35mqPjyW9=@;sg
z)`iu&Bwdl7lD;5)N&1TPHR&7DGtzU?x1{e$Thh7gr0z}C-Y@95Kj1=K_e*vRasT2A
z`KGw%NtC~$_drR!Af6K!;#1rYonOBXO5z3Gubj9LpW<BU{LA+y<^`S0oYo8RDdrKc
zuV_9c@q&0xTpGTS_%ZrfttFZNDYrPwi8P`Px$pNsn)N<^;r&%-Y<~OqH}kU3U)A~j
z{NqLc{b^o@^RRhW-=}a-^G3%#Fo-!FJksMCBIYU#o-Fco^&UO5Ugxjkt^40s*X#2?
zQZGFCgnj0BwN4NVJu!$m9X!(G86u|Z?ecW>;_N?bzRq9a$@;UpUZ4Mwdf~w*)RN!T
zTGw0O+2D~L&k!+PZ<nX5_w6(5b^gllFaEZ=UZ4Mwdf~w*)RN!TIzg<PZ~YDo9_jH6
z5!3Z{dDfCo{WC<~82>Vspqp^>qxyLCZUYYv{=)g)M&yk)HtOi1`aSl`pz$Z~{VHC>
z{wm~8zhkS{>@Pz8!5=I0ho0@PV=wrGo|TPu3xmfX=5+8#k7tP3R^-WgJe!Ecz0Z3*
zJBWQ9<=Kinq2IpGzOohbc{t*JyYq{${(e{F$2@C)+CR@F|Df@&=r{W@PpQw>ig|vl
z-i6*{KG=BId(6k{cs_@S#l1i1@f;%-&#>z8oFW$Y{-no)v!Lz|&O*DNXg+CuAD%p)
z+W++V)c&W>r}jU6KIt>&<BoKDH!vTsX+9~<XUT*4q%@x;59X87e3m?zPwIlvU0w1V
z+JWF-=&NwXHW&VX<5p+jKZ(o(mzsC_!$5vWzIoMh*>-QrcKx@>wrfeNOYm3=@d_NZ
zw|+x>4;)3bJR^PpjtW{nB))G4Db?$G()v+vO7%KEZ|nOoNU7cwf84d}i)35nL%m9)
zUZqj5n{~S$^(u~f-3(ifdKE{#Db=fZZU-ih)>n&xd&hVAz^Pux=TW^Tu6I|q)q2#U
zHTbJ2{~>V${xaeR#53^EBfd|32(IcOZtO_bqxD;h5z6W6t-#g!Bz^*}&xc-r46f!)
z>of4z(euuUpMtA-6E}8D`NwMi``~*2njeCH5<P!+p?L=WRm6WKegghw#7~Kzg6sY3
z{KihGe-qmOL|#PscNUswSg-5T`~+O@U-MINy+6&*!PWVs=No%Q`#)3r&%pKmG~Wc*
z&)2-Pqm<64x~~V|I{!%VLvT8uitmHd`BZ!g{#n%jy@k$y4*o^NPl)f?G3|e>^6!G{
z`D%{yOZ(S+3a<C3c>%8O|3dRUJE8m&wZAQJonLdDk5s*%7Mkzakr_t)DHkL4{@Vhl
z{wcl*uJh~l1-L$+%*XWnG36iY_v1oy)KAZ!`1S4I2o|Ro1qz~3EajN7i78>jEaZQL
C^4EL-

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.elf
deleted file mode 100755
index c7c58092079ae00dffa136a058d3c0d77b6c8d5b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1672
zcma)6O^6$17=9)jldSlYRxE=*FsvY}tl3Eut?6lYNu}5<tOsS$!|eR+?y$)u&P>*=
z9yVZ6LGh-<gW$!Bry$tFo`hEDNs6algzlnv@zCCSkZzwhGn1s^!H0bFywCgdz2DDd
z4lcCbP!t6jawttyozC7Pz#O>_4LK~Lh&&{Fz>(o59Wt1TGRIqg#%>my<-)8;?Yx2!
z(8TTe;w1XwbHMSNa~kd}F>-@Fua+@l2MzokmZ2Z9+tJD4hH`6u0r<FhxPe=j46dMb
zb^m+i=W=yin@p&xt7j5xNzXrCxIbL`^3-ts*HeG26s^yy!>57aSK}=uNl?!iCG~4#
z9=N!2`<dav`_DHwrC-xH<_3-cc4&u>y{C+OB>GDD#EibVqS{{+lm%Ho;|U}_%Kb~9
z)|Mby86NH}-iPXw`Pkn;E$zqE7nDZ<we%f#c?G;8LM<WPGwTiT&XDFG_887d&*8V%
zp3L>`{5UuI^xkm$pW;$3Z_K6pV@;Z`CRZwxLTy~1UJ}%At`$`*%7qix72w_p(DH*_
zFFdccT1~CmsOS|PaN1U|t?hYX<adMAt1a%taW6W*yu9tlJAJ$2bUVu%e%lZBm)q^V
z&MTJhX?D$V^@dxkd6rw#8?V)0cRlBvTX$<!yJpv&YQu9I^{vKB$r@&#T+Re}+&y`U
zYJK8S@GR}ojDCT(HYrjty%RaTyC?Er*qn*4N<N1{-)ShgL>u|!c*-awy^N`TTjGPM
zepfW1aY(%_f^WJ&>OUfHa_$~k;-8Q;4uR;;$(jhFOg$uPA_&nxkTns6=%2`%2txEP
zWK9Gi`ZuyBf)M??82RKME9*HHZA|rFq6brrO6={&kZbIC0gdQ#C$?<zP85fUHDSxy
zH6#B+FXH>sdS|WaTW!yY*Sc=M?V-|IUDeHM%4f-D>q6k_aKgA+aU43f`hjEG-nJi9
z;dDdK)XlT7`&;8jRV*E*8O2r@!;E@~W$sz+zGpVqL^``LJC+}qe&ELt3#Joy!zil~
zziEY`bvg4SVJ~noE=q%A?ao#vLi}9EtMaJJqc*-yLyqnLKw*-M<b-+DwB;L{p`04>
z#)w-_^?DBYAhm_Ec_g2q32smf7?f|e%)u`N@16f~4*A|^xX8e{OpvoBEYP0KJ7C`5
U|B?4{S~EyvyKE>uvw1Rp0HPHLbpQYW

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script
deleted file mode 100644
index fc4f0cf..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ld.script
+++ /dev/null
@@ -1,72 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-/* No tile with memory exists to the south. */
-. = 0x40000;
-. += 0x10000;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-. = 0x60400;
-buf5 = .;
-. += 0x400;
-. = 0x64000;
-buf4 = .;
-. += 0x400;
-. = 0x68000;
-buf3 = .;
-. += 0x400;
-. = 0x70400;
-buf2 = .;
-. += 0x400;
-. = 0x74000;
-buf1 = .;
-. += 0x400;
-. = 0x78000;
-buf0 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_2);
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll
deleted file mode 100644
index 19c8134..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_2() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %17, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %17
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf2, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = getelementptr float, ptr @buf1, i32 %3
-  %9 = load <16 x float>, ptr %8, align 4
-  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
-  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
-  %15 = getelementptr float, ptr @buf0, i32 %3
-  store <16 x float> %14, ptr %15, align 4
-  %16 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-17:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.o
deleted file mode 100644
index 43b8281695feaa671e97fd88c5e79951f78aeaed..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1000
zcmaJ<O>5L(5Psh7b~lwOX~7x<Vd+WNHe}U{7s;+$t(X-Nltm9qb{j>ET}-o3PZjJz
zM5GrbM=#!c>S0g)1=3@W75oAE2dvNJtLYvbGMRa1=ACEWNw!v;J4z{JqM#KJa4-dw
zn7k1eTt*cqwGwJ$pYJ^#yEHX<#yTvi`FT9<s>7K@;C*%6#o>J$nAY|-zpH({QN!;g
z)`%EJYa#KLtn%5)->CC>E_(Q5?pL$werQDJf#^$-)L5`J<EuRbtTvA>Mq6(#wQ)p3
zzAIz}$lwRF&s5Bj;O*dC0UtDt{-=sslo^;$MDn{+6L`MHDf3A=>nUzIc<Ox8kNh^S
zr+l2gqR#N7_KmmPPPmAW-Z>5A=0ffPW&Anr5nFkzgh%vu3fLE4Wq}gzrLN7`Jipw{
zEJdqRpHAQx6IfuKdlV6~T#a4!0<LO_FSn@6PeJ)et(RjLCOH-Vmsrfu+YDi@4@1wq
z`{42NM(7S+!Ev5++Am$-8-$(V+Q|1{ZiGST_G#a~7TX^^zXmh#d^hf34u*l(vwDm<
zt(I68G*ZwIU=j43mG!lgHKil`&s<?Ly+T~p<=msffo!E>MbnkQF4TxCI~BQ2ovf2@
xkV#f|ivd}eohEehuF{wOGLPhDsj~LMal9*nLasrdEpm~%9U>{8rb<8I{s-KramN4v

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll
deleted file mode 100644
index 0eee48f..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.opt.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; ModuleID = 'air_project/mul_kernel_0_core_0_2.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf0 = external local_unnamed_addr global [256 x float]
-@buf1 = external local_unnamed_addr global [256 x float]
-@buf2 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #0
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #0
-
-; Function Attrs: nounwind memory(inaccessiblemem: write)
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
-
-; Function Attrs: noreturn nounwind
-define void @core_0_2() local_unnamed_addr #2 {
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %17, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %15, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf2, i20 %4
-  %6 = load <16 x float>, ptr %5, align 64
-  %7 = getelementptr float, ptr @buf1, i20 %4
-  %8 = load <16 x float>, ptr %7, align 64
-  %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
-  %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8)
-  %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60)
-  %14 = getelementptr float, ptr @buf0, i20 %4
-  store <16 x float> %13, ptr %14, align 64
-  %15 = add nuw nsw i32 %3, 16
-  %16 = icmp ult i32 %3, 240
-  br i1 %16, label %2, label %17, !llvm.loop !1
-
-17:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 53, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind memory(inaccessiblemem: write) }
-attributes #2 = { noreturn nounwind }
-attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
-attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll
deleted file mode 100644
index 7de74b2..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_2.peanohack.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_2() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %17, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %17
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf2, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = getelementptr float, ptr @buf1, i32 %3
-  %9 = load <16 x float>, ptr %8
-  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
-  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
-  %15 = getelementptr float, ptr @buf0, i32 %3
-  store <16 x float> %14, ptr %15
-  %16 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-17:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.elf
deleted file mode 100755
index f1be4eefad7b89482558fe9d6e292826c0748801..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1736
zcma)7O^6$17=9)jldSlYRxE=*FsvYJS+g@u>^42kE~$vk!g^2^J<LvKW_O28CUG)Z
zw|Z#6qJrX0i3h=pmtG8lJ?u$ng`T8%>ZQ<K6fZq=FH%rkpEom;q~gJc{oeO|-k<ON
z{xW;{Lj6@mQIH~o!kAU)96kcfP*<QKgL&kUg=F_RQrx0L22)Ywc;mOoO=HtKH%$^d
zt6&J!aCbI8j=uODaQOO+hWiVQ++@#dMGV<N1ON7m&=1({;CO#sxjj1ve3;)~$L)(I
zS5UaN_pS0{(HPan6RPUUsn}Z3vybQg9;|+TVzBn}iC>rV_9w>RX<+chXiISt)Kg|b
z{nDHTUR%EV%;567&(}7jU(Gz^1`YsrXoioyql|jQ{7U%56u-5sHa^QKbFzNMV@Q0M
z`4>N_EkLqTJltEn57o!>vA>CO(vPdpDUSkb;T!Jq3V20?T0pXA)|=p+A<5tGFr1N|
zgRieYnd#jBerEXbgTdBc`Gri@oJsb_nj~M1uT&<5TD>v3B&c6q&#Rc13ny+Uz=LC;
z?gu+wcwVd5Yno9l=_MU-ns%_I?RsI?ZwHB2o8OM2PWSxc;+7w6_ZlUq-CA7tn|`pj
z*lg~$UbcNtYm^;Vue#;3XS-#+dbV=T^_(TQ;+Bm@xlwV9s^?ZKo7I=%HB3t`XN)Y4
zj-Mi18+#NyOM5uQU!bjxixf=mL{9JMSbhr|Q}Gqa=P>Cz2?ZBvBO4!25xKaRIpJ?g
zyg%V@3nw%Vsc#73OBYD}2h<y!yF)GUkEk^cf$&eMH6cWid_b)UA%tJ0)`Sqkf1uWc
z5W;_=)`Sqkf1%cd5W??>k>$G~WIcysm=pfH@cx9~6P}HK;gWwO+??<~g!d;LrO4Zh
zAlKgV0vg>*t;lXrZ+D|Gw#M3acC4=dzSrgZs=u{b^X;bRM5}GL*Yr@TudL{nk?1qj
zR{cWY>TtryC^-%to4vrX8s3&47;xI5XX#cMjo#)fA}S)vu&i!mhY_r9C$_9zyV>)s
z+Nw}z2Ug4W1IrKm2x7r<qITF#tHf{FVQ61UJ#p9zT#Sm6;E21^l`#=N*KtOKAwm~%
zPQ;Rkv!mBw$|?RYD9n-(pE!@6wtS;g6ca;UoVfLbuVsMu5?fR{kK{8n#!ZqTgYwOn
vIpifs4!;+W{}(Ct7+6XPb-IK(+S7U8G4Id+$UB|X^pn_CHk6*}JgNQ%b><P(

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ld.script
deleted file mode 100644
index 6120a88..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ld.script
+++ /dev/null
@@ -1,78 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-. = 0x40400;
-buf2 = .;
-. += 0x400;
-. = 0x44000;
-buf1 = .;
-. += 0x400;
-. = 0x48000;
-buf0 = .;
-. += 0x400;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-. = 0x60400;
-buf8 = .;
-. += 0x400;
-. = 0x64000;
-buf7 = .;
-. += 0x400;
-. = 0x68000;
-buf6 = .;
-. += 0x400;
-. = 0x70400;
-buf5 = .;
-. += 0x400;
-. = 0x74000;
-buf4 = .;
-. += 0x400;
-. = 0x78000;
-buf3 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_3);
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll
deleted file mode 100644
index 79b2ca7..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_3() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %17, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %17
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf5, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = getelementptr float, ptr @buf4, i32 %3
-  %9 = load <16 x float>, ptr %8, align 4
-  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
-  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
-  %15 = getelementptr float, ptr @buf3, i32 %3
-  store <16 x float> %14, ptr %15, align 4
-  %16 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-17:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.o
deleted file mode 100644
index 4343d406be95fbbfffcd92a0f42bf02db06255c2..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1000
zcmaJ<O=}Zj5Pse!O}0{HTd*#I5PH(sE*o19UStzfEw~Xulz>9W#}>r3!R-d>se(O-
zi1ecD(Tn$<ddR82Kzi)4f<Hk2fc2UE>NW?5%scbU%sbD#v)Ng5?klB`ih`Cyz|j;?
zU~)AoxQ;T;Y9-XfA<qLH`!p5#jdfU3^Ye7vR>w1o!29yJjpK(lFs&VIe^-Zkt%Bc6
ztPwGc#zO2ZS;dQ`zhUe1T=@9M+^>4M`=J(I2Es3SQe(l^jIZ_#uwFm88t%Ng*2D=3
zd6viukiqk3pR0%?#=HKd96qWWy-y{zC^Il0i{$sGCh&ZPQ)ZKL)^pr)@!b8mANg&p
zBz&B{q%QEL_KmOHjk$=Byg3PE=1S%PMf^D(5L;QSfEV=la@Z4JrGWyTB(BZaJipwX
zG)1dZ?@r)*6IfuCdlV6~T#j7!0&Z%tFR!S|pMvs{S}jH{OmZszUm`IBcRPT&ISgF$
z!K0_kTS2$~3Xb!v)qL6Y+<wp+4o02}b1U$JZjbiuYq7o2#%(!P<ye8~yIwcyW%h@@
z+p#)~IgN(Zxs})&ZO2*L9GtBw8R38C29wDXqOvyU9_22iD-|o6tORzhMqJscNQ*jI
yC*L5Iv~HaNS(lw8bn;3}`pZ0$o2AOybJy`M4|2I0eYVI%>h_4Fe3~l#g!><@G;zKF

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll
deleted file mode 100644
index ce97114..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.opt.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; ModuleID = 'air_project/mul_kernel_0_core_0_3.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf3 = external local_unnamed_addr global [256 x float]
-@buf4 = external local_unnamed_addr global [256 x float]
-@buf5 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #0
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #0
-
-; Function Attrs: nounwind memory(inaccessiblemem: write)
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
-
-; Function Attrs: noreturn nounwind
-define void @core_0_3() local_unnamed_addr #2 {
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %17, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %15, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf5, i20 %4
-  %6 = load <16 x float>, ptr %5, align 64
-  %7 = getelementptr float, ptr @buf4, i20 %4
-  %8 = load <16 x float>, ptr %7, align 64
-  %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
-  %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8)
-  %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60)
-  %14 = getelementptr float, ptr @buf3, i20 %4
-  store <16 x float> %13, ptr %14, align 64
-  %15 = add nuw nsw i32 %3, 16
-  %16 = icmp ult i32 %3, 240
-  br i1 %16, label %2, label %17, !llvm.loop !1
-
-17:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 53, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind memory(inaccessiblemem: write) }
-attributes #2 = { noreturn nounwind }
-attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
-attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll
deleted file mode 100644
index c86e34d..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_3.peanohack.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_3() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %17, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %17
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf5, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = getelementptr float, ptr @buf4, i32 %3
-  %9 = load <16 x float>, ptr %8
-  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
-  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
-  %15 = getelementptr float, ptr @buf3, i32 %3
-  store <16 x float> %14, ptr %15
-  %16 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-17:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.elf
deleted file mode 100755
index 2158287344d4e58ea0bb86865df912b73324cb82..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1740
zcma)7O=u%!7=9*=Nh<ziS6l{vV5lH<sp*V~+s&S)B`e~lP!CGc!*nt;ZHFe4IGNP0
z9yZ{jg5phy2f>RM5e%Yx=t<ZW_9V-m_9ASHix&^Oo_bL0^JZp}ta$LD-}k=H`}4i;
z_mk<R^Yxb$ML~)T3S(BG@%smW8OjPYWH66BvXJOLM~WLX$Y3an9IxJq+|)NMbJHZz
zvkHbl4Yz0WWAC$10f(>5Xt=vT&vn+kQpAuIH1Kb~2>pQ74vzQNm0PoOz<c@qb=<mO
zvV+2vy|0uXi^eE7UQkt6PQ}`So_#p?_h9vt6N9y%PyD)^w?8xnj{}2GM_Y=6pq?@d
z>SyLG@bdEQCkB_^db+kD^=jrJH*f&3LNk2mO=VOg=9j}qrudCzwefLInUntMk0JhH
z=3o4zwg8Draer;G-j^S*$NDD9Nj-L-Qyv7=!k66TW$=m!wSZ*L%s0V1LlVE=p*tfr
z2VY!$G}F2J-OTWVqrujn`Gri@oJsb_oFrb2uT&O=TD>;8B&eTX&8wJ~3n#8Az|k>K
z_k$fTJg3#`HO;7&^pXxZO*`1qcD=Ccw}Zs0&2L9hr+aR3am$akdySIQZY{3+O+VOM
zY&LgW&)dGIHOh{wSKV^iv)!^@JySXBdd`wtamz-d+^9H4)pM(r&FXV;57RT-86%5(
z$B&V%jV%hEq&}SD&rsLKNeU)+BByunSbhr|Q~ni+=P+qI2?ZCZBO4!25xKaQIpME~
zzdzw`2q!cS$!`eZOBYD~yObMTyF)4d_bD|Ff$)zgH6cWid_buQA%uTTsR<#3|3Ik;
zA%y=#sR<#3-=x%p5W;^ECChh1NI!?7m=pe+@cxAVE<79m!X^F>;pT)N3GYuhN|Co0
zL9V^!1ys5hTan$M-0ntUtc|7Z>{wm@9k0vxReyc8=G#rriB{Wguj!#wUs=&DBatsq
zTJ`gRtHTK+qvSX=Z1w`jYIs|IV8CgIo~2tAG<ut7Wm=Ny3@ob~*<l2$+le)6*KYPa
ztF|iC*@4xv{lM}AKZ0nmoTwdk(=4%Db{N_hQ%mgj0vDsC#5v;bbY@J%&OTn0Y1CU;
zG{YFpy3Ezldobl3|0fj2$%s##$4_0p(<zFHBCk%&dcxN-z}tx~B3(z~=^EoY$&gO@
x?#mkT8YG6_49NeC6nk_mrGzqVVUGHA+_#MT>p$X7CprDZ_YDh5&2*hq{sTMI5=sC7

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script
deleted file mode 100644
index ddda3c2..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ld.script
+++ /dev/null
@@ -1,78 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-. = 0x40400;
-buf5 = .;
-. += 0x400;
-. = 0x44000;
-buf4 = .;
-. += 0x400;
-. = 0x48000;
-buf3 = .;
-. += 0x400;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-. = 0x60400;
-buf11 = .;
-. += 0x400;
-. = 0x64000;
-buf10 = .;
-. += 0x400;
-. = 0x68000;
-buf9 = .;
-. += 0x400;
-. = 0x70400;
-buf8 = .;
-. += 0x400;
-. = 0x74000;
-buf7 = .;
-. += 0x400;
-. = 0x78000;
-buf6 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_4);
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll
deleted file mode 100644
index 2552e6c..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_4() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %17, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %17
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf8, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = getelementptr float, ptr @buf7, i32 %3
-  %9 = load <16 x float>, ptr %8, align 4
-  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
-  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
-  %15 = getelementptr float, ptr @buf6, i32 %3
-  store <16 x float> %14, ptr %15, align 4
-  %16 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-17:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.o
deleted file mode 100644
index 2fea81b77544b9d8acd307af0a8527ae8aff2af3..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1000
zcmaJ<O=}Zj5Pse!O}0`cEm#*p2t8?RmyH%bF0zTK7Tky+N<blOk`=_Y!R-d>se(O-
zi1ecD(Tn$<ddR82Kzi)4f<Hk2fc2UE>NW?5%scbU%sbD#v)Nj;?<u8_ih`Cyz`+zy
zU~(lYxQsGRY9-XdKF_@hc4?~e8&_aT&Cla;M;*>A0`JS?4h|n!z_hlv`CaW->Q($+
zVvUHdHy2`W$t<2N{SDin=fX!n=6*HG&WCz<9tgkWN%aLw)4y6Xz*^(zVz~9@QVT~U
z<XIvsKnBmBeWoIg7;pRMa`>R3_db=>qRha2ERx@yn!xi_PMJ;0Sx<4x#Z&j=e&n}s
zJ>lc@C3S{3wQqdocFaYD<jqMSGZ!)sDB{oYfY{7p1w5y}lf$0)Dh(9yIB_k;=K1Ar
zrzzSB_09yoJAnn(xJMB&%jL*rFW{;c`|^sK{3$3OskLI{!XT&O|0NP5a5n=O>%+h`
z?mv98yb(D4SFr7;?bb`jbNfMiI2d^@jE%q#oF47l*J6947q{fNDaQ>MzUw(rFQY&7
z-LBbPfnhhBX7^fR>$F{ab$xKMreuWwnJY{tPl(DooO_hJkgim$XtEMmxf*e0ry_0Y
zWSxA2RMNUN24r1UlBkeZV$xsck=!g*)}FhLcX^P@)#<ZECQ`RUB<0go=_lO(0J<@8
A$N&HU

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.opt.ll
deleted file mode 100644
index e15e691..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.opt.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; ModuleID = 'air_project/mul_kernel_0_core_0_4.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf6 = external local_unnamed_addr global [256 x float]
-@buf7 = external local_unnamed_addr global [256 x float]
-@buf8 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #0
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #0
-
-; Function Attrs: nounwind memory(inaccessiblemem: write)
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
-
-; Function Attrs: noreturn nounwind
-define void @core_0_4() local_unnamed_addr #2 {
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %17, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %15, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf8, i20 %4
-  %6 = load <16 x float>, ptr %5, align 64
-  %7 = getelementptr float, ptr @buf7, i20 %4
-  %8 = load <16 x float>, ptr %7, align 64
-  %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
-  %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8)
-  %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60)
-  %14 = getelementptr float, ptr @buf6, i20 %4
-  store <16 x float> %13, ptr %14, align 64
-  %15 = add nuw nsw i32 %3, 16
-  %16 = icmp ult i32 %3, 240
-  br i1 %16, label %2, label %17, !llvm.loop !1
-
-17:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 53, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind memory(inaccessiblemem: write) }
-attributes #2 = { noreturn nounwind }
-attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
-attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll
deleted file mode 100644
index bfe891f..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_4.peanohack.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_4() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %17, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %17
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf8, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = getelementptr float, ptr @buf7, i32 %3
-  %9 = load <16 x float>, ptr %8
-  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
-  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
-  %15 = getelementptr float, ptr @buf6, i32 %3
-  store <16 x float> %14, ptr %15
-  %16 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-17:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.elf
deleted file mode 100755
index 680e4695811b0567bb930a499c2383fd1f5d7bf8..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1676
zcma)6O^6$17=9)jldSlYR$2ysU|2y`S+g@HcAK7NmsE<)!uFyI7G@_iv%AA4lXNm!
zw}NcIf<34=B?^KUZyth&3VRY-uqWxkLvMCVJ$NbYtq1A$c{4LfDjs~8?|a|p{rTSS
zn|J2hKU;rKQ52-epfF|?8lz``8OjPYWH66BvXJOLM~YiC$Y3an92fqI+|)NMbJHZz
zvkHbl4R>bqWA7WU0(ak^(ePk_o|~+Bt%xBjXy9{Dgnr0sherqN%I(=X;IsU}I&Pmg
zxr4&By&sidi^eE7UQkt6PR81To_#*|c(D4_vBBE!$L=lX?W@M%1Tgq|bfh>4>PfSp
zeq+u8=a%ohJlMbVT5Ut>)y%s*z#+g2&G5O8lu?bCUkP8B;<uL7#uqtdPWDfK4Dk;$
z|Ki8B1xQqir)!J#sr-07);Cd3>T&lu<yk;2{J>LQ0b4|<1te!?z6tgWN&G>F?u^tN
ze1HALOy|MRGsDjx4YvNtFJ!XjOmaTvB=Kt8Qdtyg^~R)0P`|yNS1~UQCvGUfqa&d1
z2RmMPMyuCrno%w3B^_{@cCe-GdSTaZ2Z>di-;SbA_srtrmLF~R8YQRQT3q*=ez3RL
zZ0@$+wtY`)lpR;Ey5+KGyJfw4s&d-(oF%v7mW@WaQE`l_=T<A5)i>ikOwZiT7+E|#
zdWmdpY*FwE_2CqMow_zoQZVU>yuF7<@-J*m`Bx;K!=&vv6r87yZ2WqP$l-!;bHYCq
z-k<P`!U>H-^5Y+7;{TNP4X)iu{8uS84uQmfNvR1Tisau>YC;I%KT&Ez2;sj_YC;I%
zzfo#J2;qNFYC;I%e~OajcSFc|?ulYf_}{|&6OK~k?M09_w!DB!_i`(;8<gAKD2%nS
zw4EKR>wn^P`MuOXSgrYX({rNLw%coZDAiY1bjwKOJCs)aY~bo}!pJB&4h@^Vz_A+M
zmLC{!+M#FZ*1Kr*HVtDm>oQkiS>4DEBUs%|tXaEuv*%g0RiVxftd{KumLK>LM1$o-
z?Xa6>iQTfp(7v2nVz(E#7$qgn5qGCEV<L9$<FrgmGMySV)RY(Z|DZ5TMtX1dsmnc^
zqL?W1)reV7_*w?|IMGF<>qtCZW85U^(<%41tRdZ$82<B+J3qxaI+jvGnQmc@`gGhr
W<L>`Q+*?UbKk>C#P->>@r1A+>rU?lE

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script
deleted file mode 100644
index 51c13db..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ld.script
+++ /dev/null
@@ -1,72 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-. = 0x40400;
-buf8 = .;
-. += 0x400;
-. = 0x44000;
-buf7 = .;
-. += 0x400;
-. = 0x48000;
-buf6 = .;
-. += 0x400;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-/* No tile with memory exists to the north. */
-. = 0x60000;
-. += 0x10000;
-. = 0x70400;
-buf11 = .;
-. += 0x400;
-. = 0x74000;
-buf10 = .;
-. += 0x400;
-. = 0x78000;
-buf9 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_5);
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll
deleted file mode 100644
index 4ed7251..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_5() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %17, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %17
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf11, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = getelementptr float, ptr @buf10, i32 %3
-  %9 = load <16 x float>, ptr %8, align 4
-  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
-  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
-  %15 = getelementptr float, ptr @buf9, i32 %3
-  store <16 x float> %14, ptr %15, align 4
-  %16 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-17:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.o
deleted file mode 100644
index e70224c9ad756c9bb1acf0ca65fa018ff13a4066..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1000
zcmZ`%O=}Zj5Pse!O}0{HTd*#I5PH(sF1xlKdXY^`wc<uZP=X#plCCJmhHkf5PZjJz
zM5GsGk6yg@)I(1F1=3@W75oAE2dvNRSGNI&ot=4R=ACEW+1*}t?<l2^ih`Cyz~LBB
zU~(}kxQa4PYbDgc0pI&N_Gl{d40Tvi^YdibQb!ZB!29yBg`@ipFs|)yeOCv%UBT};
z)`%EJeJ1witm667->~_4Dt!23>Q}AY`Cx|^f$&S7)R=KJ<Et|PtkjM#huiB{8aO5)
z-zBmFWa#^o&s4+_;~oD(4j<Nx?x&KPl^K|iMe=)NBY3*PDYHpA>lto2c;<ZEkNgf6
z5<X5}Qs;P5`^H=D#9Ty3?wkZNb1CzHBL18_CAP9y0gveK=I{&gRT?PZUgA28P4mlj
z(-f^vT_3?qBUoUSdlV5fUyfY%0<LSZFSn@5PeJ)etrjB}COH-Vmq^UO+X`T=4Fb=+
z`{42XM$qZKg6lqMHePo6UN2}4RyX?|%#FYgI$heguf=vZpD)4my?!U^VD<*S*S6X>
zVY>CY)xMEfTMBKPsO>JVty&R0T~so}|IIa~k}E_-EzbQWSDvm^tZ1?lIJp{eWv3#y
zsiSrB4N^(#ZZRP1a*~8j-WB@NU*?hABvsa)JC1jGkjvTh*&-9E+a;3najNta?tf@J
Bay|e6

diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll
deleted file mode 100644
index 80307e8..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.opt.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; ModuleID = 'air_project/mul_kernel_0_core_0_5.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf9 = external local_unnamed_addr global [256 x float]
-@buf10 = external local_unnamed_addr global [256 x float]
-@buf11 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #0
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #0
-
-; Function Attrs: nounwind memory(inaccessiblemem: write)
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
-
-; Function Attrs: noreturn nounwind
-define void @core_0_5() local_unnamed_addr #2 {
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %17, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %15, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf11, i20 %4
-  %6 = load <16 x float>, ptr %5, align 64
-  %7 = getelementptr float, ptr @buf10, i20 %4
-  %8 = load <16 x float>, ptr %7, align 64
-  %9 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
-  %10 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %8)
-  %11 = shufflevector <16 x bfloat> %9, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %11, <32 x bfloat> %12, i32 60)
-  %14 = getelementptr float, ptr @buf9, i20 %4
-  store <16 x float> %13, ptr %14, align 64
-  %15 = add nuw nsw i32 %3, 16
-  %16 = icmp ult i32 %3, 240
-  br i1 %16, label %2, label %17, !llvm.loop !1
-
-17:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 53, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind memory(inaccessiblemem: write) }
-attributes #2 = { noreturn nounwind }
-attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
-attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll
deleted file mode 100644
index 5a9b5b8..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_core_0_5.peanohack.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_5() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %17, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %16, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %17
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf11, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = getelementptr float, ptr @buf10, i32 %3
-  %9 = load <16 x float>, ptr %8
-  %10 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %11 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %9)
-  %12 = shufflevector <16 x bfloat> %10, <16 x bfloat> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <16 x bfloat> %11, <16 x bfloat> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %12, <32 x bfloat> %13, i32 60)
-  %15 = getelementptr float, ptr @buf9, i32 %3
-  store <16 x float> %14, ptr %15
-  %16 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-17:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_design.bif b/examples/elementwise_arith/air_project/mul_kernel_0_design.bif
deleted file mode 100644
index 86ba205..0000000
--- a/examples/elementwise_arith/air_project/mul_kernel_0_design.bif
+++ /dev/null
@@ -1,10 +0,0 @@
-all:
-{
-  id_code = 0x14ca8093
-  extended_id_code = 0x01
-  image
-  {
-    name=aie_image, id=0x1c000000
-    { type=cdo file=air_project/mul_kernel_0_aie_cdo_elfs.bin file=air_project/mul_kernel_0_aie_cdo_init.bin file=air_project/mul_kernel_0_aie_cdo_enable.bin }
-  }
-}
diff --git a/examples/elementwise_arith/air_project/mul_kernel_0_mul_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/mul_kernel_0_mul_kernel_0_sequence.bin
deleted file mode 100644
index f2eb383b7eb903e8ad8809b7d44fb0a4d660bfb6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3248
zcmcJQ-Aw~A5QNu<g?K=M2OfApq6JF0GAPG%L<ue-N=1yF_0OCH8j;via_gU++3a_@
zvk>neg4w5;eLtEZyo^(@j}krmd|WcZ&8OQJf_lyu+9e|Yukmso`>#0IXQ!}T>04{-
zdWRz5+iJIiZ@khs7OO7qzIV<49O}M3Vd35Et*zr4+n!k}H6?s5ZHHP2b8iNd9f!uq
z2%ELeWFG=0l9+H#Mw;SV&vYiNQGA=m?@TyH^<^fmv5ZNwd1Or5@6ws1SwdvOIT;Fy
zFEe2cjS1(dzRbimmN6++jgb*5y%`xL$~=|+C!AAeiZ3%^4UGxssJ_g^HI^|MPc*|&
zYb<A{(Vr%DO=9vtUuMD@8WYaRC|J1{n2BpFW3p2})f#K>`xk4J-t?`*yLP<eIY;$n
zCaj?`;T+YMnYhL>CTa8!jZ|yI(P^-T#)NY+)Rg;`nXrb&gmYA1X5t#lnB*HUVGWH5
W=cv75Caj?`;T+YMnYhL+nEU|x-i@&U

diff --git a/examples/elementwise_arith/air_project/npu.asm_air_output.mlir b/examples/elementwise_arith/air_project/npu.asm_air_output.mlir
deleted file mode 100644
index a66ce9e..0000000
--- a/examples/elementwise_arith/air_project/npu.asm_air_output.mlir
+++ /dev/null
@@ -1,300 +0,0 @@
-#loop_annotation = #llvm.loop_annotation<mustProgress = true>
-module {
-  aie.device(npu2) @square_kernel_0 {
-    %shim_noc_tile_0_0 = aie.tile(0, 0)
-    %shim_noc_tile_1_0 = aie.tile(1, 0)
-    %mem_tile_0_1 = aie.tile(0, 1)
-    %mem_tile_1_1 = aie.tile(1, 1)
-    %tile_0_2 = aie.tile(0, 2)
-    %tile_0_3 = aie.tile(0, 3)
-    %tile_0_4 = aie.tile(0, 4)
-    %tile_0_5 = aie.tile(0, 5)
-    %lock_0_1 = aie.lock(%mem_tile_0_1, 1) {init = 4 : i32}
-    %lock_0_1_0 = aie.lock(%mem_tile_0_1, 0) {init = 0 : i32}
-    %lock_1_1 = aie.lock(%mem_tile_1_1, 1) {init = 4 : i32}
-    %lock_1_1_1 = aie.lock(%mem_tile_1_1, 0) {init = 0 : i32}
-    %lock_0_2 = aie.lock(%tile_0_2, 3) {init = 1 : i32}
-    %lock_0_2_2 = aie.lock(%tile_0_2, 2) {init = 0 : i32}
-    %lock_0_2_3 = aie.lock(%tile_0_2, 1) {init = 1 : i32}
-    %lock_0_2_4 = aie.lock(%tile_0_2, 0) {init = 0 : i32}
-    %lock_0_3 = aie.lock(%tile_0_3, 3) {init = 1 : i32}
-    %lock_0_3_5 = aie.lock(%tile_0_3, 2) {init = 0 : i32}
-    %lock_0_3_6 = aie.lock(%tile_0_3, 1) {init = 1 : i32}
-    %lock_0_3_7 = aie.lock(%tile_0_3, 0) {init = 0 : i32}
-    %lock_0_4 = aie.lock(%tile_0_4, 3) {init = 1 : i32}
-    %lock_0_4_8 = aie.lock(%tile_0_4, 2) {init = 0 : i32}
-    %lock_0_4_9 = aie.lock(%tile_0_4, 1) {init = 1 : i32}
-    %lock_0_4_10 = aie.lock(%tile_0_4, 0) {init = 0 : i32}
-    %lock_0_5 = aie.lock(%tile_0_5, 3) {init = 1 : i32}
-    %lock_0_5_11 = aie.lock(%tile_0_5, 2) {init = 0 : i32}
-    %lock_0_5_12 = aie.lock(%tile_0_5, 1) {init = 1 : i32}
-    %lock_0_5_13 = aie.lock(%tile_0_5, 0) {init = 0 : i32}
-    %buf9 = aie.buffer(%mem_tile_0_1) {sym_name = "buf9"} : memref<1024xi16, 1 : i32> 
-    %buf8 = aie.buffer(%mem_tile_1_1) {sym_name = "buf8"} : memref<1024xi16, 1> 
-    %buf7 = aie.buffer(%tile_0_5) {sym_name = "buf7"} : memref<256xi16, 2> 
-    %buf6 = aie.buffer(%tile_0_5) {sym_name = "buf6"} : memref<256xi16, 2> 
-    %buf5 = aie.buffer(%tile_0_4) {sym_name = "buf5"} : memref<256xi16, 2> 
-    %buf4 = aie.buffer(%tile_0_4) {sym_name = "buf4"} : memref<256xi16, 2> 
-    %buf3 = aie.buffer(%tile_0_3) {sym_name = "buf3"} : memref<256xi16, 2> 
-    %buf2 = aie.buffer(%tile_0_3) {sym_name = "buf2"} : memref<256xi16, 2> 
-    %buf1 = aie.buffer(%tile_0_2) {sym_name = "buf1"} : memref<256xi16, 2> 
-    %buf0 = aie.buffer(%tile_0_2) {sym_name = "buf0"} : memref<256xi16, 2> 
-    %mem_0_5 = aie.mem(%tile_0_5) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_5_13, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf6 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_12, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_5, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf7 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_5_11, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_5 = aie.core(%tile_0_5) {
-      %0 = ub.poison : i16
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_5_12, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_5_11, AcquireGreaterEqual, 1)
-      scf.for %arg0 = %c0 to %c256 step %c32 {
-        %subview = memref.subview %buf7[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_14 = memref.subview %buf6[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %2 = arith.muli %1, %1 : vector<32xi16>
-        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-      } {loop_annotation = #loop_annotation}
-      aie.use_lock(%lock_0_5, Release, 1)
-      aie.use_lock(%lock_0_5_13, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_4 = aie.mem(%tile_0_4) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_4_10, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf4 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_9, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_4, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf5 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_4_8, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_4 = aie.core(%tile_0_4) {
-      %0 = ub.poison : i16
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_4_9, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_4_8, AcquireGreaterEqual, 1)
-      scf.for %arg0 = %c0 to %c256 step %c32 {
-        %subview = memref.subview %buf5[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_14 = memref.subview %buf4[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %2 = arith.muli %1, %1 : vector<32xi16>
-        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-      } {loop_annotation = #loop_annotation}
-      aie.use_lock(%lock_0_4, Release, 1)
-      aie.use_lock(%lock_0_4_10, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_3 = aie.mem(%tile_0_3) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_3_7, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf2 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_6, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_3, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf3 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_3_5, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_3 = aie.core(%tile_0_3) {
-      %0 = ub.poison : i16
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_3_6, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_3_5, AcquireGreaterEqual, 1)
-      scf.for %arg0 = %c0 to %c256 step %c32 {
-        %subview = memref.subview %buf3[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_14 = memref.subview %buf2[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %2 = arith.muli %1, %1 : vector<32xi16>
-        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-      } {loop_annotation = #loop_annotation}
-      aie.use_lock(%lock_0_3, Release, 1)
-      aie.use_lock(%lock_0_3_7, Release, 1)
-      cf.br ^bb1
-    }
-    %mem_0_2 = aie.mem(%tile_0_2) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_2_4, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf0 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_3, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb3
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb2)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_2, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf1 : memref<256xi16, 2>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_2_2, Release, 1)
-      aie.next_bd ^bb4
-    }
-    %core_0_2 = aie.core(%tile_0_2) {
-      %0 = ub.poison : i16
-      %c256 = arith.constant 256 : index
-      %c32 = arith.constant 32 : index
-      %c0 = arith.constant 0 : index
-      cf.br ^bb1
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_2_3, AcquireGreaterEqual, 1)
-      aie.use_lock(%lock_0_2_2, AcquireGreaterEqual, 1)
-      scf.for %arg0 = %c0 to %c256 step %c32 {
-        %subview = memref.subview %buf1[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %subview_14 = memref.subview %buf0[%arg0] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-        %1 = vector.transfer_read %subview[%c0], %0 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-        %2 = arith.muli %1, %1 : vector<32xi16>
-        vector.transfer_write %2, %subview_14[%c0] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-      } {loop_annotation = #loop_annotation}
-      aie.use_lock(%lock_0_2, Release, 1)
-      aie.use_lock(%lock_0_2_4, Release, 1)
-      cf.br ^bb1
-    }
-    aie.flow(%shim_noc_tile_0_0, DMA : 0, %mem_tile_0_1, DMA : 0)
-    aie.flow(%mem_tile_1_1, DMA : 0, %shim_noc_tile_1_0, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 1, %tile_0_3, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 2, %tile_0_4, DMA : 0)
-    aie.flow(%mem_tile_0_1, DMA : 3, %tile_0_5, DMA : 0)
-    aie.flow(%tile_0_2, DMA : 0, %mem_tile_1_1, DMA : 0)
-    aie.flow(%tile_0_3, DMA : 0, %mem_tile_1_1, DMA : 1)
-    aie.flow(%tile_0_4, DMA : 0, %mem_tile_1_1, DMA : 2)
-    aie.flow(%tile_0_5, DMA : 0, %mem_tile_1_1, DMA : 3)
-    %memtile_dma_1_1 = aie.memtile_dma(%mem_tile_1_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_1_1_1, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 1024) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1, Release, 4)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(S2MM, 0, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(S2MM, 1, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 256, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(S2MM, 2, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 512, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 3, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_1_1, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf8 : memref<1024xi16, 1>, 768, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_1_1_1, Release, 1)
-      aie.next_bd ^bb10
-    }
-    %memtile_dma_0_1 = aie.memtile_dma(%mem_tile_0_1) {
-      %0 = aie.dma_start(MM2S, 0, ^bb1, ^bb3)
-    ^bb1:  // 2 preds: ^bb0, ^bb1
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb1
-    ^bb2:  // pred: ^bb9
-      aie.end
-    ^bb3:  // pred: ^bb0
-      %1 = aie.dma_start(MM2S, 1, ^bb4, ^bb5)
-    ^bb4:  // 2 preds: ^bb3, ^bb4
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 256, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb4
-    ^bb5:  // pred: ^bb3
-      %2 = aie.dma_start(MM2S, 2, ^bb6, ^bb7)
-    ^bb6:  // 2 preds: ^bb5, ^bb6
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 512, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb6
-    ^bb7:  // pred: ^bb5
-      %3 = aie.dma_start(MM2S, 3, ^bb8, ^bb9)
-    ^bb8:  // 2 preds: ^bb7, ^bb8
-      aie.use_lock(%lock_0_1_0, AcquireGreaterEqual, 1)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 768, 256) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1, Release, 1)
-      aie.next_bd ^bb8
-    ^bb9:  // pred: ^bb7
-      %4 = aie.dma_start(S2MM, 0, ^bb10, ^bb2)
-    ^bb10:  // 2 preds: ^bb9, ^bb10
-      aie.use_lock(%lock_0_1, AcquireGreaterEqual, 4)
-      aie.dma_bd(%buf9 : memref<1024xi16, 1 : i32>, 0, 1024) {task_id = 0 : i32}
-      aie.use_lock(%lock_0_1_0, Release, 4)
-      aie.next_bd ^bb10
-    }
-    aie.shim_dma_allocation @air_channel_3(%shim_noc_tile_1_0, S2MM, 0)
-    aie.shim_dma_allocation @air_channel_0(%shim_noc_tile_0_0, MM2S, 0)
-    aie.runtime_sequence @square_kernel_0_sequence(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
-      %0 = aiex.dma_configure_task_for @air_channel_0 {
-        aie.dma_bd(%arg0 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
-        aie.end
-      }
-      aiex.dma_start_task(%0)
-      %1 = aiex.dma_configure_task_for @air_channel_3 {
-        aie.dma_bd(%arg1 : memref<*xi16>, 0, 1024, [<size = 2, stride = 512>, <size = 512, stride = 1>])
-        aie.end
-      } {issue_token = true}
-      aiex.dma_start_task(%1)
-      aiex.dma_free_task(%0)
-      aiex.dma_await_task(%1)
-    }
-  } {dlti.dl_spec = #dlti.dl_spec<index = 32 : i64>}
-  aie.device(npu2) {
-    aie.runtime_sequence @square_kernel(%arg0: memref<*xi16>, %arg1: memref<*xi16>, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
-      aiex.configure @square_kernel_0 {
-        aiex.run @square_kernel_0_sequence(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7) : (memref<*xi16>, memref<*xi16>, i32, i32, i32, i32, i32, i32)
-      }
-    }
-  }
-}
diff --git a/examples/elementwise_arith/air_project/placed.asm_air_output.mlir b/examples/elementwise_arith/air_project/placed.asm_air_output.mlir
deleted file mode 100644
index aa82d2e..0000000
--- a/examples/elementwise_arith/air_project/placed.asm_air_output.mlir
+++ /dev/null
@@ -1,86 +0,0 @@
-module {
-  air.channel @channel_0 []
-  air.channel @channel_1 [4, 1]
-  air.channel @channel_2 [4, 1]
-  air.channel @channel_3 []
-  func.func @square_kernel(%arg0: memref<*xi16> {tt.divisibility = 16 : i32}, %arg1: memref<*xi16> {tt.divisibility = 16 : i32}, %arg2: i32, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32) {
-    %c1 = arith.constant 1 : index
-    %0 = air.launch async (%arg8, %arg9, %arg10) in (%arg11=%c1, %arg12=%c1, %arg13=%c1) args(%arg14=%arg0, %arg15=%arg1) : memref<*xi16>, memref<*xi16> attributes {id = 1 : i32} {
-      %c1024 = arith.constant 1024 : index
-      %c1_0 = arith.constant 1 : index
-      %1 = arith.muli %arg8, %c1024 : index
-      %2 = air.channel.put async  @channel_0[] (%arg14[%1] [%c1024] [%c1_0]) {id = 1 : i32} : (memref<*xi16>)
-      %3 = air.channel.get async  @channel_3[] (%arg15[%1] [%c1024] [%c1_0]) {id = 2 : i32} : (memref<*xi16>)
-      %4 = air.segment @square_kernel_0 async  attributes {id = 2 : i32, x_loc = 0 : i64, x_size = 8 : i64, y_loc = 2 : i64, y_size = 6 : i64} {
-        %c4 = arith.constant 4 : index
-        %c768 = arith.constant 768 : index
-        %c3 = arith.constant 3 : index
-        %c512 = arith.constant 512 : index
-        %c2 = arith.constant 2 : index
-        %c256 = arith.constant 256 : index
-        %c0 = arith.constant 0 : index
-        %c1_1 = arith.constant 1 : index
-        %async_token, %results = air.execute -> (memref<1024xi16, 1 : i32>) {
-          %alloc = memref.alloc() : memref<1024xi16, 1 : i32>
-          air.execute_terminator %alloc : memref<1024xi16, 1 : i32>
-        }
-        %5 = air.channel.get async [%async_token]  @channel_0[] (%results[] [] []) {id = 3 : i32} : (memref<1024xi16, 1 : i32>)
-        %async_token_2, %results_3 = air.execute -> (memref<1024xi16, 1>) {
-          %alloc = memref.alloc() : memref<1024xi16, 1>
-          air.execute_terminator %alloc : memref<1024xi16, 1>
-        }
-        %6 = air.channel.put async [%5]  @channel_1[%c0, %c0] (%results[%c0] [%c256] [%c1_1]) {id = 4 : i32} : (memref<1024xi16, 1 : i32>)
-        %7 = air.channel.put async [%5]  @channel_1[%c1_1, %c0] (%results[%c256] [%c256] [%c1_1]) {id = 5 : i32} : (memref<1024xi16, 1 : i32>)
-        %8 = air.channel.put async [%5]  @channel_1[%c2, %c0] (%results[%c512] [%c256] [%c1_1]) {id = 6 : i32} : (memref<1024xi16, 1 : i32>)
-        %9 = air.channel.put async [%5]  @channel_1[%c3, %c0] (%results[%c768] [%c256] [%c1_1]) {id = 7 : i32} : (memref<1024xi16, 1 : i32>)
-        %10 = air.channel.get async [%async_token_2]  @channel_2[%c0, %c0] (%results_3[%c0] [%c256] [%c1_1]) {id = 8 : i32} : (memref<1024xi16, 1>)
-        %11 = air.channel.get async [%async_token_2]  @channel_2[%c1_1, %c0] (%results_3[%c256] [%c256] [%c1_1]) {id = 9 : i32} : (memref<1024xi16, 1>)
-        %12 = air.channel.get async [%async_token_2]  @channel_2[%c2, %c0] (%results_3[%c512] [%c256] [%c1_1]) {id = 10 : i32} : (memref<1024xi16, 1>)
-        %13 = air.channel.get async [%async_token_2]  @channel_2[%c3, %c0] (%results_3[%c768] [%c256] [%c1_1]) {id = 11 : i32} : (memref<1024xi16, 1>)
-        %14 = air.herd @herd_0 async [%5, %async_token_2]  tile (%arg16, %arg17) in (%arg18=%c1_1, %arg19=%c4) attributes {id = 3 : i32, x_loc = 0 : i64, y_loc = 2 : i64} {
-          %c32 = arith.constant 32 : index
-          %c256_5 = arith.constant 256 : index
-          %c0_6 = arith.constant 0 : index
-          %16 = ub.poison : i16
-          %async_token_7, %results_8 = air.execute -> (memref<256xi16, 2>) {
-            %alloc = memref.alloc() : memref<256xi16, 2>
-            air.execute_terminator %alloc : memref<256xi16, 2>
-          }
-          %17 = air.channel.get async [%async_token_7]  @channel_1[%arg17, %c0_6] (%results_8[] [] []) {id = 12 : i32} : (memref<256xi16, 2>)
-          %async_token_9, %results_10 = air.execute -> (memref<256xi16, 2>) {
-            %alloc = memref.alloc() : memref<256xi16, 2>
-            air.execute_terminator %alloc : memref<256xi16, 2>
-          }
-          %18 = air.wait_all async [%17, %async_token_9] 
-          %19 = scf.for %arg20 = %c0_6 to %c256_5 step %c32 iter_args(%arg21 = %18) -> (!air.async.token) {
-            %subview = memref.subview %results_8[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-            %subview_13 = memref.subview %results_10[%arg20] [32] [1] : memref<256xi16, 2> to memref<32xi16, strided<[1], offset: ?>, 2>
-            %async_token_14, %results_15 = air.execute [%arg21] -> (vector<32xi16>) {
-              %23 = vector.transfer_read %subview[%c0_6], %16 {in_bounds = [true]} : memref<32xi16, strided<[1], offset: ?>, 2>, vector<32xi16>
-              air.execute_terminator %23 : vector<32xi16>
-            }
-            %21 = arith.muli %results_15, %results_15 : vector<32xi16>
-            %async_token_16 = air.execute [%arg21] {
-              vector.transfer_write %21, %subview_13[%c0_6] {in_bounds = [true]} : vector<32xi16>, memref<32xi16, strided<[1], offset: ?>, 2>
-            }
-            %22 = air.wait_all async [%async_token_14, %async_token_16] 
-            scf.yield %22 : !air.async.token
-          }
-          %20 = air.channel.put async [%async_token_9]  @channel_2[%arg17, %c0_6] (%results_10[] [] []) {id = 13 : i32} : (memref<256xi16, 2>)
-          %async_token_11 = air.execute [%17] {
-            memref.dealloc %results_8 : memref<256xi16, 2>
-          }
-          %async_token_12 = air.execute [%20] {
-            memref.dealloc %results_10 : memref<256xi16, 2>
-          }
-        }
-        %15 = air.channel.put async [%14]  @channel_3[] (%results_3[] [] []) {id = 14 : i32} : (memref<1024xi16, 1>)
-        %async_token_4 = air.execute [%15] {
-          memref.dealloc %results_3 : memref<1024xi16, 1>
-        }
-        air.wait_all [%6, %7, %8, %9, %10, %11, %12, %13, %async_token_4]  {air.segment_end}
-      }
-    }
-    return
-  }
-}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0.pdi b/examples/elementwise_arith/air_project/square_kernel_0.pdi
deleted file mode 100644
index 1a6b4e2869f47c37579486ca1fbb299d49a2e6c4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6272
zcmeHLO=w(I6h3d}C7Eh$-^f^A+Z11tBB5mQ(uvSQz&FkmqDGt=g@TKwLM;WA*p1y8
zG1MX=0XNf%kc}HxgF+W|GtzF9BD3tqFs-iTrMT##I-YZX-u>RQSyWerUYMEh`|i2t
zo_o%jzxyi@wdzyVOCP@T=FK0zy-ehj$J!x&$m={mWPG{yb4?@t{8Fpkj)I6{6b+Vj
z|7Bb_{Nokb57(bQcjnZ&_fDTZ;~Y*nm`7jD96NY(gy<)J$SaPa8E>#SN>su<(C+`>
z<9`hQ^4USN^&0O$bc18~^ZV|w%A1wEGEp?0olSVKH7REL!8|N*D)0*Ms;5sNzDK<G
zP1sZ9d$h?FrNOQ-qY*vF8N2gxIihryZW9G`r03CFUc>>>)quKvj{)5tUOVaC+BHS=
z$=J1%bnBIbJy)(?_{Qr6b80QAF&nRskN(|x^^0n!{Y&-t#@OjgbDd}OoP7@WCKaMZ
zZ?LcP(FX@x$!(6Z72V;)^ZxAd-SgwJzmA9b`uW|!`;GAJ&o9!Xnb%HbzFU&@j`R9!
znlZA&?JY{tKCG<lCF)$HT|{Ap_O27n*p<dUk@tYgkN96@wIq>rzP`Svw3>cb?tXf&
zbN2q&Y{^f`<~)>G`%v<0MtJOoD*}a=uaAW^V`gt~M(z!DU)Vm7WzrY64=lHRV7cuB
z%PFcC`f9GwS9wlbd0qjY@24Y=`-vXKe$w@|{Slh_T5juaxvjtD6g3O|)F|{*p3_#I
zSAggH>BYzW^hoxTuCMKn(A3v*TYt-K{Vk_xxzJB13jLJlw3X)-;Q4-f{c%4%lKrIX
zYx^TK^|jpA-*Q`j;ZlG5TMB<W$!{y+GRhvKbTY#M;0SOGI04)SyaIR?@EYI^z?*=(
zfO~-ZfCqq6gZ269x%Y^C$LG&C2j_D*N*#^?CxF|4R{*a9UIV-VcoT3Ja1U@F@BnaX
zus&b2a6Zjl{TgsShojWt7;pl(4R{6cD&RH18-O<fcLDbR_W=(8rv~fuEf>zGxjWwp
zIG@8&>TnD=0o(?>0(ceh8sH7Un}EB3dw~0Z2Y^$+0ZquSWc^-aY|q7d33S!6@_vlD
zw>@JYLC*)E6X-GbinDh;9e_^ITQTR}*}EPIKqshG%x7`-F6RO0WX@Za&l597JpN-N
zqt9zRl>*Jp^WX)4^g;HKvQKihLVi2t@0R@3_%l$3^iE)~jn8K;SoIamRatcLG#)Y@
zu}@^n+;H*mJP#d@#vXHt)#vYj!8#r{myd^Q_E7l|`-Hvbhl_{jdFXgFcJb(5(eb#s
zd_0p68IRZ}YEOJ}@$ftk9goHybDbaE>pC7cr{j4-ts~mci4<tLk<CkTzp;+?k=V-y
zv_EfXSYw!cjP_JE_TFb|RdG?#KC%8whl{3)7wseKzkFp!|B{ROQI3O$<RUg$7N|Ti
z7p(GBm>Xx#|E1Mjr7%}xE>2bM3v(09_nEm6=InjRh-dT7c~p^G&Wm6q!$vWm+J1+8
z7VWuS+P?JQdi~43Z0vW)XVISPrLHN6cBt3=WZmRnoyz22VVnPj(fwp|e3Y>aJJ(`v
zYm*e_mYCZdFBax}=C(Fem|IB8#8+Z#5?5`T@52nPuzsAVCpu|e<GW>g-g|bB(&>P_
zQ`FiS??ARTX5IQB&=b}<er4~2KFd1qqVx*rM_K2Hls*o6kd{k)Wux?DyDUf3Ygvbw
zMenIN@0Lq8=0kKKv**|xW_`Eua};!+^=vQDE37|d?B_urXMK;+7eM#ZX}RN6afaK|
zS$-1MvmAoH$hxbGvv0A^-*(DR1olg;XSoIaB<oo`p!;bZ{M55}=2>^~IDLV2zMfP2
z9@=*L66<bXr`K3_`#RlEn{cis?As*1m{a~Iww=Dj_Rhc4Ypgr}PWRJg@V}h-ud(j@
zI6X|KOHZ18XSb)beu!Al>JEC1^{0*fBIpyWXZ-_unAX969sJk9zth9C3HD88uRe=Y
pj((hv!{1NB{nRVPOgYBf|1Y+ZbL!7;a|h|TzC&_c-vJ26@eg<U>b?K~

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_elfs.bin
deleted file mode 100644
index 29b57b909b220d0abba36bf749886d055504d85a..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2528
zcmeH_ze~eF6vw}pR+B=}cCehF5GzQB4z^Bisa6n0{IwKZ)Wt=l*2T@iq7LF52f@X?
zI4E=~*jYL{6)XM$jt*kIYwpOy<R2*XK}z1|?%loSeWCysi|c>^oQP-Xxz<fGz+_Re
zBEp`ANPD4KqCF+I_a4o72tx?72um{W<RNyXZb&1_a1Rvm9y1jKW+s4-m57b6EIL2}
z`3A!dvqf`wN}$$a;su><y~N&PY2e(fm+-dg07KT(=>uyda=lvgB0F1+5C6JrH%WKn
zHThXE_pWm8n75{XdCIYXW%fSqo*hpXosMX<XnzcgTL5w5VI?l__xcfewK78TR)Ozd
zUtmhDdxy&Pnj!1`T-4XSOuXsv@0R5Mes1li0QUkhfLlmafsC$H`yD{cPW4kS3u?$t
z+`E@iqx|(4sa<}%2R~-kh&mDVf8gl-Tz+2_e7@-q0sQvG<jAPiruZV?UEdS!ge?8<
z1ZCO@%Cr-d!JZ58E*Iimkf9x97Gb_``GIffSKquDQ)jrEF=g5X%CrlV!CnmUtq|f{
zkf9x97Gb_`3j^Q!);Dj))ETa3Oqq6pGVKCou(v~e+YIq7$j}Zli!k4}t$}ZS>zg-Y
Q>I_#ircAp)nRY?4KU-3TT>t<8

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_enable.bin
deleted file mode 100644
index 7cc1818bce4d6ce1226fc5fda519967a4842b99e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 104
zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>))
B1+xGE

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/square_kernel_0_aie_cdo_init.bin
deleted file mode 100644
index ace360fc11f90c98660ab2cdf87c15ccc1146121..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 4300
zcmb7`y>1gh6ov2pBp3*=K`u;UiAAPBC_;-Iq!f`D%Rx$EiJ&xe<Pq`+MxvrhO_fJ*
zL+?lM6GWDHfpO|c;mq#NdhW@l$V!Rz-E(H<&Y4-usfdh^zKX;mA5xLue`mMbk;tP6
z;GTTor1FnzPVPjB`w+2j$`8*OaZX$i8{!k<GvagNDe(pICGm`SPP`yq65E>BnG9QU
zs%kD6pNrU!l#}M1^n%zBpAerBpA%1sFNiORXT)>j1@V&D*1XPX+>%pO(>aYOC(XIl
zTo4=L6XG-CbK)uS1@R^EjCfAGAYKyNn%6m<w&YaRbWYzWC(XIlTo4=L6XG-CbK)uS
z1@R^EjCfAGAYKw%;!OJDzNwn`3moPHJtp0Koi+Xf?{O4xPVad}x{zMre(Ul2ea=W1
zdOr%ByB@Fabw;|-y(y4~9<R?kBVFqJwxcoz7lmv@o$^x_7#o#wR$TvIeoi;r{CVj&
z%Io0YkMy}ekM32~kH6Ph&0)_eYL2}}0?#&tb$_D=_UHV@wWJ>1A3S*c+Sygb5o&%9
zDQb?rM*@%T4`=_N{G8ulX8q{?;KAe9&aNuf{Z)6+dnEAa{@y&WKj$~)`r$v_A3S*c
z+Sygb5o+uFs&~wLB=A)IJt^l9*@>&$=eaJ|ALeHCEA;2i?-r6BZLDc@Ykr1)Nos%e
zyBx)X*l+y&@q4zG%>LZZpZvLZexJ$yCVIi+Gbw6TxuNFRd!)s)g<3E0bXq)J)ZAiz
zo~*^wN9|SM=>?vR$e#ytE_db`1A@ogjyUhie^1`*$9Y!%xcxlu_><ayPu}duc}jJ^
z?m5pJ^D8)?a%Vm>oKMN&`5dAaW`EG)IYKQwL(}4!pcZC-+~P@53(pXD;ch=Rkqwcl
z-$Qp*{x)y81O1E0-0M>5Yran8Q}p^pqsy+~q?_^YqFuLU!v*w^g>(ZQdvE+H>3h(z
zr$+CPegGXiX!I@8Gn>R&pqo|VB5M5DZ%q4j`n=iSbrOgBeJi_aJocj*`txA^1JYCI
zsvpuj(4Ph4_etM^{vyy1Nl)!SHUqu48mRmYbah^&A4AvoLytd#j$Z;zemRYwKv(BU
z`giE6f6`Msr1^)c|9$AXf2|)ve;oApcBS<RblsoUyU=xiT2Jkm@)=WqV|f&;-(P8c
zg7JENt#_g8^|hYbQ(FI2t>1;N=hu4A4&tXle|xKey1zMebw5e(LVp>IKPJ5oUEP1u
pdv-|c4{7}&t*`Z-9n<)6Io^K<i^cA@5D9+J`&w$FxIhW}lK+|*O)3BY

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.elf
deleted file mode 100755
index ddd5bbc63d792e8ccbdd7d5efb1a64507a675d14..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1600
zcma)6PiWg_6#vQbVh2j+ZY+4LgRdQoY-ElUn|1D`YH-U~9q0}lfihHCQW6nc&LlZY
zcWK8$wnOQq7xz%;rO>0<p@*CT?RBHaTn0_p?(MCI((Lyv$u?t${ov`n-~0D{`kwUN
zywtp`X&N%*P#m!uopWyiQ`9vm<S>T<@{sIXlnetpM3{;a<?Fv=H;YZ`=}D5>c?}PM
z2A<CpM$wN?14my?DZH4+p|o7w*Kkc!fWqSv2G2AaI6U54(GF(L0zVh_R&a2|<PM6D
zw;yWzC1Y5NA?Ed+6Vq?{%Xd!oul{lBuZsoySEK)463d?DS+fXysXc$cfAjkf8|ES3
zrlF3wVXn`P?eK)GpX`TxCp<a9pIt1sZtvnu8c&)^e2|m+F=zc=!f0g}uPvT`=a-Au
zXMz4MW`J@LvpXC~R+;9T;7o?+BmFn&&B;#s_n*9-+kElc)Zmv_{k7MH`CQ(dO5X=&
zmS0XrHf&k0KOGMN<$F&GWz5MyiKje~SI0op4>r8;ylOTZ%BWZLiViq!J6Ka&UKsh^
zAoZ%b^*G*)&Mz#i`SE(MRdKqVg%!W;2iptn_EzUJ+xJwf>bQE{tyVqTt?Kp9YhSpY
zbIz@~RijmH)f}Vlx%Jv={gY%5lTU7Eggjmzze{#?<k9c}?FSS5Bif}=k%sa4%jvy5
zmT%phh%b>V5&GVNhAXs@PmZUA>0~W)%)gO1p(v97oe=&l0?FT^HrY$YBk`Z86-6L?
zms$xSO5}H`l@LPsLuw_25I&$*LI~me)QTby{yVi2LI^()BcJ?%vY$VNn`4ej>}|)8
zF?GCvMs%YS+b!z#C=L^Aq-|%ziu@nFh~Go=+vSFDw>>9b?z+9Uhe~s4Nw<ttFH&30
zOM$Dy31g$;ICQM`0>^53YkpwB>4u)ATZ?G*Rt<URu&gMy!x&bynON4A-R^l-V_B%P
z0jp#Cf#nB&46$H2aW{;zD)C!(7}_^7PZIV57sH}7IOOhRWkSTyeN^SKI2@WOhxPxU
zF-;^n+2o#-yD>v4HRQ_>x1Mr62mFxQqOv}c&(H{uNUk#|cdhi`pU?U7ZlK6LouSCU
ixr|U}TR2O5GVcQOj{YO>)3oMR8au-UWzDQlrvCwN-uX%Z

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script
deleted file mode 100644
index 13a60c2..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ld.script
+++ /dev/null
@@ -1,66 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x74400, LENGTH = 0xBC00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-/* No tile with memory exists to the south. */
-. = 0x40000;
-. += 0x10000;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-. = 0x60400;
-buf3 = .;
-. += 0x400;
-. = 0x64000;
-buf2 = .;
-. += 0x400;
-. = 0x70400;
-buf1 = .;
-. += 0x400;
-. = 0x74000;
-buf0 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_2);
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll
deleted file mode 100644
index d193819..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.ll
+++ /dev/null
@@ -1,84 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [1024 x float]
-@buf9 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_2() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %13, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %13
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf1, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
-  %11 = getelementptr float, ptr @buf0, i32 %3
-  store <16 x float> %10, ptr %11, align 4
-  %12 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-13:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.o
deleted file mode 100644
index 57437bb20b770876576b8753e9fb67450fce9b78..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 932
zcmZ`$O-mb56g|(cj1VLWO;czgx~if>M%)Neq9kphlj6oAbrC0VT1r#Xjw9G^5@?Vv
zLf7*Lx_9LwtNw!0rOOEZ0Q(2D=f0V7hJY9DyZ5|v-hKDo*Qwgil~RaJ!AK#XKL(7G
zT+@Q5$RN?m)+LZbiOHTS0!5tXa7M)Z@^-MRE~XcN@0r0aE?$*^N#p$Fm-?Nvinv~;
zOl-N-%TC<{wV$)W_Mh3S&5ZNI3Lb=9ycY6h19+*1kAlvZClw6Y1lK8jKP$GptlZC<
z?1#AL&7|<(%|fGpius5Siwb`>Ci(>P{2)Yk-2utnLXGMR#(o|MPBBfTfqXAKfA9J*
z#Z2Je?OS3QUuff~G3iD~CjXxKbqZIayvPa6k>9!F<RgH_2o|`=If)6slF=!Ah9&t}
z+Bf>rgieuTUBy4wX8P`l5A(3?yXK46J1a-N)BFhAep9P_a9VEDueA@3TQ1Ba-}9XY
z<JV6*dTz_nhs|c&bL*x0DolH8OJd{rofQE~K#8DkR}T-8DMYv58y3k#t!vF=_WL=d
z<`c9@6px;dvffP=nTOaj`lwFk8(XYf=Z1`_91%lmHkeC&=|{LMT}GTbj{P)9aTfD<
R)J(Om%P8_my3~`q{{x==X=?xg

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.opt.ll
deleted file mode 100644
index f007116..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.opt.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; ModuleID = 'air_project/square_kernel_0_core_0_2.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf0 = external local_unnamed_addr global [256 x float]
-@buf1 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #0
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #0
-
-; Function Attrs: nounwind memory(inaccessiblemem: write)
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
-
-; Function Attrs: noreturn nounwind
-define void @core_0_2() local_unnamed_addr #2 {
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %13, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %11, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf1, i20 %4
-  %6 = load <16 x float>, ptr %5, align 64
-  %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
-  %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60)
-  %10 = getelementptr float, ptr @buf0, i20 %4
-  store <16 x float> %9, ptr %10, align 64
-  %11 = add nuw nsw i32 %3, 16
-  %12 = icmp ult i32 %3, 240
-  br i1 %12, label %2, label %13, !llvm.loop !1
-
-13:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind memory(inaccessiblemem: write) }
-attributes #2 = { noreturn nounwind }
-attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
-attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll
deleted file mode 100644
index 055e011..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_2.peanohack.ll
+++ /dev/null
@@ -1,84 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [1024 x float]
-@buf9 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_2() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %13, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %13
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf1, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
-  %11 = getelementptr float, ptr @buf0, i32 %3
-  store <16 x float> %10, ptr %11
-  %12 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-13:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.elf
deleted file mode 100755
index a8d3607d6115f3e937af1d24e81c3abb59403ece..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1640
zcma)6J!~UI6#mvRu_K+p2O=^8q~VZIK*8CK4TrDVi+l(XM+yZf5<<&*cayAQufy&-
z<Y+F4C{Q388dD%$a}uRRaYd>~cP;43G)R{SL_=}cU4aPS+aKGa;*(~+_r3RK=IzYv
z+`7?zSJO1)D4;x64LaAK0%ph!C={@O5{i)Q1GXF^I;1fb6}Aulh}}Fk&9l=Yvx^#T
z11&t9D~+SCy#$=UKcnzy5oa>x@`;9zH3cZ#tzh&(qk*%F;|=X}?h5c#>39RDt0pTb
z-`&5Xom7luED|j0hnHsm8LoYIdAR=T<=?ND>~D<W^Jy&atMg_Vcu#xy!tmB-FSpDy
zeoafAvtX{z8N1;nSwF3Z`Y?QYN<X+>?R<NLSF(6Isl-PG89&h+f1D~>IVNk1=fC;Y
z@~88_@Cb82wT$^g@DQfJd=osAWPfD-T?PwM$?(T}&lGka{X8@J`pIzX@6uwSXwGEc
z!!cR?)%3`c8LQ3vlS4rDhkK<e7UV#Q`x@}%0%-feju&22?RHBU&6-}*0jFyRTWZe>
zBflSHUbV0t$Gg$BrKK%D-X3&nPQSOb;dlLDf2rHu>%D3Fp6b*cS8uxYx@Ws}z4><I
z9oKWNx(&B(bn2akV>CUt+1PBpp4Kqkvz#%Cczp33)%w_@;YHfFr}V3|E8{^LCi^eD
z_xM8Yx;YhJp;pr9dlnj2X``5KPX)8-TINK5C~-ouN&Y7y__qioe~)Z(E;$~Fe@Rws
z0?|igC4#6>e@|8-2+?=QN(3Q#L{=gQ(I;fZCJ_A#S&1M-pNdiBJt3r?-^4H{`doA|
z{X-@GXA+-i)M9Txh8$zh3ur_)d$HXiZ%1*MT4T1I9V_y`@FL!q_D5?i-|l)&yw-OI
zT@SVP%8G6o884Hq_Km>R;e@eKa~wK02Z3XCye&U4;PgY!(ycl=gUzH{hGj*u9mcSt
z-PE%7?C!v`T5BSm9aufv4=g|MV~7RIiTh!cj}pIShoOBl_oQJjaFGnkf{AvgN2Wsj
zEXI&dm+md;8p-jRvTgqd8k3}@JDk3!@`mQ9WQN=zaqF4Z3&7`@EtJnA`3#Nm6U708
z@+Qk1a?+B+|2=unNeLDhxS9)cUcwdH^6uxbnD<vM$a&rynSGGOR=J?8na6Yf7yJDG
AD*ylh

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ld.script b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ld.script
deleted file mode 100644
index befdff2..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ld.script
+++ /dev/null
@@ -1,69 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x74400, LENGTH = 0xBC00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-. = 0x40400;
-buf1 = .;
-. += 0x400;
-. = 0x44000;
-buf0 = .;
-. += 0x400;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-. = 0x60400;
-buf5 = .;
-. += 0x400;
-. = 0x64000;
-buf4 = .;
-. += 0x400;
-. = 0x70400;
-buf3 = .;
-. += 0x400;
-. = 0x74000;
-buf2 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_3);
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll
deleted file mode 100644
index 9d2e115..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.ll
+++ /dev/null
@@ -1,84 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [1024 x float]
-@buf9 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_3() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %13, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %13
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf3, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
-  %11 = getelementptr float, ptr @buf2, i32 %3
-  store <16 x float> %10, ptr %11, align 4
-  %12 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-13:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.o
deleted file mode 100644
index 6b3d34570ca51b7ee784f20a1a0a4907b4e35d5c..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 932
zcmZ`$O-mb56g|(cj1VLWO(V3Bc2z}(q;^vfiB{W6C#48}fQxYwr$tPSjw5I{3JubQ
zbUlBddsi;9>MxKkT}JQ+*gsHv?wc8B2zcSXd(S)P-FM%8#|ze5r4(XQFj5FO>j8R6
zzS4ph$RN?m)-{kriOISu0!3Wpa8AVh`mVjEt_Gh2-!ttsTz#kj{l>-NFZDZDF5-5A
zGO>wLD?9KnsQw%dK3xvq%x0V)<=|1s#cLs7F@X1~Ga4N4J)K8~O>iC1_p@Td%gX(%
z$$p4i-cSnvnJv`LPVp?_!=l2U_lQ2hJl_t{eRn`|w@}^sg0YbYf>R7qX&~PU&p)_6
zPBDG>duJ2N_(JQ&DwC54$>hH<zfIxAC@*pXBjjJ-bMg_uQWq9D#yN=zKbg@fe1>uP
zSlT!Gc%M#@V_n5R*Jk?ep$~JX>AU8;kIR$$zEj_ZWvx`_x15Gs_p8m#gN6%p-}ii{
z#(4BqV(sAb6im-;IJ$yaZ+fm>vZsW23DcUJE7@@@6Wi9p&So-&==KN07@4Sbt$D(J
zcT;LUL7PPJ==rGV-DHt@h`phA>tw#M#kv`8$e1b-(V=FVxzv|_gv-)p#Hr)hNP`qt
TW*(24sn(q^ioBmL_2lmVgu7{C

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll
deleted file mode 100644
index f2c89be..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.opt.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; ModuleID = 'air_project/square_kernel_0_core_0_3.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf2 = external local_unnamed_addr global [256 x float]
-@buf3 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #0
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #0
-
-; Function Attrs: nounwind memory(inaccessiblemem: write)
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
-
-; Function Attrs: noreturn nounwind
-define void @core_0_3() local_unnamed_addr #2 {
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %13, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %11, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf3, i20 %4
-  %6 = load <16 x float>, ptr %5, align 64
-  %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
-  %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60)
-  %10 = getelementptr float, ptr @buf2, i20 %4
-  store <16 x float> %9, ptr %10, align 64
-  %11 = add nuw nsw i32 %3, 16
-  %12 = icmp ult i32 %3, 240
-  br i1 %12, label %2, label %13, !llvm.loop !1
-
-13:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind memory(inaccessiblemem: write) }
-attributes #2 = { noreturn nounwind }
-attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
-attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll
deleted file mode 100644
index ed78c15..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_3.peanohack.ll
+++ /dev/null
@@ -1,84 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [1024 x float]
-@buf9 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_3() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %13, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %13
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf3, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
-  %11 = getelementptr float, ptr @buf2, i32 %3
-  store <16 x float> %10, ptr %11
-  %12 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-13:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.elf
deleted file mode 100755
index b06bbc2edc2ab1b34e0e4e5e00fa2e23d127e11e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1640
zcma)6O>7%Q6#mvRu_IOBf{2U?(oiH6P;l0<Y3iP~k&6&<q;LR5s?f6D-6X5n>#(~{
zQ!Wi63LFp@F6MyLOC^pLhaPf@)Lub7=7Q8jATB+$w;mATd;4QsocN@f?|tvRnRz=i
zJ9lohKF~A`8FDC&RGrSX7l0{p9SS+jp@2Ljd!H@CkPc}~MTza>KO-lLP4e`lNbS6a
zyFe3<X9}a}+phy>A5AGdoyVz6xp=JMrltUe`y~t?X*6(pezdNg%v=V(E*!1nWW`_w
z#ru2rwBu4W9*Y?B`oV?ie+H}HUmUFccJYtv1?$`D;MF9S_2pTk2z;nLer<5)i#M9a
zDZi$v&R8(l=Zu};f~=p^LwyjuIH4b1FSoxt#G7e6nN;G#oQxl9jz3BitqkL}#q;0%
za`E$7U~q^Tpj^c40eA=#V7>vKNxVNY{|<vWsbuiegO_qUPk)&je)D{=`FCMHmp7);
z@8OuV{&I3;@r>oh!|@@Y{NsZ{8FO-=#6t~ueh#!ef7=bNsaC70s*Q?X(E+<-`I~Cj
z4MMNyr(QL;6-7JYwS|REFWTz2D|WBDu<ms{e{Z4F+3mh(d9G^LY)5Z6wVG==HNA1A
ze${d9C8zGxs_k04ZdV(w)2MGW-breh>{-qTc|1FRg=%f&(Qt|O-3k2`?b2wFhVlN(
z?mat~yKYRxm#LLB`d)^H723!r+f%}HvX(K{pGcfgY?A+(2>vYs$=@X#oJ)>J;$M*!
zn?Up-S&1M@)IX4w2txEdvJydv9+H&^Li90Nu?a-~N>(BW(I;Z$c~1zb=XWuTu|5->
zPySGe{}so_8kNZ1iy+6?b$uG)?QUeX$y;F*B-V&6d)o}XFWr#$rS<7*)3Z9R9j*49
ze#b?nwY;pG)sz>>X6uIU=&*yRTCr_9Hu}D8w%tw7ufpyHuBn@KwEG+HOIL?!hLIIS
zFvFe1GIy;`-!+@7BJFLMUCZ-L&-WsT1=EgtL70sazi9=5bvyGUVb^yM4@!fvb|yzA
zLj0_wCf%ZROVVA5kJpfG`#;c_BrVzD<UN%)G(#yh<OYdbPqm%{Zl$(RHjm^pG{Vmm
z`wYsPEOW?7OAi0{_&q1Zm}6im6XdLf%e3X)&tNj|NhZiy-n*&2pT<_Wpsbn2GyWG1
C;r}=Q

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ld.script
deleted file mode 100644
index 5970233..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ld.script
+++ /dev/null
@@ -1,69 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x74400, LENGTH = 0xBC00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-. = 0x40400;
-buf3 = .;
-. += 0x400;
-. = 0x44000;
-buf2 = .;
-. += 0x400;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-. = 0x60400;
-buf7 = .;
-. += 0x400;
-. = 0x64000;
-buf6 = .;
-. += 0x400;
-. = 0x70400;
-buf5 = .;
-. += 0x400;
-. = 0x74000;
-buf4 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_4);
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll
deleted file mode 100644
index cfa104c..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.ll
+++ /dev/null
@@ -1,84 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [1024 x float]
-@buf9 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_4() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %13, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %13
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf5, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
-  %11 = getelementptr float, ptr @buf4, i32 %3
-  store <16 x float> %10, ptr %11, align 4
-  %12 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-13:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.o
deleted file mode 100644
index 6afdc2e5495148f08a97bed4dbec3859aee95823..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 932
zcmZ`$O-mb56g|(cj1WpJ7*nv2c2z}(MC?KkiC?YIiF8vHx)>*MTEx_};|S_Tp+Q`T
z>-huSyK<3Le?jTeWrY3!{R7%_-^@5ezzg@?d)_(kzWeSwUbfyTr4XBfkwU<E570|;
zS_|GFgG4J^H$V<0CZAOiDB>!IOCshs5A6+gJ@^{<nQ3p}`a=ciH?9tTso%MB5w{bR
ziA|JR*@63@dNCAy{4?}#KI5E~gW-^i*FwHx04u8VA~^o>atR$a!F532&x#E%EBCV|
z`yp<5&r<mBe4%!JidPXI78U-oNAwBi`F4mNy91KDh3eK9jLkd{oMMnl1Nl~X{>gPD
z#q{Cs!6ugRh1QF8CMOY+$$w>jo5HJ6UgQLxlb?Cy<RgH&E-Y}2a}pDNGNV)Y4CC^#
zv?Ka>pH7iuUBy4wX8P`d4|BKayXO1VPm_DTQ{RDQtyPz{orYWYtIe(bh6{7g_k5?u
zc=%0XZU4(#S*BrnZo|=K%zD#v?UKzsYhj^ePerlJXj{v>Tgeon+n)?$WTMu!W|aMY
zPpSC?Z4$+!=cA%`lSSqswnFdL$$Vprb&K4PF;ya>L(MF6sW1Hqm!->yQ^&EH1}U!0
TJRUVutvg{9c|Tq1$=&|}aJy+@

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll
deleted file mode 100644
index a653490..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.opt.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; ModuleID = 'air_project/square_kernel_0_core_0_4.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf4 = external local_unnamed_addr global [256 x float]
-@buf5 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #0
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #0
-
-; Function Attrs: nounwind memory(inaccessiblemem: write)
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
-
-; Function Attrs: noreturn nounwind
-define void @core_0_4() local_unnamed_addr #2 {
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %13, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %11, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf5, i20 %4
-  %6 = load <16 x float>, ptr %5, align 64
-  %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
-  %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60)
-  %10 = getelementptr float, ptr @buf4, i20 %4
-  store <16 x float> %9, ptr %10, align 64
-  %11 = add nuw nsw i32 %3, 16
-  %12 = icmp ult i32 %3, 240
-  br i1 %12, label %2, label %13, !llvm.loop !1
-
-13:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind memory(inaccessiblemem: write) }
-attributes #2 = { noreturn nounwind }
-attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
-attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll
deleted file mode 100644
index 520a891..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_4.peanohack.ll
+++ /dev/null
@@ -1,84 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [1024 x float]
-@buf9 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_4() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %13, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %13
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf5, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
-  %11 = getelementptr float, ptr @buf4, i32 %3
-  store <16 x float> %10, ptr %11
-  %12 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-13:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.elf
deleted file mode 100755
index 9b231c6aa9ecc51f66997deeb1a18cef3b26ae8b..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1600
zcma)6L2MgE6n*QM*pVu5sfdgKX<8%{P;l0@X>c!WBNrj+NaX;EgwS@qyGd5D*I~U5
zDVGKj1rCS{7jr=Br4mPrLk~GcYOkOkb3tk%xVN_+5aIpZ9oy=OC(XY9{{J)ccmD3&
zx!(HFFbrtqP#mi|oeM7jGvqoHa#%nCc}VsihsKBw8B9fq<CDK*PsgTtc3Nb1-oRa;
ziN|w=arCX%fzyv?6rL>NM3%gKWZ-i{0Sfm^7(Fs*;N<LZ%{ZRB1bkaKT*L7Vixm{_
z@7^<xO4Vd85-ghg=Vt#MuKsv_c=M0*e_bm$-&cpPq%r+0&s#;{BjfR_!#iKT(X>wZ
zHcfTPg1J9e>_q2e|Fj<J{piIh{peb`{lft+X7O}UiH~wJf1)}6FjcfPCVPwL`TTP6
z);usgz#LF6Vt$`3(KEAr3+zd9KC=Ecy*a65_{)Qra@$XSn;CugY`F1HVKJAtX0rEz
zrSr>a%aSF_jfayap#1ZLLKzFvP~ss^<k=a}3c@Wvx~f{Orm8k7X2k^DjuURE9Y5*^
zy)g5th0QqL?q6M6+6dy!LA&Dix=U+8CkS_!I-Q;FyH4P%cFpz7hF7ckj#o1qSL*M3
zzPs$zy;`+hYuDXs!}l8X^~T$24bxASGe#ay&n{5i9D5ABM*HrRev@`-JjuZ1{N?nX
zp2@dvO~qHJl??h`hJhQjkx!4OgxPd2Yob4uIH5Qs{|gcPTLhB7L$<h<v`6CKkQIkO
z^Z{9kAWGCfk(CHS^gXf?L5Loal?X!g5m|8vME_1!A_&pPV&wUq5K_;dVptQ6O6>2(
zkT!MwkVgM@H+I_O&3+uE)|ef4%kBqX`+a^7t<P4Qfz$Ebc(vyZIzB3`l@-&jW_+1!
zx2}ht2{($X71yO>eGs~K+usPnD%@V=+ot^<+Jp6Fd0c^Q_hToDVfVLF%ieK11K)10
zigdSNcby=#gD{987Hl`}MSVR>{I(NC&TZ{U!+z)?nUn<+?M=^2h4@)VQc*o=nk9$z
z|6njphQ90eXv^KGQOXSYGQ@3W+ROo8WwwxBNAejO<5!AX49Z<AYe)wrhyQVMPm>Z9
k8CceWtV_5=Tkdm>514nV1zG34li7P&>>@XmJ#}33e}uC6SpWb4

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script
deleted file mode 100644
index 818260c..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ld.script
+++ /dev/null
@@ -1,66 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x74400, LENGTH = 0xBC00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-. = 0x40400;
-buf5 = .;
-. += 0x400;
-. = 0x44000;
-buf4 = .;
-. += 0x400;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-/* No tile with memory exists to the north. */
-. = 0x60000;
-. += 0x10000;
-. = 0x70400;
-buf7 = .;
-. += 0x400;
-. = 0x74000;
-buf6 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_5);
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll
deleted file mode 100644
index 3e15d3e..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.ll
+++ /dev/null
@@ -1,84 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [1024 x float]
-@buf9 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_5() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %13, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %13
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf7, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
-  %11 = getelementptr float, ptr @buf6, i32 %3
-  store <16 x float> %10, ptr %11, align 4
-  %12 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-13:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.o
deleted file mode 100644
index cb309eb89ecceee56448345389d9d7a76543e35e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 932
zcmZ`$O-mb56g|(cj1Wo`OjBqfx>C_0DQ!h4C0cC-C(=z@!NoX9rlmABI*wqwQD~4Z
z#P$3E_pV%I)n8Ctx{S~tVE;h%+&44M5b(l%_nvpoyYIgH4wkJqN-4yqV5AUm+z0fN
ze5M7H$RN?m))kOLiOGg40!5tXa7M)Z>ZY@<E{7+8pP9}&E>|nSpmDx`s($CnMO@EN
zCN@)QXNUd=)svCn-Jg-a3mNBEIT#DMcrD~B2C$;KkAj1pCrjwE39du>epYOHS-GDz
z*$;8sdyv8x3x(S85ym4vEGqn2pXd|J^PLdgbq6GO3)QPH7@K(@IKnWM2J-Fj{JrbL
z6f=N-H#V`1FSLGqWO5iGnfzzw*D1Uf<wZ_ll>Gc1Cm#X4>cIl1I43dTXEQp5&oC_?
zOZ!2e9?&UrtgHCv+DzZw_hD|eeAj%t_I`HPcj{kZSs$uPU!0~}_p7bOUekrS>wCUa
zWBmS2Vr}o!oGdS3dT!IvWz2fZbM2D-9HzCnShAl+vCL>&%Ug|P3eoNNhAA>p>ss@e
z{cfk!e1bNK;?eU_(Ywha^AKC1_v&Q6vBkRA+>kL<BBD#pOXgBv`VlTmml3CqV>1m>
VT$y=1YNlFu$SCqby3~`qZvl4{X=eZc

diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll
deleted file mode 100644
index bccc4ff..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.opt.ll
+++ /dev/null
@@ -1,65 +0,0 @@
-; ModuleID = 'air_project/square_kernel_0_core_0_5.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf6 = external local_unnamed_addr global [256 x float]
-@buf7 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #0
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #0
-
-; Function Attrs: nounwind memory(inaccessiblemem: write)
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32) #1
-
-; Function Attrs: noreturn nounwind
-define void @core_0_5() local_unnamed_addr #2 {
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  tail call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %13, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %11, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf7, i20 %4
-  %6 = load <16 x float>, ptr %5, align 64
-  %7 = tail call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %6)
-  %8 = shufflevector <16 x bfloat> %7, <16 x bfloat> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %9 = tail call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %8, <32 x bfloat> %8, i32 60)
-  %10 = getelementptr float, ptr @buf6, i20 %4
-  store <16 x float> %9, ptr %10, align 64
-  %11 = add nuw nsw i32 %3, 16
-  %12 = icmp ult i32 %3, 240
-  br i1 %12, label %2, label %13, !llvm.loop !1
-
-13:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: nofree nounwind memory(inaccessiblemem: read)
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>) #3
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32) #4
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind memory(inaccessiblemem: write) }
-attributes #2 = { noreturn nounwind }
-attributes #3 = { nofree nounwind memory(inaccessiblemem: read) }
-attributes #4 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll
deleted file mode 100644
index d8f77fa..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_core_0_5.peanohack.ll
+++ /dev/null
@@ -1,84 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [1024 x float]
-@buf9 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_5() {
-  call void @llvm.aie2p.set.ctrl.reg(i32 9, i32 1)
-  call void @llvm.aie2p.set.ctrl.reg(i32 1, i32 0)
-  br label %1
-
-1:                                                ; preds = %13, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %12, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %13
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf7, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = call <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float> %7)
-  %9 = shufflevector <16 x bfloat> %8, <16 x bfloat> %8, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %10 = call <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat> %9, <32 x bfloat> %9, i32 60)
-  %11 = getelementptr float, ptr @buf6, i32 %3
-  store <16 x float> %10, ptr %11
-  %12 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-13:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <16 x bfloat> @llvm.aie2p.v16accfloat.to.v16bf16(<16 x float>)
-
-; Unknown intrinsic
-declare <16 x float> @llvm.aie2p.I512.I512.ACC512.bf.mul.conf(<32 x bfloat>, <32 x bfloat>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_design.bif b/examples/elementwise_arith/air_project/square_kernel_0_design.bif
deleted file mode 100644
index 6e94022..0000000
--- a/examples/elementwise_arith/air_project/square_kernel_0_design.bif
+++ /dev/null
@@ -1,10 +0,0 @@
-all:
-{
-  id_code = 0x14ca8093
-  extended_id_code = 0x01
-  image
-  {
-    name=aie_image, id=0x1c000000
-    { type=cdo file=air_project/square_kernel_0_aie_cdo_elfs.bin file=air_project/square_kernel_0_aie_cdo_init.bin file=air_project/square_kernel_0_aie_cdo_enable.bin }
-  }
-}
diff --git a/examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/square_kernel_0_square_kernel_0_sequence.bin
deleted file mode 100644
index 97e175b81722f66b4d8fb9099cc8f7fbe11452c6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2288
zcmcJP!41MN3`Lz%A#QNszyXOB7@?zu%{)TIB#gjM>*D?kas#o1VyE%<&jTu{-yf35
zAR@1W2+}#mB=?e?toB8bc2;%|B-1(DSe73B-{SoA=NBCORAcIXoI82@=$VP#V&<7<
z??-IhM6Sy|)_!>=l8dK|&wPETrL^|wYz7?iP^8=03@%Wz5F74Mv_<AJ8`EgUCc~A7
zB8_<Tj*?|ALu|N5p%9tNY)qpWo8rntkw!dvN6D1S5F73(Rb(!+F^y(y{GSGU(=8(&
z%h^@%8z97{ZZ5Mijb>~Hx|ia@+u5*2@O@J?m)Y3dW^8hm$JK8JDQm>oa8L9*VK%1G
UjE(+*4Qs^Ma8Df@)A$FQHy}r3Z2$lO

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0.pdi b/examples/elementwise_arith/air_project/sub_kernel_0.pdi
deleted file mode 100644
index cad10284470b3236f231f480e9d2396b90ba8f55..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 7792
zcmeHM&ubiY6n`^2Nw&IeH#L^ANpac)2_=V3H$n=5&UT}$5^-xP1UxhqYAq;)UfN@n
zfJH<CBBT{D7ePENJ$Q(hmG&ee`wtjWe;mBc_S8c)e%|-TeBVs6wb5JY7dHEOpZC7+
z`+m)4cJ>z{nynrmyngkaH-Gy6+Z#jydEmzI;`b@Mczw3~<B&%B?<=#-W>k)-B1waJ
z-+x)xPW*6_&y)KvT)uez^7|JqU33md9L%CG#!nq>3=qAH7r)E3ZPq!E{Y1VzL)!fx
z-1&3+uK*6}v#;X>q8ehjdaJcPh{JrZKopIpxv^Z{JjQ1F1v$*gl#`1>9`tku?7Ppq
zV!}R7fk)q<qqMcp72pL_?AKQd5hW9Jm#9o9+aA5;MMPWM-<7G=@d()SH_BAG3x6LD
zY%F;1Hez@#Ze07?i_4SPsaIG$b6BH^a<ISu@3q%H9b9YvJotNU;KKFEwP%UeKDS9W
zP6vn9wwI31#_V-A`W1!8`SAGOl@gz?<AJ<#<$#boweamv`n}^uua9GW%jf+<J``nS
z_(Pm3Rz8~z^Ll-L+pC)M5O2Tt6j3<6j=WZg#(7sbO(sj&$9zqwdcpI?64XAQvG(Qd
zgZ@%-tI+!B{@SGn0~3A_7fgQGWBi9)Z?nQvcU)5_ym5OVq;aFY;}PB8)^lR#g17OU
z*txK>oeL}5xv(-tRoeo!zq~oAXDs!Xl_{q$3OPGpC!fsMV=-U4|8`zN({C%=d9bpb
z2P;!l&&^jYH(yzqHnVb3$l3Wi^JKmri}}+1w(}C2ep}hjgO%+(Sec^v+<cwQ%~w{Y
z&8%D$a(2Gncrss)#eC_0+j$91zpZTN!OC_XSmt@K_hRzii|@@?=2h^3l4B_@3mget
z5jYmODe$7eO9C$oyejazz%7B>0(S)75;!qf=U2^KGki~w&2LiV=WvubToE`HxGC_W
zz)J!z3%n}uy1*@g+X8n4-V!)5Sm#&I<)>xWznaL;;V5yqB5*8lQ{Y8`mjqrGcvawa
zfm;H%1?~vEC2(S}&Tl@KpO#&IXGMMvM~TA~fn$N20xt@@B=EAps{*eJ+!DAga7W-R
zffIqtG~)AjC3UR<+q}T@-L~ajjx-Voo}sd7V&*-AV{t811Rn{$EO-)p6SL*~WUm3s
zM`9Ms!kz@*L=UZfEP7iJd?fg?;7RaJ)XMqE_SEu`tf#Oi!w>T3m|6DwVxL#kOb0$a
zGJ>_jH^S3Fzuaf-BV`{uLxX-h9~{`tKGyzKpX8=Lz6uU=&xi1t&5LKfY_QEMfXrB3
z2RU5{GH1m){#@t9wmb8y?J+;LkJ(*j_0CWEe@K4NW9j@f_8@ca0q%zw>-@61lD#)p
z`8nOr{3iF9AKS+`HM@G}r~E%8Kj^V^ej2;{^tGh(%j!z@yQRv{>2~IKY>)Y|eXL@V
ze|qPq{68c==&^Ku8hem+|MX{$&M&Jg*=IB=Kc{o~;bDVXM*{UhM^-kpd5a&^Rt9lk
zA3vPHiU{_QiH`7}9jmq-e<1CtVC=n*)vo-8ynSr_`zP|IO3vFy)_>t<Pybx2seZ(P
z#kHy!%o|i4Av0EW%;`#y@$5wDiaFg7WcwLV&gn)VKWFxZLbpr(dA_iE)PK8{ucx-}
zQO`JMkA7);fA@avurC<<9`(%IqhIQ=6j6_UJ&0FK{W(cne}!%RXZnwjt?@K4FRPn{
zY<m;ubaRkxjT<>#0NM72gl?Dm`+Kavwtw{cYx_s9zqWt$`s0I~EB3GTSO0{~{3~p0
zjQabc{uv$Wubxr!KGZ*>L;ZbG|BMdx_n9!)R7N+G6sV7Andf4ti|5e>7s0<o)MlPE
zukpPCz34rEQ1LX(SBly@?hU7VE8wjk5_}9Ec`Exr@Dt#1isFlcp9YVJ6kihjR8sIo
z%$$$j3tq&`d|SnNui%TAeca$E`^Xzk_af#LBjyw%X0ExiN6aaYn7O9<T)`t|U&Nf+
zmy<$($GXZs?iP4T^j`2HX6D-_=D@~0M#HH+VsjY$LE~pe@B#P(hMyLE5&V9`PYJ#R
zK8;QAK{Cp}i>Nrm?kGYt@q<tMF8Bs`*B56$3qGy6urGqgBX#9}LGW|n)0zuDNUFk5
zHH~Kqyo<;2GvN0b|6^Up7s2C433cu{!8gD^WB572&w+RGI6pyB7x~sjJaxiDedT|o
z>-Zw<U7U_@fOqjYeh$1l*YOMB)4mJ;K{7AmnNQ;>f_LXSeh9qt@AyhG>WjXoGdvC6
z`58_58StX-DL(~X^gZR9;GZ`2ndmw{3*hksjjCHi@DoW@#8XZEjDdG`c0A@m#N+rT
zcz3SjBk<|`cO5^G)P<jVdTt2b`Efk@;-}YL*YP9CXizeFRl1|;^&5g0`KJ63c<0C2
zN8nxGC0`Z(tHMv!UDsX5BYxq(uI$zSh6K}qKnaYIeJKZt7gV4|tuoKdu#4wN?=inK
QIa&WM;AH(Amu0*D0W-OP2mk;8

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_elfs.bin b/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_elfs.bin
deleted file mode 100644
index cbef72b4fafd5ab9b209a73b1144c433a2ac7d6e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 2592
zcmeH_K}#D!6vzLwS+lFyb|WZrP}rtWJmk<Q1$*ko<PZe0wHKkM_R=1D2s!nvLVFNz
zh3%oxOQFZmgF;S%o+Y0k#NY?W(SsP@WM<sg?0$lffrZTPy?JjY@8<%j)^-2~IH2$4
zX=_*zfG1pv&<dY{wxeZP(0!`XN1tUfgCT?Yj1`2gJR_Q>LlLgPU!aEGtY;YTd>QC(
z<w6WfxB>#Koe8`bA<!G11ZZ^x=(lj%0bX2D$$hWACqCCBV8(BM{Uf5FOegh{{^pYY
zRSMik`MY@gxDda&EL>N-58uo2A`t(u53av<7vf=Kr535GwXmOu>Q5EFH0M=+eIME%
zHXpNnry@Td<weoNbvqSrIqE#yFNFW?sUF>PbXV0|C632+|1+^}`q6#!Vh)fSzv*3v
zz^XbWH(*{#`4K>~H2LiYg&ox+{&oJuX$=15THkKsPq$vlaig4hADuDzQvKg%$yXQ2
zP#}MvdlIWAdLe+DVd9N-L>(P>gfr_1XVwwU5U$&?Jk$%i<&lHR#iU1?AtTRcEb(t`
z>fglpr#;{fNi%cKtQ(wJH#kFB&G4_1;a{4en`S;^iGQ0@|0c#iJ#+4mG&ASSy1|)s
qgENHhGW>g+;a{4en`S;^iGRCO|0c#iJ#+4mG&ASSy1|)sL$P~=DwMGR

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_enable.bin b/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_enable.bin
deleted file mode 100644
index 7cc1818bce4d6ce1226fc5fda519967a4842b99e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 104
zcmZQ!U|?`|@n>LQVqg#jvKQ~%@L!OTfq|Wo1;_&e1!V>x7mN{N##qFhv4{o3#Q>))
B1+xGE

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_init.bin b/examples/elementwise_arith/air_project/sub_kernel_0_aie_cdo_init.bin
deleted file mode 100644
index d4549ba181167c6e6d7475f23335e34fea2c3376..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 6032
zcmb7{&rak<5XQ@%hINw3u1Uy>EJ*QcI7AT#M?)kINHH2BaX`vM<iIUF0*^qXJ?)8e
zF5wYI$_vaRaLt7vMY;Fvc_s37+iiD!;}I|tf4=&CUGA#tw!nzU<oIWiROFvEk-z_%
zJ!~W*FA@WSPcd@He{sy=EE%{TBKEES5p+(vAYGEKNKZ*$kiH~+Mf#fb4e1%_Iq6%{
z_oQv3yPk}D`c%hUV?tlVeouYUx*%SXu1HTwUy!~eeMS13^bP45={f0J()Xloqq{y$
zdiqqybf1o>Pg)mN>ymUudP@3&^d;#l($}PKNY6;mN#BybCv6+u_35mqPjyW9=@;sg
z)`iu&Bwdl7lD;5)N&1TPHR&7DGtzU?x1{e$Thh7gr0z}C-Y@95Kj1=K_e*vRasT2A
z`KGw%NtC~$_drR!Af6K!;#1rYonOBXO5z3Gubj9LpW<BU{LA+y<^`S0oYo8RDdrKc
zuV_9c@q&0xTpGTS_%ZrfttFZNDYrPwi8P`Px$pNsn)N<^;r&%-Y<~OqH}kU3U)A~j
z{NqLc{b^o@^RRhW-=}a-^G3%#Fo-!FJksMCBIYU#o-Fco^&UO5Ugxjkt^40s*X#2?
zQZGFCgnj0BwN4NVJu!$m9X!(G86u|Z?ecW>;_N?bzRq9a$@;UpUZ4Mwdf~w*)RN!T
zTGw0O+2D~L&k!+PZ<nX5_w6(5b^gllFaEZ=UZ4Mwdf~w*)RN!TIzg<PZ~YDo9_jH6
z5!3Z{dDfCo{WC<~82>Vspqp^>qxyLCZUYYv{=)g)M&yk)HtOi1`aSl`pz$Z~{VHC>
z{wm~8zhkS{>@Pz8!5=I0ho0@PV=wrGo|TPu3xmfX=5+8#k7tP3R^-WgJe!Ecz0Z3*
zJBWQ9<=Kinq2IpGzOohbc{t*JyYq{${(e{F$2@C)+CR@F|Df@&=r{W@PpQw>ig|vl
z-i6*{KG=BId(6k{cs_@S#l1i1@f;%-&#>z8oFW$Y{-no)v!Lz|&O*DNXg+CuAD%p)
z+W++V)c&W>r}jU6KIt>&<BoKDH!vTsX+9~<XUT*4q%@x;59X87e3m?zPwIlvU0w1V
z+JWF-=&NwXHW&VX<5p+jKZ(o(mzsC_!$5vWzIoMh*>-QrcKx@>wrfeNOYm3=@d_NZ
zw|+x>4;)3bJR^PpjtW{nB))G4Db?$G()v+vO7%KEZ|nOoNU7cwf84d}i)35nL%m9)
zUZqj5n{~S$^(u~f-3(ifdKE{#Db=fZZU-ih)>n&xd&hVAz^Pux=TW^Tu6I|q)q2#U
zHTbJ2{~>V${xaeR#53^EBfd|32(IcOZtO_bqxD;h5z6W6t-#g!Bz^*}&xc-r46f!)
z>of4z(euuUpMtA-6E}8D`NwMi``~*2njeCH5<P!+p?L=WRm6WKegghw#7~Kzg6sY3
z{KihGe-qmOL|#PscNUswSg-5T`~+O@U-MINy+6&*!PWVs=No%Q`#)3r&%pKmG~Wc*
z&)2-Pqm<64x~~V|I{!%VLvT8uitmHd`BZ!g{#n%jy@k$y4*o^NPl)f?G3|e>^6!G{
z`D%{yOZ(S+3a<C3c>%8O|3dRUJE8m&wZAQJonLdDk5s*%7Mkzakr_t)DHkL4{@Vhl
z{wcl*uJh~l1-L$+%*XWnG36iY_v1oy)KAZ!`1S4I2o|Ro1qz~3EajN7i78>jEaZQL
C^4EL-

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.elf
deleted file mode 100755
index fbf768003c7ae9be77deae967dc2a77b06391f68..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1656
zcma)6PiPcp6o0eX?xv;HHDX!N!iR!HiJO`1+GM?S5;hX*2GWBP>R~$nk__3MU1ny}
zSPw2mYC-9(ussyK2;wOe6v-)&crBs#LK0fMct~#^O4Hw)Kf9xN@CV<#_j~Vqe`enI
z&7Il$RY{U4L57C=tODaV9}o>eR!C5WMk!BO;@w@W1Y0oh!d5C^ef4)>CBC%GrAe%3
zC3;L$qrDIFeed}*L<e6E2|663eNH)n>^CJrw0HEZKpVRfL6<U{1(Nqs_xt?jqO>xn
zqJ!bh^+%FgP!JiRI6U)dq>UA_pXC1Q&Hr?wxA6Okzozs0k4o<pQSayQg|KPvi|k16
zsQp<@<*YSvfQC^Y85`cmTtDgu{H}K_#doKR#)F)6Hugv5*uRnC_!J)zZ*b&8pJA_D
zn>W7gkLqtp<+vXD%}E~-6{jBJz3vi50u-kxeoO3E2{Q`67cy_Q;m&Z4-mlxoGws9Q
zhBofK?k)eDAIoIbq4@m}6X!2RBjlu1tnLhkLd9RU^F<ov0RuY{(d#3ky5p|e-W5@=
z*Mw3n$t9V{Z0PQ?ShGFfX}PggjIIPh+rKhCzU%}m9iwEnn&XR3!*SQg8;!N*CEc-w
zQ8q2PYL&~jZk6Tg=anhTHYcr$RaT6$Q8AUOZB;8v)eBJ%=_lIhBTN4rodjFxTO>LS
z{b7oqgI?$dNj!%>IG)e`Ldrh_F0f?SMsQ%RL8ol=c?y(^YN-Q$o&DVb|C%`<usGge
zg4@L4_}h?NkH^LSA0P!5gZX_(!2~FP_aFrmfcX<h!31Fb6jCq&n7@D&OaSJ8Knf-R
z^FLY1MmLoEIbg01_)F&90jE-6uLs0qY}zgq|5h{54agNg@FJ}*b#qnoo$qWP=cRsQ
zzUJr++YIJgR;OW8sXj9!Yf3C9Ahr6eYsqAKfl@L}7?wJ&sTuaN<0@pfJX@2s2{JlM
z;X`3fCQb7L-3v(b+mWWN>5Y!9)#jO+tE4q`$JHFy35Ye)%%J7@Nfg^P-ShNYi6wH|
zu0>%`><qb;j*JM~(Z^+eRQOR2$EosXdml*H#4Gy3(f@Ue4Kd<t7oHe4<C=v~GeqCU
zI*TL^?}w|8=OA4;`5(=v4u1&D9d{gy|L+N=;FwGZWYWS}=;?he?0fZ&eHY`HZtS~>
Lf?PAnlgR%8Kh^|F

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script
deleted file mode 100644
index fc4f0cf..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ld.script
+++ /dev/null
@@ -1,72 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-/* No tile with memory exists to the south. */
-. = 0x40000;
-. += 0x10000;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-. = 0x60400;
-buf5 = .;
-. += 0x400;
-. = 0x64000;
-buf4 = .;
-. += 0x400;
-. = 0x68000;
-buf3 = .;
-. += 0x400;
-. = 0x70400;
-buf2 = .;
-. += 0x400;
-. = 0x74000;
-buf1 = .;
-. += 0x400;
-. = 0x78000;
-buf0 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_2);
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll
deleted file mode 100644
index 906e39c..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_2() {
-  br label %1
-
-1:                                                ; preds = %22, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %22
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf2, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = getelementptr float, ptr @buf1, i32 %3
-  %9 = load <16 x float>, ptr %8, align 4
-  %10 = bitcast <16 x float> %7 to <8 x i64>
-  %11 = bitcast <16 x float> %9 to <8 x i64>
-  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = bitcast <32 x i64> %12 to <64 x float>
-  %15 = bitcast <32 x i64> %13 to <64 x float>
-  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
-  %17 = bitcast <64 x float> %16 to <32 x i64>
-  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %19 = bitcast <8 x i64> %18 to <16 x float>
-  %20 = getelementptr float, ptr @buf0, i32 %3
-  store <16 x float> %19, ptr %20, align 4
-  %21 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-22:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.o
deleted file mode 100644
index c01f0bbb18a2b8aef10a542b3c15d504b366c5fe..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 984
zcmaJ<O=}ZT6g@9ZlOR>nfSQGt;;NrR#&%IRn#NWmOhg2w)J2#i(<+@dWjcZCMv*RD
zh`14E>8AUxTx8W>ApV33{($xm7|(t4l1#Jk!kv50J9qBA@6NnmcOEIF&^Lu#f&r%~
zV2t9jPMAR!!(2J~4dhWGc%`O+v)+$9PRNK=s!K((KW4id>UFgYjOV(C$EuvS&lh+u
zR?Jmy#AqR(zMA<P)xJzd&wft+S<Ski?C3fWee1WZK1dg$-u~@1oUtsQ8Qq^1{a`}+
zvnuByKMJlT_^;Js<8wycF?!ri^plk2CwO7{5@$Wmsb`hmk~pfz{Z7@A>Bo69>I(0*
z@}2iOGR2E4Ft;=ZgK%?zfHW?8b#hr}Y-4y%@WC+o5<c;Hf<G}>7G0ozZ=lJC0X9eY
z_6Qd`%RP!o{$f@`++vu|#W-Y`lb?Y0jy{*x5L4`__<v}#Lhmqywbu?k>+#bUiwB|G
z+=1h~tgXFqeXkkT+O3Z7!8!<o&}}fj-IdtrY~6(wc)qJESj~3e)k}4rIhBe;CfJ79
zLP{icXML|VT$35$e_@uQxq?osa_&yz1zts)L~*kcl=bQGAv+R#NFUA9Kf6}rT?)KU
smvxot+Ss>BAoXP)(I)7!_Wlfn*iVv#W)ri-z$EW0nWT@?rJl_A4}x%RasU7T

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll
deleted file mode 100644
index 1f9925e..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.opt.ll
+++ /dev/null
@@ -1,64 +0,0 @@
-; ModuleID = 'air_project/sub_kernel_0_core_0_2.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf0 = external local_unnamed_addr global [256 x float]
-@buf1 = external local_unnamed_addr global [256 x float]
-@buf2 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #0
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #0
-
-; Function Attrs: noreturn nounwind
-define void @core_0_2() local_unnamed_addr #1 {
-  br label %1
-
-1:                                                ; preds = %19, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %17, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf2, i20 %4
-  %6 = load <8 x i64>, ptr %5, align 64
-  %7 = getelementptr float, ptr @buf1, i20 %4
-  %8 = load <8 x i64>, ptr %7, align 64
-  %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %11 = bitcast <32 x i64> %9 to <64 x float>
-  %12 = bitcast <32 x i64> %10 to <64 x float>
-  %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60)
-  %14 = bitcast <64 x float> %13 to <32 x i64>
-  %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %16 = getelementptr float, ptr @buf0, i20 %4
-  store <8 x i64> %15, ptr %16, align 64
-  %17 = add nuw nsw i32 %3, 16
-  %18 = icmp ult i32 %3, 240
-  br i1 %18, label %2, label %19, !llvm.loop !1
-
-19:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 53, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
-declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2
-
-attributes #0 = { nounwind }
-attributes #1 = { noreturn nounwind }
-attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll
deleted file mode 100644
index d91a003..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_2.peanohack.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_2() {
-  br label %1
-
-1:                                                ; preds = %22, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %22
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf2, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = getelementptr float, ptr @buf1, i32 %3
-  %9 = load <16 x float>, ptr %8
-  %10 = bitcast <16 x float> %7 to <8 x i64>
-  %11 = bitcast <16 x float> %9 to <8 x i64>
-  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = bitcast <32 x i64> %12 to <64 x float>
-  %15 = bitcast <32 x i64> %13 to <64 x float>
-  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
-  %17 = bitcast <64 x float> %16 to <32 x i64>
-  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %19 = bitcast <8 x i64> %18 to <16 x float>
-  %20 = getelementptr float, ptr @buf0, i32 %3
-  store <16 x float> %19, ptr %20
-  %21 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-22:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.elf
deleted file mode 100755
index e4e226b260243a140d2d00b735618366285dcca0..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1720
zcma)7O^6$17=9-klU4jlS1bV)J}Ss6Yj$Rnb~n8^T~aBUh4rA6_ApFlW;eqolQ5aA
zTRj-0sGxWgq9E=?5KoJs$etEjuTs1V?V@<;p}j~!aebcoNunno_`dgj-k<M%=li~y
zdG})T6;0D5Lr&%=tOn!$Ln3pKHBpk2MJY;N7(K$sa0CMfLZyW9`frh&`DSHdmZW-K
zldnY@a{J-p#QXd+BFC@JDY>^KcevyVR==t#k=rNVm*n7<CeXFqVM+8msQX^=@Uqsv
zWTJ!l!@aLGvt%GML3w`tbfPVl@{bk%9&dbdYP|W=sb4P??GKIdQzGL};|D^}rI+&y
z<CFfg4U?-j)G-=HeXQ6CA7%ZdAMoqp<1_r$g>viTf_5hLC*|lr$T5G0FQ|WUtcO11
zURj%g|Me&JkF;u95B(OjM?}hN`*^PFf+GRSYmz=C;!VLB#ruVo5BqTESY!OfjVE&b
zd*99-eDGkr{a0}*mpA9q=R;1qemOZpE-K~v&FP^?`Li3vvMlm|ft#AhgA<XaAMAMH
zdDUz-lu@tf6<x&X*ul2i^}>PQ3sS3EY)4UlaDI7t+mG7AR>kRcmoNJrKiFIDbauNh
z*}kV*RmauqZnf&!ZdI?JtF5`7v+CB|s?n;pYK~F&+<I-R{zB5j?2L9M$jkkcC&4x+
z7EPXpzCXjCgI<~>X*`Dsn9sYvIpbdkR~R~M3%D?^L6>~8JtZk5wah7hgZ|N!Um*t+
z2J>4ac$*03zXQp7JTCg*hg28@`A3k71So-zAr%Qg{tcue0m#3DR3rfTkC2K4ApaRs
zkpSesP|4%o0PN?OiaF)KlaHqSE_ptAqZxmn+??`1$VXEymB`zRgvZ|X0w{xP-N<f1
zwg*v|XcK8WJJ!H|+Z*71HQ(H5_;$y0qK%$A?08aXuCMEsk;)ZFt9dbSb#cPTs5lM`
zTf@MyTHdxF7~=Fo&(f``w1!(}In_8-#j*yG9Y$ge`iW-k+MS_iH8x0{9kIH$A6S0i
zM?y_3C+dZREQ@x_4nzA|W=Y&$;7Xj7I%Dq6W+sGo^s&Op;G}a}<Fv}@Tznf$?&AM~
zhA56?$FcR$k7!7dzP9m=qZzMh3^OP4R;p8E>o6X!34Q=MfRo>N-df%S#^8Gq{~sV?
nwBcCI2xQj68R-0vWcU(se`W+Si+eHEN2%{~6lBfJpUHm!Ajl1>

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script
deleted file mode 100644
index 6120a88..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ld.script
+++ /dev/null
@@ -1,78 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-. = 0x40400;
-buf2 = .;
-. += 0x400;
-. = 0x44000;
-buf1 = .;
-. += 0x400;
-. = 0x48000;
-buf0 = .;
-. += 0x400;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-. = 0x60400;
-buf8 = .;
-. += 0x400;
-. = 0x64000;
-buf7 = .;
-. += 0x400;
-. = 0x68000;
-buf6 = .;
-. += 0x400;
-. = 0x70400;
-buf5 = .;
-. += 0x400;
-. = 0x74000;
-buf4 = .;
-. += 0x400;
-. = 0x78000;
-buf3 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_3);
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll
deleted file mode 100644
index ba863ab..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_3() {
-  br label %1
-
-1:                                                ; preds = %22, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %22
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf5, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = getelementptr float, ptr @buf4, i32 %3
-  %9 = load <16 x float>, ptr %8, align 4
-  %10 = bitcast <16 x float> %7 to <8 x i64>
-  %11 = bitcast <16 x float> %9 to <8 x i64>
-  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = bitcast <32 x i64> %12 to <64 x float>
-  %15 = bitcast <32 x i64> %13 to <64 x float>
-  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
-  %17 = bitcast <64 x float> %16 to <32 x i64>
-  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %19 = bitcast <8 x i64> %18 to <16 x float>
-  %20 = getelementptr float, ptr @buf3, i32 %3
-  store <16 x float> %19, ptr %20, align 4
-  %21 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-22:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.o
deleted file mode 100644
index d5d447390e239adbd0677385ea8f16d9e12c9d10..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 984
zcmaJ<&ubG=5T2K&NsuaOK%0X~@zkG7Vk@*4O=GJ$Y($Wjf<o9NTUuP3vfV)2iy}RE
z5b+}Hv6tR^@{m*i0`X6%;2%)`0n_h$`;u&P=)k=BzM0vXd9&~1n*Bm4B|TA+i7}D$
zgvbbji#lOS(lW@U>yJoQ3IK1_q{v10XI9R@fK{S{2(v$@JDX~|QW6=>bPi5cDO<dp
zgP)Sml<!4oE}Oia`X5%mjfby)jsIOqJ70?7U6JrduVv*~G8cCD9<0g*%HmAv{;25r
zW9*Nrm<RmWzZ2uXSMv3*DfP(2qjrp+C72)MxyfrB^)RQNmA#ob9FO{)s|C{!^QP1-
z+-vCs_c}Jk^GjlGX&iks(MKXl`PXfNbDfcl$UA^fhVf5$z?U)p#$YZwhxl?o#vdc{
zbcjD6;-u5qBZc|%X$|E*%xosY0n7{^0ptielhjZiqNl?9&|(GdK_J#{D{!rsuh!@H
z1E=v`Y<r`+y5o3mBdE5TZO;{JKkx&m4tw~4w%*=)!ef!gf>^%mIl7kBX!&lfP=n7d
zmkYJW#ui~~_S$Z9uqHFYO=cQFa|NAN!QAcG2e^tBTB=!zl(ctna7UCXbU05xb{&m&
uNmzk$T_va%^resWIS*rFP_Df<1EB21Ni0?bMv1;K?>iXNN1?388Lt3R{cdRh

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll
deleted file mode 100644
index ddb3226..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.opt.ll
+++ /dev/null
@@ -1,64 +0,0 @@
-; ModuleID = 'air_project/sub_kernel_0_core_0_3.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf3 = external local_unnamed_addr global [256 x float]
-@buf4 = external local_unnamed_addr global [256 x float]
-@buf5 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #0
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #0
-
-; Function Attrs: noreturn nounwind
-define void @core_0_3() local_unnamed_addr #1 {
-  br label %1
-
-1:                                                ; preds = %19, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %17, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf5, i20 %4
-  %6 = load <8 x i64>, ptr %5, align 64
-  %7 = getelementptr float, ptr @buf4, i20 %4
-  %8 = load <8 x i64>, ptr %7, align 64
-  %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %11 = bitcast <32 x i64> %9 to <64 x float>
-  %12 = bitcast <32 x i64> %10 to <64 x float>
-  %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60)
-  %14 = bitcast <64 x float> %13 to <32 x i64>
-  %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %16 = getelementptr float, ptr @buf3, i20 %4
-  store <8 x i64> %15, ptr %16, align 64
-  %17 = add nuw nsw i32 %3, 16
-  %18 = icmp ult i32 %3, 240
-  br i1 %18, label %2, label %19, !llvm.loop !1
-
-19:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 53, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
-declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2
-
-attributes #0 = { nounwind }
-attributes #1 = { noreturn nounwind }
-attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll
deleted file mode 100644
index 8b8d6a6..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_3.peanohack.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_3() {
-  br label %1
-
-1:                                                ; preds = %22, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %22
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf5, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = getelementptr float, ptr @buf4, i32 %3
-  %9 = load <16 x float>, ptr %8
-  %10 = bitcast <16 x float> %7 to <8 x i64>
-  %11 = bitcast <16 x float> %9 to <8 x i64>
-  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = bitcast <32 x i64> %12 to <64 x float>
-  %15 = bitcast <32 x i64> %13 to <64 x float>
-  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
-  %17 = bitcast <64 x float> %16 to <32 x i64>
-  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %19 = bitcast <8 x i64> %18 to <16 x float>
-  %20 = getelementptr float, ptr @buf3, i32 %3
-  store <16 x float> %19, ptr %20
-  %21 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-22:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.elf
deleted file mode 100755
index cfc7551935cf7c8534f551f4b481c91b57b4768e..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1724
zcma)7O=uKn7=C9n-Hp<pO+;Bx;Uk5TO5Du2tI2xl7}gTn4WtK4(8Dl)$qw0_U1nxu
z)PswNEhyfEEwt1_3#F$}Xel`bwAUiu3mOzJ9#T&^sPTE{XLt4F1K;<)&-?Se?|k1k
zGgr^nzLX@1Qe<ef&&n|V@&VBZWSInIXp(Xy6GwM3Qf$G%1EG}1_~vP7r@mpC9VUq`
zOY|F2l^*TO^}U}RA=>_GM9|I@J?14(VD$@<AbPa-SDrQ=NCaKVZ01RMjJnryn@duC
zUPA|?o9n+yT3$tFjKb*Lp;(*B%O7Q5_ZDuC^%nmfdvYdce6RKn67}vx4}_rkFXZvw
zUi)ZO<5jC-8x5mAR&4kma{ag;@az7^L;S&+f_W<|O(g!f9Q!vioIk|J#akTfq0gvS
zuFZk(`s4arQYop2ezVekqQdN5JlA!?kpP8RN}dw&8sUth{ldzdZMZXBqxaLjPcrSD
zKSnmb+wHBq%uQuvZ6tX<<Rt4C;v?imsZhB;I20=Ua4%P&Nj_lUzC^UUM^tmYRmVRq
zYPG6RD@CQK5LpewTM=uHAGj?qv5Luh7`B7c)6*+%SnrrctJR!davQF<KHX@nHIEyv
zBg~RzD;2v`atynqR8Ey=ZO59i%XUdMOJ><pD~??$FIPT~dl;V4P9HM;zjpv^v2T&+
zF!Z}a{8Q-pev-uJ&<E%9-CrE?&w&dJ1-5Zqn6IIe9B)savT-eKz|XP2JK*P;0|JBd
zO(uAo7@U6vlI!ttvHu37z+f=H2`QKWdGH>jU;;4z9a1m>nEweWm;lWGffP&t<_{qS
z6M*@DtjM@G0Qa-aiZ<ZSn0E*KIdeIFqdEQsb8WzPnRf@AilMU}5+8fh@t_2knxSDr
z)`QTGwZ1g0RXuPoI|1%j?c0T_Ycw1yTxi*yhC{{L+?=ATi97+R*UoyjLY5z@MazO=
zx#L;7>8!Y(N><BvbVV<d*;$_DX@;j$r0YRw_#x>*JJ$3yqtS8n>H<@1mGq|Jdb;bm
zA+bif6}J2!&0@Q5_`Y!|wZv}6vnfhSoDsK&Gh@Pb^l_4>sJ9Ypsv6A-&z0ytXuOZ_
z2?=pL;$6q~L*HUUg5=eWZXKKPsz%T<L>CjCMY;~h!_~*%ARBP<cb;!9-v*AsHzWE-
oKt!m+F_RL=w1o-i{4YuIGvZ#Q1Tu{~mgwEY_ah2&&D5XDH`iMZ9{>OV

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script
deleted file mode 100644
index ddda3c2..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ld.script
+++ /dev/null
@@ -1,78 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-. = 0x40400;
-buf5 = .;
-. += 0x400;
-. = 0x44000;
-buf4 = .;
-. += 0x400;
-. = 0x48000;
-buf3 = .;
-. += 0x400;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-. = 0x60400;
-buf11 = .;
-. += 0x400;
-. = 0x64000;
-buf10 = .;
-. += 0x400;
-. = 0x68000;
-buf9 = .;
-. += 0x400;
-. = 0x70400;
-buf8 = .;
-. += 0x400;
-. = 0x74000;
-buf7 = .;
-. += 0x400;
-. = 0x78000;
-buf6 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_4);
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll
deleted file mode 100644
index 54f47e7..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_4() {
-  br label %1
-
-1:                                                ; preds = %22, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %22
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf8, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = getelementptr float, ptr @buf7, i32 %3
-  %9 = load <16 x float>, ptr %8, align 4
-  %10 = bitcast <16 x float> %7 to <8 x i64>
-  %11 = bitcast <16 x float> %9 to <8 x i64>
-  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = bitcast <32 x i64> %12 to <64 x float>
-  %15 = bitcast <32 x i64> %13 to <64 x float>
-  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
-  %17 = bitcast <64 x float> %16 to <32 x i64>
-  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %19 = bitcast <8 x i64> %18 to <16 x float>
-  %20 = getelementptr float, ptr @buf6, i32 %3
-  store <16 x float> %19, ptr %20, align 4
-  %21 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-22:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.o
deleted file mode 100644
index 01ddce691877b0d149c944b2a2bff263cbb20063..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 984
zcmaJ<O=}ZT6g@9ZlOR>nfSQF$an;Wuu|?yirm?MAOr%hhf<l-iQ<RQPnNFa(sYn+t
zMBE6obmP7&7g_Ze6n{bme?a>OjOV_2Nv2u!!hQFgckkS<d53HEx>5>#QOLv?aFPJV
z=v>kP(@0|&D_4JjED8j#)D&>m`<2Bp8L>)q>5=G9>F$<#Q!NAIneM?)Rn8XA=Xo!Z
z&s1(jXg-^~p86NozD|Z4r;~qI($42%coPV}_e)lvBy(YJ@9rwjn3iWs*Jno0pOE^@
z%6`a?{2MX;dnMoal2Z4LKPo5w<AlV=cy8(vXFm3+dzIP}IO>n;ov1}qkNu|9HO^J~
z!MTo1@_Y$qN^>zVw+9GF;-c3gmvu%mhSvli4U<3N5ueBSropo49Q}_6KKU`g@(6!A
z!iCOok0KJkkk$})Xy!5z4rpfO5l}u*XOkM@9&0MT4=q;U9t5y<+ktC6d;Vf!KX95m
zu<e(%)who4HiKHb)$v?d`+*-g4chnb#5OwHB{?3+@er2pdXCOzHQT;hFVvS{*_BG6
z{=nEGZQWkmZ4K9CTDZ#0&}l}{VO93siG9FPw1^coD?wS`4iDLpNR2ugryslaM!OVP
trOLX>R4uG4camTF5#Iz=*52=d5PNYF^A(AiVjvRtjZDJFsgh56{10FHZgT(t

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll
deleted file mode 100644
index de0f954..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.opt.ll
+++ /dev/null
@@ -1,64 +0,0 @@
-; ModuleID = 'air_project/sub_kernel_0_core_0_4.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf6 = external local_unnamed_addr global [256 x float]
-@buf7 = external local_unnamed_addr global [256 x float]
-@buf8 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #0
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #0
-
-; Function Attrs: noreturn nounwind
-define void @core_0_4() local_unnamed_addr #1 {
-  br label %1
-
-1:                                                ; preds = %19, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %17, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf8, i20 %4
-  %6 = load <8 x i64>, ptr %5, align 64
-  %7 = getelementptr float, ptr @buf7, i20 %4
-  %8 = load <8 x i64>, ptr %7, align 64
-  %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %11 = bitcast <32 x i64> %9 to <64 x float>
-  %12 = bitcast <32 x i64> %10 to <64 x float>
-  %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60)
-  %14 = bitcast <64 x float> %13 to <32 x i64>
-  %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %16 = getelementptr float, ptr @buf6, i20 %4
-  store <8 x i64> %15, ptr %16, align 64
-  %17 = add nuw nsw i32 %3, 16
-  %18 = icmp ult i32 %3, 240
-  br i1 %18, label %2, label %19, !llvm.loop !1
-
-19:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 53, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
-declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2
-
-attributes #0 = { nounwind }
-attributes #1 = { noreturn nounwind }
-attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll
deleted file mode 100644
index 56c3882..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_4.peanohack.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_4() {
-  br label %1
-
-1:                                                ; preds = %22, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %22
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf8, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = getelementptr float, ptr @buf7, i32 %3
-  %9 = load <16 x float>, ptr %8
-  %10 = bitcast <16 x float> %7 to <8 x i64>
-  %11 = bitcast <16 x float> %9 to <8 x i64>
-  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = bitcast <32 x i64> %12 to <64 x float>
-  %15 = bitcast <32 x i64> %13 to <64 x float>
-  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
-  %17 = bitcast <64 x float> %16 to <32 x i64>
-  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %19 = bitcast <8 x i64> %18 to <16 x float>
-  %20 = getelementptr float, ptr @buf6, i32 %3
-  store <16 x float> %19, ptr %20
-  %21 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-22:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.elf b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.elf
deleted file mode 100755
index 4588246a4ddfdd6db12c4f239462db7892609dba..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1660
zcma)7O=uHQ5S~qAQpKNGQ35JF6r@T`cTH>)FRmquXbSbB1(jv9yKR>yo3Oj7tsn*w
z^q}5^C<tBz4<3pKm7WR;-o%4<f6#*$(VGXcezQMG^yCNc&3rTSX69vfH&>5WPDqkO
zF%mS`Wd#_`bwmS@1rn5?VM<eyIJ%7$V-*Ho2&D|xnJ>N>`}$?7pG106qQ^vK`mjFT
z_3qn6^y$=qpzkB}ky9Q*_Vbb;`mpvaL(6X^f-WUiG9-UQ-D~NUX=(1HiVg->79UA!
zM#0Gt*};h&p*E69Zc6>`Oy1ernR>JJ^RcvkQ|W9c>f8-3grJj0lS7@g=I*k}S<B)R
z8b*C&thpPxe%KHAb$4?ge|s!jy_J&oME<ZG`<D}ZzK;)yzc})t&!AVX&4K^=!}_aI
zKB|X)Q_=>aZ1ExPb)7I0AX}v9mWWpgGYWn$WL{~)o!}as2QRlIn%|!eEZ_Lqnf;X>
zNhH;Q=zcg8<<EvA<fN1>z3L5xviDx5voy>D23|=-Ki7yVwli<JM?|Gk7D_26=VT(I
zraQA@!E!yj;Y3z3Jm>pO@5t!rtnJUWsyU-kADy;qwzD`|t1Z+I>b51SdBc=TW<GD}
zW?n9h6^f>1jGF~BuT=BZf}xZwvs9QV?GJnC-_cGNN&2z24Q#4wk!UCMhkbl6^i21V
z#B=C^&-1fC)#sl87g#cELwI0LL#Jf;^JFMRXPB!!evWy&$1gAk1QwqUUyz>v3hZ2u
z$Ho4ekOGUr{5GUu0%XANLkcDU^Cysk3Bde0q+kLte+?;^0L<S(3MK&a_pBuG+yH!^
z&#b6D{*`&V$0_Gqi$3uf>y`t>TdMnd6>`q=-B9aF-I&)r`?BTXd8wS8EZcg`GW^Mg
z*{WHTt4vJDni9!FkXq%qW6ET>zLGNx7-m|Ip;fI}+fm49xRxeshpF0{QIuepc`uQs
zdA{!Yq<PIy(-!nv%hJk|OpSTc>bmV{wqyIm8fk{#aJ~2x+cn*F^`+Pnx-G|~;85fY
zxY>U)By2|?Mc&4F8w&=i@(=fakPyZz{!X@`ud*RR^y=Xo9LC{Q3!o;5E=4+vI1k6e
z)x`^tHk|yO<{Ugtj=?)0eB&Vk6yX?;31r;D9_ajE5Tk*(Z!v+4;|@f6JMvvbL9QA5
GWBCW<3<TZ)

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ld.script b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ld.script
deleted file mode 100644
index 51c13db..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ld.script
+++ /dev/null
@@ -1,72 +0,0 @@
-
-MEMORY
-{
-   program (RX) : ORIGIN = 0, LENGTH = 0x0020000
-   data (!RX) : ORIGIN = 0x78400, LENGTH = 0x7C00
-}
-ENTRY(__start)
-SECTIONS
-{
-  . = 0x0;
-  .text : {
-     /* the __start symbol has to come at address zero. */
-     *crt0.o(.text*)
-     _ctors_start = .;
-     _init_array_start = .;
-     KEEP(SORT(*.init_array))
-     _ctors_end = .;
-     _init_array_end = .;
-     _dtors_start = .;
-     _dtors_end = .;
-     *(.text*)
-  } > program
-  .data : {
-     *(.data*)
-     *(.rodata*)
-  } > data
-  .comment : {
-     *(.comment*)
-  }
-  .symtab : {
-     *(.symtab)
-  }
-  .shstrtab : {
-     *(.shstrtab)
-  }
-  .strtab : {
-     *(.strtab)
-  }
-  .stack_sizes : {
-     *(.stack_sizes)
-  }
-
-. = 0x70000;
-_sp_start_value_DM_stack = .;
-. += 0x400; /* stack */
-. = 0x40400;
-buf8 = .;
-. += 0x400;
-. = 0x44000;
-buf7 = .;
-. += 0x400;
-. = 0x48000;
-buf6 = .;
-. += 0x400;
-/* No tile with memory exists to the west. */
-. = 0x50000;
-. += 0x10000;
-/* No tile with memory exists to the north. */
-. = 0x60000;
-. += 0x10000;
-. = 0x70400;
-buf11 = .;
-. += 0x400;
-. = 0x74000;
-buf10 = .;
-. += 0x400;
-. = 0x78000;
-buf9 = .;
-. += 0x400;
-  .bss : { *(.bss*) } > data
-}
-PROVIDE(main = core_0_5);
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll
deleted file mode 100644
index 8972a4d..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_5() {
-  br label %1
-
-1:                                                ; preds = %22, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %22
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf11, i32 %3
-  %7 = load <16 x float>, ptr %6, align 4
-  %8 = getelementptr float, ptr @buf10, i32 %3
-  %9 = load <16 x float>, ptr %8, align 4
-  %10 = bitcast <16 x float> %7 to <8 x i64>
-  %11 = bitcast <16 x float> %9 to <8 x i64>
-  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = bitcast <32 x i64> %12 to <64 x float>
-  %15 = bitcast <32 x i64> %13 to <64 x float>
-  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
-  %17 = bitcast <64 x float> %16 to <32 x i64>
-  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %19 = bitcast <8 x i64> %18 to <16 x float>
-  %20 = getelementptr float, ptr @buf9, i32 %3
-  store <16 x float> %19, ptr %20, align 4
-  %21 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-22:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.o b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.o
deleted file mode 100644
index ca78f75bdee0299a5c24ffa00da8f22d235605ba..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 984
zcmZ`%O-~b16g{t$wlPRb6SWI9#1$Y+I({Uu(NaN5GBw5oVsOz+%NUH6;xI&XqcJR8
z5aPyUmTcIwa?usPfbbI_;RnDUu$=qmwH=yxlRNjEcTeuU@6P)^@4Qq>p<@b}7z2(H
zzyOm^v|tQr^t5tx24qnnc&A2z)Arvi4#|jBqQ#7;f2Lb+)mpg-3}#x}2dbF0FD7{|
zmd})KM`$veyqWqJR`y21*T*C0vuXF29o_=MKb@B4=gC~y-nci1Q<mj7rTepD!yl6V
ztjc-Fcl_ZPKbg(fc2nws$w%!Zf0z(`jORwLan|FUdRFNz!v1{J??^p1{Wxz*-Qc~Z
z_j#`!Q#?Nnb4zp34R^W-NaC`+LN4o!YyfWvei$ZyVvYDB#up8iMdz5Wb#wA#fX{t=
zrjH99=N`pGKb6)HcNrdLA{;PG$RnVAqfaC?#C`Tud>`7Zz}pUBtv3VDT6ncIwHdhe
zk8qsj%G?LH;njmmb9JlX!P*S`z^yTU{Uour^?nAH?=@Uq!KycXuUe=+h2@k=h3X>{
zZ4ufwS=E_eUoB|bTa+2%|6!ada|JCbbM7xOIj^EkqPSTJiu$zoksXP>p!e(aW7pYe
wmja*BWnD$OHufzONPU?{azk`kduIkhY{W?{XA`qT*F^W5Oyq-fsV6gD0S*Oj=>Px#

diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll
deleted file mode 100644
index d08aa8f..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.opt.ll
+++ /dev/null
@@ -1,64 +0,0 @@
-; ModuleID = 'air_project/sub_kernel_0_core_0_5.peanohack.ll'
-source_filename = "LLVMDialectModule"
-target datalayout = "e-m:e-p:20:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-f32:32:32-i64:32-f64:32-a:0:32-n32"
-target triple = "aie2p"
-
-@buf9 = external local_unnamed_addr global [256 x float]
-@buf10 = external local_unnamed_addr global [256 x float]
-@buf11 = external local_unnamed_addr global [256 x float]
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.acquire(i32, i32) #0
-
-; Function Attrs: nounwind
-declare void @llvm.aie2p.release(i32, i32) #0
-
-; Function Attrs: noreturn nounwind
-define void @core_0_5() local_unnamed_addr #1 {
-  br label %1
-
-1:                                                ; preds = %19, %0
-  tail call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  tail call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %1, %2
-  %3 = phi i32 [ 0, %1 ], [ %17, %2 ]
-  %4 = trunc nuw i32 %3 to i20
-  %5 = getelementptr float, ptr @buf11, i20 %4
-  %6 = load <8 x i64>, ptr %5, align 64
-  %7 = getelementptr float, ptr @buf10, i20 %4
-  %8 = load <8 x i64>, ptr %7, align 64
-  %9 = shufflevector <8 x i64> %6, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %10 = shufflevector <8 x i64> %8, <8 x i64> poison, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %11 = bitcast <32 x i64> %9 to <64 x float>
-  %12 = bitcast <32 x i64> %10 to <64 x float>
-  %13 = tail call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %11, <64 x float> %12, i32 60)
-  %14 = bitcast <64 x float> %13 to <32 x i64>
-  %15 = shufflevector <32 x i64> %14, <32 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %16 = getelementptr float, ptr @buf9, i20 %4
-  store <8 x i64> %15, ptr %16, align 64
-  %17 = add nuw nsw i32 %3, 16
-  %18 = icmp ult i32 %3, 240
-  br i1 %18, label %2, label %19, !llvm.loop !1
-
-19:                                               ; preds = %2
-  tail call void @llvm.aie2p.release(i32 51, i32 1)
-  tail call void @llvm.aie2p.release(i32 53, i32 1)
-  tail call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read)
-declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32) #2
-
-attributes #0 = { nounwind }
-attributes #1 = { noreturn nounwind }
-attributes #2 = { mustprogress nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: read) }
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll b/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll
deleted file mode 100644
index 69f695d..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_core_0_5.peanohack.ll
+++ /dev/null
@@ -1,95 +0,0 @@
-; ModuleID = 'LLVMDialectModule'
-source_filename = "LLVMDialectModule"
-target triple = "aie2p"
-
-@buf0 = external global [256 x float]
-@buf1 = external global [256 x float]
-@buf2 = external global [256 x float]
-@buf3 = external global [256 x float]
-@buf4 = external global [256 x float]
-@buf5 = external global [256 x float]
-@buf6 = external global [256 x float]
-@buf7 = external global [256 x float]
-@buf8 = external global [256 x float]
-@buf9 = external global [256 x float]
-@buf10 = external global [256 x float]
-@buf11 = external global [256 x float]
-@buf12 = external global [1024 x float]
-@buf13 = external global [1024 x float]
-@buf14 = external global [1024 x float]
-
-declare void @debug_i32(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.event(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.put.ms(i32, i32)
-
-; Unknown intrinsic
-declare { i32, i32 } @llvm.aie2p.get.ss()
-
-; Unknown intrinsic
-declare void @llvm.aie2p.mcd.write.vec(<16 x i32>, i32)
-
-; Unknown intrinsic
-declare <16 x i32> @llvm.aie2p.scd.read.vec(i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.acquire(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.release(i32, i32)
-
-; Unknown intrinsic
-declare void @llvm.aie2p.set.ctrl.reg(i32, i32)
-
-define void @core_0_5() {
-  br label %1
-
-1:                                                ; preds = %22, %0
-  call void @llvm.aie2p.acquire(i32 49, i32 -1)
-  call void @llvm.aie2p.acquire(i32 50, i32 -1)
-  call void @llvm.aie2p.acquire(i32 52, i32 -1)
-  br label %2
-
-2:                                                ; preds = %5, %1
-  %3 = phi i32 [ %21, %5 ], [ 0, %1 ]
-  %4 = icmp slt i32 %3, 256
-  br i1 %4, label %5, label %22
-
-5:                                                ; preds = %2
-  %6 = getelementptr float, ptr @buf11, i32 %3
-  %7 = load <16 x float>, ptr %6
-  %8 = getelementptr float, ptr @buf10, i32 %3
-  %9 = load <16 x float>, ptr %8
-  %10 = bitcast <16 x float> %7 to <8 x i64>
-  %11 = bitcast <16 x float> %9 to <8 x i64>
-  %12 = shufflevector <8 x i64> %10, <8 x i64> %10, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %13 = shufflevector <8 x i64> %11, <8 x i64> %11, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-  %14 = bitcast <32 x i64> %12 to <64 x float>
-  %15 = bitcast <32 x i64> %13 to <64 x float>
-  %16 = call <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float> %14, <64 x float> %15, i32 60)
-  %17 = bitcast <64 x float> %16 to <32 x i64>
-  %18 = shufflevector <32 x i64> %17, <32 x i64> %17, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %19 = bitcast <8 x i64> %18 to <16 x float>
-  %20 = getelementptr float, ptr @buf9, i32 %3
-  store <16 x float> %19, ptr %20
-  %21 = add i32 %3, 16
-  br label %2, !llvm.loop !1
-
-22:                                               ; preds = %2
-  call void @llvm.aie2p.release(i32 51, i32 1)
-  call void @llvm.aie2p.release(i32 53, i32 1)
-  call void @llvm.aie2p.release(i32 48, i32 1)
-  br label %1
-}
-
-; Unknown intrinsic
-declare <64 x float> @llvm.aie2p.ACC2048.accfloat.sub.conf(<64 x float>, <64 x float>, i32)
-
-!llvm.module.flags = !{!0}
-
-!0 = !{i32 2, !"Debug Info Version", i32 3}
-!1 = distinct !{!1, !2}
-!2 = !{!"llvm.loop.mustprogress"}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_design.bif b/examples/elementwise_arith/air_project/sub_kernel_0_design.bif
deleted file mode 100644
index bbeec41..0000000
--- a/examples/elementwise_arith/air_project/sub_kernel_0_design.bif
+++ /dev/null
@@ -1,10 +0,0 @@
-all:
-{
-  id_code = 0x14ca8093
-  extended_id_code = 0x01
-  image
-  {
-    name=aie_image, id=0x1c000000
-    { type=cdo file=air_project/sub_kernel_0_aie_cdo_elfs.bin file=air_project/sub_kernel_0_aie_cdo_init.bin file=air_project/sub_kernel_0_aie_cdo_enable.bin }
-  }
-}
diff --git a/examples/elementwise_arith/air_project/sub_kernel_0_sub_kernel_0_sequence.bin b/examples/elementwise_arith/air_project/sub_kernel_0_sub_kernel_0_sequence.bin
deleted file mode 100644
index f2eb383b7eb903e8ad8809b7d44fb0a4d660bfb6..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 3248
zcmcJQ-Aw~A5QNu<g?K=M2OfApq6JF0GAPG%L<ue-N=1yF_0OCH8j;via_gU++3a_@
zvk>neg4w5;eLtEZyo^(@j}krmd|WcZ&8OQJf_lyu+9e|Yukmso`>#0IXQ!}T>04{-
zdWRz5+iJIiZ@khs7OO7qzIV<49O}M3Vd35Et*zr4+n!k}H6?s5ZHHP2b8iNd9f!uq
z2%ELeWFG=0l9+H#Mw;SV&vYiNQGA=m?@TyH^<^fmv5ZNwd1Or5@6ws1SwdvOIT;Fy
zFEe2cjS1(dzRbimmN6++jgb*5y%`xL$~=|+C!AAeiZ3%^4UGxssJ_g^HI^|MPc*|&
zYb<A{(Vr%DO=9vtUuMD@8WYaRC|J1{n2BpFW3p2})f#K>`xk4J-t?`*yLP<eIY;$n
zCaj?`;T+YMnYhL>CTa8!jZ|yI(P^-T#)NY+)Rg;`nXrb&gmYA1X5t#lnB*HUVGWH5
W=cv75Caj?`;T+YMnYhL+nEU|x-i@&U

diff --git a/examples/elementwise_arith/air_project/tt.mlir b/examples/elementwise_arith/air_project/tt.mlir
deleted file mode 100644
index cfdc62d..0000000
--- a/examples/elementwise_arith/air_project/tt.mlir
+++ /dev/null
@@ -1,35 +0,0 @@
-#loc = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)
-#loc10 = loc("X"(#loc))
-#loc11 = loc("OUT"(#loc))
-module {
-  tt.func public @square_kernel(%X: !tt.ptr<i16> {tt.divisibility = 16 : i32} loc("X"(#loc)), %OUT: !tt.ptr<i16> {tt.divisibility = 16 : i32} loc("OUT"(#loc))) attributes {noinline = false} {
-    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)
-    %pid = tt.get_program_id x : i32 loc(#loc12)
-    %offsets = arith.muli %pid, %c1024_i32 : i32 loc(#loc13)
-    %offsets_0 = tt.make_range {end = 1024 : i32, start = 0 : i32} : tensor<1024xi32> loc(#loc14)
-    %offsets_1 = tt.splat %offsets : i32 -> tensor<1024xi32> loc(#loc13)
-    %offsets_2 = arith.addi %offsets_1, %offsets_0 : tensor<1024xi32> loc(#loc13)
-    %x = tt.splat %X : !tt.ptr<i16> -> tensor<1024x!tt.ptr<i16>> loc(#loc15)
-    %x_3 = tt.addptr %x, %offsets_2 : tensor<1024x!tt.ptr<i16>>, tensor<1024xi32> loc(#loc15)
-    %x_4 = tt.load %x_3 : tensor<1024x!tt.ptr<i16>> loc(#loc16)
-    %0 = tt.splat %OUT : !tt.ptr<i16> -> tensor<1024x!tt.ptr<i16>> loc(#loc7)
-    %1 = tt.addptr %0, %offsets_2 : tensor<1024x!tt.ptr<i16>>, tensor<1024xi32> loc(#loc7)
-    %2 = arith.muli %x_4, %x_4 : tensor<1024xi16> loc(#loc8)
-    tt.store %1, %2 : tensor<1024x!tt.ptr<i16>> loc(#loc9)
-    tt.return loc(#loc)
-  } loc(#loc)
-} loc(#loc)
-#loc1 = loc(unknown)
-#loc2 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":87:11)
-#loc3 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:15)
-#loc4 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:34)
-#loc5 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:17)
-#loc6 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:9)
-#loc7 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:14)
-#loc8 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:32)
-#loc9 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:5)
-#loc12 = loc("pid"(#loc2))
-#loc13 = loc("offsets"(#loc3))
-#loc14 = loc("offsets"(#loc4))
-#loc15 = loc("x"(#loc5))
-#loc16 = loc("x"(#loc6))
diff --git a/examples/elementwise_arith/tt.shared.mlir b/examples/elementwise_arith/tt.shared.mlir
deleted file mode 100644
index dc6929b..0000000
--- a/examples/elementwise_arith/tt.shared.mlir
+++ /dev/null
@@ -1 +0,0 @@
-b'#loc = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)\n#loc5 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:9)\n#map = affine_map<(d0) -> (d0)>\n#loc8 = loc("X"(#loc))\n#loc9 = loc("OUT"(#loc))\n#loc12 = loc("x"(#loc5))\nmodule {\n  func.func @square_kernel(%arg0: memref<*xf32> {tt.divisibility = 16 : i32} loc("X"(#loc)), %arg1: memref<*xf32> {tt.divisibility = 16 : i32} loc("OUT"(#loc)), %arg2: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg3: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg4: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg5: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg6: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1), %arg7: i32 loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":86:1)) {\n    %c1024_i32 = arith.constant 1024 : i32 loc(#loc1)\n    %0 = arith.muli %arg5, %c1024_i32 : i32 loc(#loc10)\n    %1 = arith.index_cast %0 : i32 to index loc(#loc3)\n    %reinterpret_cast = memref.reinterpret_cast %arg0 to offset: [%1], sizes: [1024], strides: [1] : memref<*xf32> to memref<1024xf32, strided<[1], offset: ?>> loc(#loc11)\n    %alloc = memref.alloc() : memref<1024xf32> loc(#loc12)\n    memref.copy %reinterpret_cast, %alloc : memref<1024xf32, strided<[1], offset: ?>> to memref<1024xf32> loc(#loc12)\n    %2 = bufferization.to_tensor %alloc restrict writable : memref<1024xf32> to tensor<1024xf32> loc(#loc12)\n    %reinterpret_cast_0 = memref.reinterpret_cast %arg1 to offset: [%1], sizes: [1024], strides: [1] : memref<*xf32> to memref<1024xf32, strided<[1], offset: ?>> loc(#loc3)\n    %3 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins(%2, 
%2 : tensor<1024xf32>, tensor<1024xf32>) outs(%2 : tensor<1024xf32>) {\n    ^bb0(%in: f32 loc("x"(#loc5)), %in_1: f32 loc("x"(#loc5)), %out: f32 loc("x"(#loc5))):\n      %4 = arith.mulf %in, %in_1 : f32 loc(#loc6)\n      linalg.yield %4 : f32 loc(#loc6)\n    } -> tensor<1024xf32> loc(#loc6)\n    bufferization.materialize_in_destination %3 in writable %reinterpret_cast_0 : (tensor<1024xf32>, memref<1024xf32, strided<[1], offset: ?>>) -> () loc(#loc7)\n    return loc(#loc)\n  } loc(#loc)\n} loc(#loc)\n#loc1 = loc(unknown)\n#loc2 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":88:15)\n#loc3 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:14)\n#loc4 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":89:17)\n#loc6 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:32)\n#loc7 = loc("/home/strixminipc/Triton-XDNA/examples/elementwise_arith/elementwise_arith.py":90:5)\n#loc10 = loc("offsets"(#loc2))\n#loc11 = loc("x"(#loc4))\n\n'
\ No newline at end of file

From 1998bd8a8a73703746c3627caf534adbf198bfd1 Mon Sep 17 00:00:00 2001
From: erwei-xilinx <erwei.wang@amd.com>
Date: Thu, 9 Apr 2026 22:42:05 -0700
Subject: [PATCH 9/9] Add NPU1 (AIE2) support to elementwise_arith example
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Create transform_binary_aie2.mlir and transform_unary_aie2.mlir for
NPU1 targets — content is identical to the AIE2P variants since
@DTYPE@ and @VECTOR_SIZE@ placeholders handle the differences.

Update elementwise_arith.py to auto-detect the NPU version via
detect_npu_version() and select the correct transform script suffix
(aie2 vs aie2p) instead of hardcoding aie2p.

Update generate_readme.py get_device_support() to use glob patterns
so it detects both the plain transform_aie2.mlir name and the
transform_*_aie2.mlir naming convention used by multi-op examples
(and likewise for the aie2p variants).

Tested on NPU1 (Phoenix/AIE2): all 7 test cases pass
(sub bf16/f32, mul bf16/f32, div f32, square bf16/f32).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../elementwise_arith/elementwise_arith.py    |  8 +++-
 .../transform_binary_aie2.mlir                | 40 +++++++++++++++++++
 .../transform_unary_aie2.mlir                 | 40 +++++++++++++++++++
 examples/generate_readme.py                   |  7 +++-
 4 files changed, 91 insertions(+), 4 deletions(-)
 create mode 100644 examples/elementwise_arith/transform_binary_aie2.mlir
 create mode 100644 examples/elementwise_arith/transform_unary_aie2.mlir

diff --git a/examples/elementwise_arith/elementwise_arith.py b/examples/elementwise_arith/elementwise_arith.py
index 04d4844..cd4c678 100644
--- a/examples/elementwise_arith/elementwise_arith.py
+++ b/examples/elementwise_arith/elementwise_arith.py
@@ -174,14 +174,18 @@ def bench_op(op, N, provider, cfg):
     if cfg["bf16_emulation"]:
         os.environ["AMD_TRITON_NPU_BF16_EMULATION"] = "1"
 
-    # Select the right transform script based on op arity.
+    # Select the right transform script based on op arity and NPU version.
     # If AIR_TRANSFORM_TILING_SCRIPT is already set, respect it.
     if not os.environ.get("AIR_TRANSFORM_TILING_SCRIPT"):
+        from triton.backends.amd_triton_npu.driver import detect_npu_version
+
         is_unary = args.op == "square"
         script_dir = os.path.dirname(os.path.abspath(__file__))
         arity = "unary" if is_unary else "binary"
+        npu = detect_npu_version()
+        suffix = "aie2" if npu == "npu1" else "aie2p"
         os.environ["AIR_TRANSFORM_TILING_SCRIPT"] = os.path.join(
-            script_dir, f"transform_{arity}_aie2p.mlir"
+            script_dir, f"transform_{arity}_{suffix}.mlir"
         )
 
     benchmark.select_npu_backend()
diff --git a/examples/elementwise_arith/transform_binary_aie2.mlir b/examples/elementwise_arith/transform_binary_aie2.mlir
new file mode 100644
index 0000000..ccec81d
--- /dev/null
+++ b/examples/elementwise_arith/transform_binary_aie2.mlir
@@ -0,0 +1,40 @@
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+////////////////////////////////////////////////////////////////////////////////
+// Transform Script for Binary Elementwise Ops (AIE2): sub, mul, div
+// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders.
+// Uses shared library sequences from transform_library.mlir (auto-injected).
+////////////////////////////////////////////////////////////////////////////////
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      %arg1: !transform.any_op {transform.readonly}) {
+
+    transform.include @canonicalize_with_fold_dims failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @fuse_elementwise_and_canonicalize failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @flatten_tile_forall failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @canonicalize_with_cse failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @pad_and_promote_binary_@DTYPE@ failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @canonicalize_with_cse failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @one_shot_bufferize failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @post_bufferize_cleanup failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    %vh = transform.include @air_herd_mapping_and_vectorize
+        failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
+    transform.include @cast_bf16_only_ops failures(propagate)
+        (%vh) : (!transform.any_op) -> ()
+
+    transform.yield
+  }
+}
diff --git a/examples/elementwise_arith/transform_unary_aie2.mlir b/examples/elementwise_arith/transform_unary_aie2.mlir
new file mode 100644
index 0000000..2e09a8b
--- /dev/null
+++ b/examples/elementwise_arith/transform_unary_aie2.mlir
@@ -0,0 +1,40 @@
+// Copyright (C) 2026, Advanced Micro Devices, Inc. All rights reserved.
+// SPDX-License-Identifier: MIT
+
+////////////////////////////////////////////////////////////////////////////////
+// Transform Script for Unary Elementwise Ops (AIE2): square
+// Dtype-generic: uses @DTYPE@ and @VECTOR_SIZE@ placeholders.
+// Uses shared library sequences from transform_library.mlir (auto-injected).
+////////////////////////////////////////////////////////////////////////////////
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(
+      %arg1: !transform.any_op {transform.readonly}) {
+
+    transform.include @canonicalize_with_fold_dims failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @fuse_elementwise_and_canonicalize failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @flatten_tile_forall failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @canonicalize_with_cse failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @pad_and_promote_unary_@DTYPE@ failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @canonicalize_with_cse failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @one_shot_bufferize failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    transform.include @post_bufferize_cleanup failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+
+    transform.include @vectorize_generics_at_@VECTOR_SIZE@ failures(propagate)
+        (%arg1) : (!transform.any_op) -> ()
+    %vh = transform.include @air_herd_mapping_and_vectorize
+        failures(propagate) (%arg1) : (!transform.any_op) -> !transform.any_op
+    transform.include @cast_bf16_only_ops failures(propagate)
+        (%vh) : (!transform.any_op) -> ()
+
+    transform.yield
+  }
+}
diff --git a/examples/generate_readme.py b/examples/generate_readme.py
index bc75808..b64b9f5 100644
--- a/examples/generate_readme.py
+++ b/examples/generate_readme.py
@@ -161,10 +161,13 @@
 def get_device_support(example_dir):
     """Check which device targets have transform files.
 
+    Checks for both exact names (transform_aie2.mlir) and prefixed
+    variants (transform_*_aie2.mlir) used by multi-op examples.
+
     Returns (has_aie2, has_aie2p) as booleans.
     """
-    has_aie2 = (example_dir / "transform_aie2.mlir").exists()
-    has_aie2p = (example_dir / "transform_aie2p.mlir").exists()
+    has_aie2 = bool(list(example_dir.glob("transform*_aie2.mlir")))
+    has_aie2p = bool(list(example_dir.glob("transform*_aie2p.mlir")))
     return has_aie2, has_aie2p