diff --git a/.gitignore b/.gitignore index bdd09a81..838703d3 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,8 @@ jupyter_execute .nvcode/ llvm.sh* + +# nsys profiles +*cuda_gpu_kern_sum.json +*.sqlite +*.nsys-rep diff --git a/numbast/src/numbast/__init__.py b/numbast/src/numbast/__init__.py index 6a1d7648..79ef61c8 100644 --- a/numbast/src/numbast/__init__.py +++ b/numbast/src/numbast/__init__.py @@ -3,12 +3,6 @@ import numba -# Use pynvjitlink by default. This can avoid version mismatch between system driver and -# installed CTK version. -from pynvjitlink.patch import patch_numba_linker - -patch_numba_linker() - from numbast import numba_patch from numbast.struct import bind_cxx_struct, bind_cxx_structs diff --git a/numbast/src/numbast/numba_patch.py b/numbast/src/numbast/numba_patch.py index 41923e38..6652df4b 100644 --- a/numbast/src/numbast/numba_patch.py +++ b/numbast/src/numbast/numba_patch.py @@ -36,7 +36,7 @@ # added. -def nvrtc_compile(src, name, cc): +def nvrtc_compile(src, name, cc, ltoir=False): """ Compile a CUDA C/C++ source to PTX for a given compute capability. 
@@ -67,6 +67,8 @@ def nvrtc_compile(src, name, cc): numba_include = f"-I{numba_cuda_path}" options = [arch, *extra_include_paths, include, numba_include, "-rdc", "true"] options += extra_options + if ltoir: + options.append("-dlto") # Compile the program compile_error = nvrtc.compile_program(program, options) @@ -84,8 +86,11 @@ def nvrtc_compile(src, name, cc): msg = f"NVRTC log messages whilst compiling {name}:\n\n{log}" warnings.warn(msg) - ptx = nvrtc.get_ptx(program) - return ptx, log + if ltoir: + return nvrtc.get_lto(program), log + else: + ptx = nvrtc.get_ptx(program) + return ptx, log # Monkey-patch the existing implementation diff --git a/numbast_extensions/benchmarks/analyze.py b/numbast_extensions/benchmarks/analyze.py new file mode 100644 index 00000000..896b29d4 --- /dev/null +++ b/numbast_extensions/benchmarks/analyze.py @@ -0,0 +1,63 @@ +import click +import json +import pandas as pd + + +@click.command() +@click.argument( + "gold_name", type=click.Path(exists=True, dir_okay=False, file_okay=True) +) +@click.argument( + "py_lto_off_name", type=click.Path(exists=True, dir_okay=False, file_okay=True) +) +@click.argument( + "py_lto_on_name", type=click.Path(exists=True, dir_okay=False, file_okay=True) +) +def compare_gpu_kern(gold_name, py_lto_off_name, py_lto_on_name): + """Read profile results from gold run result and Numba kernel, compare them. + + GOLD_NAME: JSON profile result of the gold kernel. + PY_LTO_OFF_NAME / PY_LTO_ON_NAME: JSON profile results of the Numba kernel with LTO off / on. 
+ """ + with open(gold_name, "r") as goldf: + gold_kerns = json.load(goldf) + with open(py_lto_off_name, "r") as pyf: + lto_off_kerns = json.load(pyf) + with open(py_lto_on_name, "r") as pyf: + lto_on_kerns = json.load(pyf) + + gold_kern = gold_kerns[0] + lto_off_kern = lto_off_kerns[0] + lto_on_kern = lto_on_kerns[0] + + columns = [ + "GOLD: " + gold_kern["Name"], + "NUMBA LTO OFF: " + lto_off_kern["Name"], + "NUMBA LTO ON: " + lto_on_kern["Name"], + ] + index = [k for k in gold_kern.keys() if k != "Name"] + values = [ + (gold_kern[k], lto_off_kern[k], lto_on_kern[k]) + for k in gold_kern.keys() + if k != "Name" + ] + + df = pd.DataFrame(data=values, index=index, columns=columns) + + print(df) + + print("Perf Ratio (NUMBA LTO OFF / GOLD, %): ") + diff = df.iloc[:, 1] / df.iloc[:, 0] * 100 + diff.index = diff.index.str.strip("%ns)").str.strip("( ") + print(diff[["Avg", "Med", "Min", "Max", "StdDev"]]) + + print("---------") + + print("Perf Ratio (NUMBA LTO ON / GOLD, %): ") + diff = df.iloc[:, 2] / df.iloc[:, 0] * 100 + diff.index = diff.index.str.strip("%ns)").str.strip("( ") + print(diff[["Avg", "Med", "Min", "Max", "StdDev"]]) + + +if __name__ == "__main__": + compare_gpu_kern() diff --git a/numbast_extensions/benchmarks/run_benchmark.sh b/numbast_extensions/benchmarks/run_benchmark.sh new file mode 100755 index 00000000..e5c33dfd --- /dev/null +++ b/numbast_extensions/benchmarks/run_benchmark.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +NUMBAST_BENCH_KERN_REPETITION=1000 + +BENCH_NAME=test_arithmetic_bf16 + +PY_NAME=${BENCH_NAME}.py +PY_PTX=${BENCH_NAME}_py.ptx + +GOLD_NAME=${BENCH_NAME}_gold +GOLD_SRC_NAME=${GOLD_NAME}.cu +GOLD_PTX=${GOLD_NAME}.ptx + +COMPUTE_CAP=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader|head -n 1) +SMCC=sm_${COMPUTE_CAP//./} + +# Cleanup +rm -rf *.json *.nsys-rep *.sqlite $GOLD_NAME + +# Compile gold +nvcc --gpu-architecture=$SMCC $GOLD_SRC_NAME -o $GOLD_NAME + +# Prof gold +nsys profile --trace cuda --force-overwrite true -o 
gold.nsys-rep $GOLD_NAME + +# Prof py LTO OFF +nsys profile --trace cuda --force-overwrite true -o py_lto_off.nsys-rep --env-var NUMBA_CUDA_ENABLE_PYNVJITLINK=1 python $PY_NAME --lto False + +# Prof py LTO ON +nsys profile --trace cuda --force-overwrite true -o py_lto_on.nsys-rep --env-var NUMBA_CUDA_ENABLE_PYNVJITLINK=1 python $PY_NAME --lto True + +# Create gold nsys stat report +nsys stats --report cuda_gpu_kern_sum --format json --output . gold.nsys-rep + +# Analyze py LTO OFF nsys stat report +nsys stats --report cuda_gpu_kern_sum --format json --output . py_lto_off.nsys-rep + +# Analyze py LTO ON nsys stat report +nsys stats --report cuda_gpu_kern_sum --format json --output . py_lto_on.nsys-rep + +echo "Benchmark completes!" +echo "The below compares the performance between gold and Numba." +echo "" + +# Compare stat report +python analyze.py gold_cuda_gpu_kern_sum.json py_lto_off_cuda_gpu_kern_sum.json py_lto_on_cuda_gpu_kern_sum.json diff --git a/numbast_extensions/benchmarks/test_arithmetic_bf16.py b/numbast_extensions/benchmarks/test_arithmetic_bf16.py new file mode 100755 index 00000000..55b1ecaf --- /dev/null +++ b/numbast_extensions/benchmarks/test_arithmetic_bf16.py @@ -0,0 +1,49 @@ +import click +import os +import warnings + +import numba.cuda as cuda +import numpy as np +from numba import float32 + +from numbast_extensions.bf16 import ( + nv_bfloat16, + get_shims, +) + + +repetition_char = os.getenv("NUMBAST_BENCH_KERN_REPETITION", None) +if repetition_char is None: + warnings.warn( + "Unable to retrieve NUMBAST_BENCH_KERN_REPETITION environment variable in `py`." + "Assume repetition 1000." 
+ ) + repetition = 1000 +else: + repetition = int(repetition_char) + + +@click.command() +@click.option("--lto", type=click.BOOL, required=True) +def _run(lto): + @cuda.jit(link=get_shims(), lto=lto) + def kernel(arith): + # Binary Arithmetic Operators + a = nv_bfloat16(1.0) + b = nv_bfloat16(2.0) + + arith[0] = float32(a + b) + arith[1] = float32(a - b) + arith[2] = float32(a * b) + arith[3] = float32(a / b) + + arith = np.zeros(4, dtype=np.float32) + + for _ in range(repetition): + kernel[1, 1](arith) + + assert all(arith == [3.0, -1.0, 2.0, 0.5]) + + +if __name__ == "__main__": + _run() diff --git a/numbast_extensions/benchmarks/test_arithmetic_bf16_gold.cu b/numbast_extensions/benchmarks/test_arithmetic_bf16_gold.cu new file mode 100644 index 00000000..e3868435 --- /dev/null +++ b/numbast_extensions/benchmarks/test_arithmetic_bf16_gold.cu @@ -0,0 +1,48 @@ +#include +#include +#include + +#include + +__global__ void simple_kernel(float *arith) { + // Binary Arithmetic Operators + nv_bfloat16 a = nv_bfloat16(1.0f); + nv_bfloat16 b = nv_bfloat16(2.0f); + arith[0] = float(a + b); + arith[1] = float(a - b); + arith[2] = float(a * b); + arith[3] = float(a / b); +} + +int main(void) { + char *repetition_char = std::getenv("NUMBAST_BENCH_KERN_REPETITION"); + if (repetition_char == nullptr) + std::cout << "Unable to retrieve NUMBAST_BENCH_KERN_REPETITION environment " + "variable in `gold`. Assume repetition 1000." + << std::endl; + int repetition = + repetition_char ? 
std::stoi(std::string(repetition_char)) : 1000; + + int N = 4; + float *arith, *arith_d; + arith = (float *)malloc(N * sizeof(float)); + + cudaMalloc(&arith_d, N * sizeof(float)); + + for (int i = 0; i < N; i++) { + arith[i] = 0.0f; + } + cudaMemcpy(arith_d, arith, N * sizeof(float), cudaMemcpyHostToDevice); + + for (int i = 0; i < repetition; i++) + simple_kernel<<<1, 1>>>(arith_d); + + cudaDeviceSynchronize(); + + cudaMemcpy(arith, arith_d, N * sizeof(float), cudaMemcpyDeviceToHost); + + cudaFree(arith_d); + free(arith); + + return 0; +} diff --git a/numbast/benchmarks/test_arithmetic.py b/numbast_extensions/benchmarks/test_arithmetic_fp16.py similarity index 100% rename from numbast/benchmarks/test_arithmetic.py rename to numbast_extensions/benchmarks/test_arithmetic_fp16.py