diff --git a/.gitignore b/.gitignore index bdd09a81..838703d3 100644 --- a/.gitignore +++ b/.gitignore @@ -160,3 +160,8 @@ jupyter_execute .nvcode/ llvm.sh* + +# nsys profiles +*cuda_gpu_kern_sum.json +*.sqlite +*.nsys-rep diff --git a/numbast/src/numbast/__init__.py b/numbast/src/numbast/__init__.py index 6a1d7648..79ef61c8 100644 --- a/numbast/src/numbast/__init__.py +++ b/numbast/src/numbast/__init__.py @@ -3,12 +3,6 @@ import numba -# Use pynvjitlink by default. This can avoid version mismatch between system driver and -# installed CTK version. -from pynvjitlink.patch import patch_numba_linker - -patch_numba_linker() - from numbast import numba_patch from numbast.struct import bind_cxx_struct, bind_cxx_structs diff --git a/numbast/src/numbast/numba_patch.py b/numbast/src/numbast/numba_patch.py index 41923e38..6652df4b 100644 --- a/numbast/src/numbast/numba_patch.py +++ b/numbast/src/numbast/numba_patch.py @@ -36,7 +36,7 @@ # added. -def nvrtc_compile(src, name, cc): +def nvrtc_compile(src, name, cc, ltoir=False): """ Compile a CUDA C/C++ source to PTX for a given compute capability. 
@@ -67,6 +67,8 @@ def nvrtc_compile(src, name, cc): numba_include = f"-I{numba_cuda_path}" options = [arch, *extra_include_paths, include, numba_include, "-rdc", "true"] options += extra_options + if ltoir: + options.append("-dlto") # Compile the program compile_error = nvrtc.compile_program(program, options) @@ -84,8 +86,11 @@ def nvrtc_compile(src, name, cc): msg = f"NVRTC log messages whilst compiling {name}:\n\n{log}" warnings.warn(msg) - ptx = nvrtc.get_ptx(program) - return ptx, log + if ltoir: + return nvrtc.get_lto(program), log + else: + ptx = nvrtc.get_ptx(program) + return ptx, log # Monkey-patch the existing implementation diff --git a/numbast_extensions/benchmarks/analyze.py b/numbast_extensions/benchmarks/analyze.py new file mode 100644 index 00000000..896b29d4 --- /dev/null +++ b/numbast_extensions/benchmarks/analyze.py @@ -0,0 +1,63 @@ +import click +import json +import pandas as pd + + +@click.command() +@click.argument( + "gold_name", type=click.Path(exists=True, dir_okay=False, file_okay=True) +) +@click.argument( + "py_lto_off_name", type=click.Path(exists=True, dir_okay=False, file_okay=True) +) +@click.argument( + "py_lto_on_name", type=click.Path(exists=True, dir_okay=False, file_okay=True) +) +def compare_gpu_kern(gold_name, py_lto_off_name, py_lto_on_name): + """Read profile results from gold run result and Numba kernel, compare them. + + GOLD_NAME: JSON profile result of the gold kernel. + PY_LTO_OFF_NAME / PY_LTO_ON_NAME: JSON profile results of the Numba kernel with LTO off / on. 
+ """ + with open(gold_name, "r") as goldf: + gold_kerns = json.load(goldf) + with open(py_lto_off_name, "r") as pyf: + lto_off_kerns = json.load(pyf) + with open(py_lto_on_name, "r") as pyf: + lto_on_kerns = json.load(pyf) + + gold_kern = gold_kerns[0] + lto_off_kern = lto_off_kerns[0] + lto_on_kern = lto_on_kerns[0] + + columns = [ + "GOLD: " + gold_kern["Name"], + "NUMBA LTO OFF: " + lto_off_kern["Name"], + "NUMBA LTO ON: " + lto_on_kern["Name"], + ] + index = [k for k in gold_kern.keys() if k != "Name"] + values = [ + (gold_kern[k], lto_off_kern[k], lto_on_kern[k]) + for k in gold_kern.keys() + if k != "Name" + ] + + df = pd.DataFrame(data=values, index=index, columns=columns) + + print(df) + + print("Perf Ratio (NUMBA LTO OFF / GOLD, %): ") + diff = df.iloc[:, 1] / df.iloc[:, 0] * 100 + diff.index = diff.index.str.strip("%ns)").str.strip("( ") + print(diff[["Avg", "Med", "Min", "Max", "StdDev"]]) + + print("---------") + + print("Perf Ratio (NUMBA LTO ON / GOLD, %): ") + diff = df.iloc[:, 2] / df.iloc[:, 0] * 100 + diff.index = diff.index.str.strip("%ns)").str.strip("( ") + print(diff[["Avg", "Med", "Min", "Max", "StdDev"]]) + + +if __name__ == "__main__": + compare_gpu_kern() diff --git a/numbast_extensions/benchmarks/run_benchmark.sh b/numbast_extensions/benchmarks/run_benchmark.sh new file mode 100755 index 00000000..e5c33dfd --- /dev/null +++ b/numbast_extensions/benchmarks/run_benchmark.sh @@ -0,0 +1,46 @@ +#!/bin/bash + +NUMBAST_BENCH_KERN_REPETITION=1000 + +BENCH_NAME=test_arithmetic_bf16 + +PY_NAME=${BENCH_NAME}.py +PY_PTX=${BENCH_NAME}_py.ptx + +GOLD_NAME=${BENCH_NAME}_gold +GOLD_SRC_NAME=${GOLD_NAME}.cu +GOLD_PTX=${GOLD_NAME}.ptx + +COMPUTE_CAP=$(nvidia-smi --query-gpu=compute_cap --format=csv,noheader|head -n 1) +SMCC=sm_${COMPUTE_CAP//./} + +# Cleanup +rm -rf *.json *.nsys-rep *.sqlite $GOLD_NAME + +# Compile gold +nvcc --gpu-architecture=$SMCC $GOLD_SRC_NAME -o $GOLD_NAME + +# Prof gold +nsys profile --trace cuda --force-overwrite true -o 
gold.nsys-rep $GOLD_NAME + +# Prof py LTO OFF +nsys profile --trace cuda --force-overwrite true -o py_lto_off.nsys-rep --env-var NUMBA_CUDA_ENABLE_PYNVJITLINK=1 python $PY_NAME --lto False + +# Prof py LTO ON +nsys profile --trace cuda --force-overwrite true -o py_lto_on.nsys-rep --env-var NUMBA_CUDA_ENABLE_PYNVJITLINK=1 python $PY_NAME --lto True + +# Create gold nsys stat report +nsys stats --report cuda_gpu_kern_sum --format json --output . gold.nsys-rep + +# Analyze py LTO OFF nsys stat report +nsys stats --report cuda_gpu_kern_sum --format json --output . py_lto_off.nsys-rep + +# Analyze py LTO ON nsys stat report +nsys stats --report cuda_gpu_kern_sum --format json --output . py_lto_on.nsys-rep + +echo "Benchmark completes!" +echo "The below compares the performance between gold and Numba." +echo "" + +# Compare stat report +python analyze.py gold_cuda_gpu_kern_sum.json py_lto_off_cuda_gpu_kern_sum.json py_lto_on_cuda_gpu_kern_sum.json diff --git a/numbast_extensions/benchmarks/test_arithmetic_bf16.py b/numbast_extensions/benchmarks/test_arithmetic_bf16.py new file mode 100755 index 00000000..55b1ecaf --- /dev/null +++ b/numbast_extensions/benchmarks/test_arithmetic_bf16.py @@ -0,0 +1,49 @@ +import click +import os +import warnings + +import numba.cuda as cuda +import numpy as np +from numba import float32 + +from numbast_extensions.bf16 import ( + nv_bfloat16, + get_shims, +) + + +repetition_char = os.getenv("NUMBAST_BENCH_KERN_REPETITION", None) +if repetition_char is None: + warnings.warn( + "Unable to retrieve NUMBAST_BENCH_KERN_REPETITION environment variable in `py`." + "Assume repetition 1000." 
+ ) + repetition = 1000 +else: + repetition = int(repetition_char) + + +@click.command() +@click.option("--lto", type=click.BOOL, required=True) +def _run(lto): + @cuda.jit(link=get_shims(), lto=lto) + def kernel(arith): + # Binary Arithmetic Operators + a = nv_bfloat16(1.0) + b = nv_bfloat16(2.0) + + arith[0] = float32(a + b) + arith[1] = float32(a - b) + arith[2] = float32(a * b) + arith[3] = float32(a / b) + + arith = np.zeros(4, dtype=np.float32) + + for _ in range(repetition): + kernel[1, 1](arith) + + assert all(arith == [3.0, -1.0, 2.0, 0.5]) + + +if __name__ == "__main__": + _run() diff --git a/numbast_extensions/benchmarks/test_arithmetic_bf16_gold.cu b/numbast_extensions/benchmarks/test_arithmetic_bf16_gold.cu new file mode 100644 index 00000000..e3868435 --- /dev/null +++ b/numbast_extensions/benchmarks/test_arithmetic_bf16_gold.cu @@ -0,0 +1,48 @@ +#include +#include +#include + +#include + +__global__ void simple_kernel(float *arith) { + // Binary Arithmetic Operators + nv_bfloat16 a = nv_bfloat16(1.0f); + nv_bfloat16 b = nv_bfloat16(2.0f); + arith[0] = float(a + b); + arith[1] = float(a - b); + arith[2] = float(a * b); + arith[3] = float(a / b); +} + +int main(void) { + char *repetition_char = std::getenv("NUMBAST_BENCH_KERN_REPETITION"); + if (repetition_char == nullptr) + std::cout << "Unable to retrieve NUMBAST_BENCH_KERN_REPETITION environment " + "variable in `gold`. Assume repetition 1000." + << std::endl; + int repetition = + repetition_char ? 
std::stoi(std::string(repetition_char)) : 1000; + + int N = 4; + float *arith, *arith_d; + arith = (float *)malloc(N * sizeof(float)); + + cudaMalloc(&arith_d, N * sizeof(float)); + + for (int i = 0; i < N; i++) { + arith[i] = 0.0f; + } + cudaMemcpy(arith_d, arith, N * sizeof(float), cudaMemcpyHostToDevice); + + for (int i = 0; i < repetition; i++) + simple_kernel<<<1, 1>>>(arith_d); + + cudaDeviceSynchronize(); + + cudaMemcpy(arith, arith_d, N * sizeof(float), cudaMemcpyDeviceToHost); + + cudaFree(arith_d); + free(arith); + + return 0; +} diff --git a/numbast/benchmarks/test_arithmetic.py b/numbast_extensions/benchmarks/test_arithmetic_fp16.py similarity index 100% rename from numbast/benchmarks/test_arithmetic.py rename to numbast_extensions/benchmarks/test_arithmetic_fp16.py