Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
3332892
Merge pull request #2 from Saransh-cpp/main
FrancescoDerme Dec 16, 2025
322f82e
run.py
FrancescoDerme Dec 16, 2025
3077d74
run.py
FrancescoDerme Dec 16, 2025
9b01c53
merge
FrancescoDerme Dec 16, 2025
d2ae502
Stop tracking external code
FrancescoDerme Dec 16, 2025
15289a6
Working on run.py
FrancescoDerme Dec 16, 2025
4fdbc16
New tests
FrancescoDerme Dec 16, 2025
5b7c40c
removed requirements
FrancescoDerme Dec 16, 2025
2f5ff5f
style: pre-commit fixes
pre-commit-ci[bot] Dec 16, 2025
7327a74
transpose working on GPU!!
Saransh-cpp Dec 16, 2025
5aa89e9
Merge branch 'main' of https://github.com/Saransh-cpp/SOFIE-ALPAKA in…
Saransh-cpp Dec 16, 2025
3793b0c
fix: get concat tests to work on GPU
Saransh-cpp Dec 16, 2025
ea83974
fix: get topk tests to work on CUDA
Saransh-cpp Dec 16, 2025
a99d99c
fix: get where tests to work on CUDA
Saransh-cpp Dec 16, 2025
e55f940
Merge pull request #5 from Saransh-cpp/fix/where-cuda
Saransh-cpp Dec 16, 2025
a166fcf
Merge pull request #6 from Saransh-cpp/fix/topk-cuda
Saransh-cpp Dec 16, 2025
fab75f2
fix CPU tests
Saransh-cpp Dec 16, 2025
cef206b
Merge pull request #7 from Saransh-cpp/fix/concat-cuda
Saransh-cpp Dec 16, 2025
ad54955
fix concat gpu tests
Saransh-cpp Dec 16, 2025
197681d
have removed large notebook
Saransh-cpp Dec 16, 2025
e1b2bb4
fix dangling code
Saransh-cpp Dec 16, 2025
d1fb51a
update readme a bit"
Saransh-cpp Dec 16, 2025
03f3aa1
add pre-commit badge
Saransh-cpp Dec 16, 2025
8d386c5
oops, update cuda version in cmake
Saransh-cpp Dec 16, 2025
a3efbab
Fixing tests
FrancescoDerme Dec 17, 2025
a9b1a7d
merge
FrancescoDerme Dec 17, 2025
d1fb9ca
style: pre-commit fixes
pre-commit-ci[bot] Dec 17, 2025
76e81c6
Merge branch 'version2' of https://github.com/Saransh-cpp/SOFIE-ALPAK…
Saransh-cpp Dec 17, 2025
ed0d640
pytorch benchmarking
PietroFumagalli Dec 17, 2025
b9d33b2
Added warmup runs and trivial kernel
FrancescoDerme Dec 17, 2025
0cc67fc
style: pre-commit fixes
pre-commit-ci[bot] Dec 17, 2025
48ffb0a
Merge branch 'version2' of github.com:Saransh-cpp/SOFIE-ALPAKA into v…
FrancescoDerme Dec 17, 2025
b7aecd9
fix transpose and where kernels for GPU
Saransh-cpp Dec 17, 2025
d8553ea
PyTorch!
FrancescoDerme Dec 17, 2025
5597b6b
style: pre-commit fixes
pre-commit-ci[bot] Dec 17, 2025
44618e4
fix other kernels
Saransh-cpp Dec 17, 2025
88fa53e
PyTorch!
FrancescoDerme Dec 17, 2025
83ab5e9
Merge branch 'version2' of github.com:Saransh-cpp/SOFIE-ALPAKA into v…
FrancescoDerme Dec 17, 2025
fda7a01
run.py updates
FrancescoDerme Dec 17, 2025
5b80551
Merge branch 'version2' of https://github.com/Saransh-cpp/SOFIE-ALPAK…
Saransh-cpp Dec 17, 2025
779a2ce
run.py
FrancescoDerme Dec 17, 2025
86e312f
run.py
FrancescoDerme Dec 17, 2025
14c1975
add GPU to run.py
Saransh-cpp Dec 17, 2025
4c1bff4
Merge branch 'version2' of https://github.com/Saransh-cpp/SOFIE-ALPAK…
Saransh-cpp Dec 17, 2025
f5c9c66
fix trivial kernel for GPU
Saransh-cpp Dec 17, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitattributes
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# External code
models/ linguist-vendored
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@
.bin/
.DS_Store
build/
.vscode/
.ipynb_checkpoints/
1 change: 0 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ repos:
rev: v6.0.0
hooks:
- id: check-added-large-files
args: ['--maxkb=3000']
- id: check-case-conflict
- id: check-merge-conflict
- id: check-symlinks
Expand Down
70 changes: 29 additions & 41 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,44 @@ set(CMAKE_CUDA_STANDARD 20)
set(CMAKE_CUDA_STANDARD_REQUIRED ON)

# --- User-configurable options ---
set(CUDA_BASE "/usr/local/cuda-13.1" CACHE PATH "CUDA base path")
set(TBB_BASE "/usr" CACHE PATH "TBB base path")
set(CUDA_BASE "/usr/local/cuda-12.5" CACHE PATH "CUDA base path")
set(ALPAKA_BASE "external/alpaka" CACHE PATH "Alpaka base path")

# --- Compiler flags ---
set(CXXFLAGS -O2 -g -DALPAKA_HAS_STD_ATOMIC_REF)
set(CXX_HOST_FLAGS -fPIC -pthread)
set(CUDA_ARCH "sm_75")
set(CXX_CUDA_FLAGS -arch=${CUDA_ARCH} -Wno-deprecated-gpu-targets --extended-lambda --expt-relaxed-constexpr)
set(CMAKE_CUDA_COMPILER "/usr/local/cuda-13.1/bin/nvcc" CACHE PATH "Cuda compiler path")
set(CMAKE_CUDA_COMPILER "/usr/local/cuda-12.5/bin/nvcc" CACHE PATH "Cuda compiler path")

# --- Executables ---
add_executable(test_trivial tests/test_trivial.cpp)
set_source_files_properties(tests/test_trivial.cpp PROPERTIES LANGUAGE CUDA)
enable_language(CUDA)

set_target_properties(test_trivial PROPERTIES
CUDA_SEPARABLE_COMPILATION ON
)

target_compile_features(test_trivial PUBLIC cxx_std_20)

target_compile_options(test_trivial PRIVATE
${CXXFLAGS}
${CXX_CUDA_FLAGS}
${CXX_HOST_FLAGS}
)

target_compile_definitions(test_trivial PRIVATE
ALPAKA_ACC_GPU_CUDA_ENABLED
)

target_include_directories(test_trivial PRIVATE
${ALPAKA_BASE}/include
${CUDA_BASE}/include
)

target_link_directories(test_trivial PRIVATE ${CUDA_BASE}/lib64)

add_executable(test_transpose tests/test_transpose.cpp)
set_source_files_properties(tests/test_transpose.cpp PROPERTIES LANGUAGE CUDA)
enable_language(CUDA)
Expand Down Expand Up @@ -46,14 +72,6 @@ target_include_directories(test_transpose PRIVATE

target_link_directories(test_transpose PRIVATE ${CUDA_BASE}/lib64)

target_link_libraries(test_transpose
PRIVATE
cublas
cublasLt
cudart
nvidia-ml
)

add_executable(test_concat tests/test_concat.cpp)
set_source_files_properties(tests/test_concat.cpp PROPERTIES LANGUAGE CUDA)
enable_language(CUDA)
Expand Down Expand Up @@ -81,14 +99,6 @@ target_include_directories(test_concat PRIVATE

target_link_directories(test_concat PRIVATE ${CUDA_BASE}/lib64)

target_link_libraries(test_concat
PRIVATE
cublas
cublasLt
cudart
nvidia-ml
)

add_executable(test_where tests/test_where.cpp)
set_source_files_properties(tests/test_where.cpp PROPERTIES LANGUAGE CUDA)
enable_language(CUDA)
Expand Down Expand Up @@ -116,14 +126,6 @@ target_include_directories(test_where PRIVATE

target_link_directories(test_where PRIVATE ${CUDA_BASE}/lib64)

target_link_libraries(test_where
PRIVATE
cublas
cublasLt
cudart
nvidia-ml
)

add_executable(test_topk tests/test_topk.cpp)
set_source_files_properties(tests/test_topk.cpp PROPERTIES LANGUAGE CUDA)
enable_language(CUDA)
Expand All @@ -150,17 +152,3 @@ target_include_directories(test_topk PRIVATE
)

target_link_directories(test_topk PRIVATE ${CUDA_BASE}/lib64)

target_link_libraries(test_topk
PRIVATE
cublas
cublasLt
cudart
nvidia-ml
)

# Optional clean
add_custom_target(clean-all
COMMAND ${CMAKE_COMMAND} -E rm -f test_transpose *.d *.o *.so
COMMENT "Cleaning all generated files"
)
37 changes: 31 additions & 6 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,13 +1,40 @@
CXX := g++
CXXFLAGS ?= -std=c++17 -O2 -Wall
CXXFLAGS ?= -std=c++17 -O3 -Wall
LDFLAGS ?=

# Path setup: Mac + Linux compatible
KERNEL_DIR ?= kernels
TEST_DIR ?= tests
ALPAKA_DIR ?= $(CURDIR)/external/alpaka/include
CPLUS_INCLUDE_PATH ?= /opt/homebrew/include
LIBRARY_PATH ?= /opt/homebrew/lib
BIN_DIR ?= bin
ALPAKA_ACCELERATOR_FLAG ?= ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED

LDFLAGS += -L$(LIBRARY_PATH)

# Accelerator selection (CPU options)
# Debugging (slow, checks everything)
# ALPAKA_ACCELERATOR_FLAG ?= ALPAKA_ACC_CPU_B_SEQ_T_THREADS_ENABLED

# Performance (fast, single core)
# ALPAKA_ACCELERATOR_FLAG ?= ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED

# High performance (fast, multi-core TBB)
ALPAKA_ACCELERATOR_FLAG ?= ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED

# Conditional flags
# Auto-add -ltbb if TBB is selected
ifneq (,$(findstring TBB,$(ALPAKA_ACCELERATOR_FLAG)))
LDFLAGS += -ltbb
endif

# Auto-add -fopenmp if OMP is selected
ifneq (,$(findstring OMP,$(ALPAKA_ACCELERATOR_FLAG)))
CXXFLAGS += -fopenmp
LDFLAGS += -fopenmp
endif

# Build rules
KERNEL_HEADERS := $(wildcard $(KERNEL_DIR)/*.hpp)
KERNEL_NAMES := $(patsubst $(KERNEL_DIR)/%.hpp,%,$(KERNEL_HEADERS))
EXECUTABLES := $(patsubst %,$(BIN_DIR)/test_%.out,$(KERNEL_NAMES))
Expand All @@ -26,14 +53,12 @@ test: $(EXECUTABLES)

$(BIN_DIR)/test_%.out: $(TEST_DIR)/test_%.cpp $(KERNEL_DIR)/%.hpp | $(BIN_DIR)
@echo "Building test for kernel: $*"
$(CXX) $(CXXFLAGS) -I$(ALPAKA_DIR) -I$(CPLUS_INCLUDE_PATH) -D$(ALPAKA_ACCELERATOR_FLAG) $< -o $@
$(CXX) $(CXXFLAGS) -I$(ALPAKA_DIR) -I$(CPLUS_INCLUDE_PATH) -D$(ALPAKA_ACCELERATOR_FLAG) $< -o $@ $(LDFLAGS)

$(BIN_DIR):
mkdir -p $(BIN_DIR)

clean:
rm -rf $(BIN_DIR)

test:

.PHONY = all test clean
.PHONY: all test clean
27 changes: 15 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# SOFIE-ALPAKA

[![Build and Test on CPU](https://github.com/Saransh-cpp/SOFIE-ALPAKA/actions/workflows/build_and_test.yml/badge.svg?branch=main)](https://github.com/Saransh-cpp/SOFIE-ALPAKA/actions/workflows/build_and_test.yml)
[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/Saransh-cpp/SOFIE-ALPAKA/main.svg)](https://results.pre-commit.ci/latest/github/Saransh-cpp/SOFIE-ALPAKA/main)

Kernels for heterogeneous architectures written in [Alpaka](https://alpaka.readthedocs.io/en/stable/) (An Abstraction Library for Parallel Kernel Acceleration) for [SOFIE](https://github.com/ML4EP/SOFIE) (System for Optimised Fast Inference code Emit).

Expand All @@ -27,15 +28,15 @@ git clone https://github.com/Saransh-cpp/SOFIE-ALPAKA --recursive
To build all kernels and tests in `bin/`:

```
make all
make all -j10
```

### Running tests on a threaded CPU

To run all kernel tests (and build if not built before):

```
make test
make test -j10
```

### Building kernels and tests on an NVIDIA GPU
Expand All @@ -44,24 +45,26 @@ To build all the kernels and tests in `build/`

```
cmake -S. -Bbuild
cmake --build build
cmake --build build -j10
```

where the following flags can be configured by the user:
- `CUDA_BASE` (default: "/usr/local/cuda-13.1"): CUDA base path
- `TBB_BASE` (default: "/usr"): TBB base path
- `ALPAKA_BASE` (default: "external/alpaka"): Alpaka base path
- `CUDA_ARCH` (default: "sm_75"): CUDA architecture
- `CMAKE_CUDA_COMPILER` (default: "/usr/local/cuda-13.1/bin/nvcc"): Cuda compiler path
- `CMAKE_CUDA_COMPILER` (default: "/usr/local/cuda-12.5/bin/nvcc"): Cuda compiler path

To run the tests, simply execute `test_*` executables produced in `build/`.

### Running integration tests on an NVIDIA GPU

To run SOFIE integration tests:
1. Port a kernel to [SOFIE](https://github.com/ML4EP/SOFIE) on a stand-alone branch (against the `gpu/alpaka` branch) (see https://github.com/ML4EP/SOFIE/pull/7 and https://github.com/ML4EP/SOFIE/pull/8 for reference).
2. Make sure there is a corresponding `onnx` model in `SOFIE/src/SOFIE_core/test/input_models/`.
3. Make sure there is a reference output in `SOFIE/src/SOFIE_core/test/input_models/references`.
4. Follow instructions in SOFIE's README to build and run tests with CUDA (remember to set `-DCUDA_ARCH` as per your GPU's architecture).

```
cd tests/sofie_integration
cmake -S. -Bbuild
cmake --build build
```
The relevant header and DAT files will be generated in `SOFIE/build/src/SOFIE_core/test/`.

#### Kernels already ported to SOFIE

with the same configurable flags listed in the section above.
`Transpose` and `Concat` kernels have already been ported to SOFIE (pull requests not merged yet). This repository has an updated implementation for both of these kernels, and two other kernels, which must be ported in the future.
56 changes: 42 additions & 14 deletions kernels/concat.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,39 +12,67 @@ struct ConcatKernel {
std::array<alpaka::Vec<Dim, Idx>, N> input_strides_vec,
alpaka::Vec<Dim, Idx> output_strides, alpaka::Vec<Dim, Idx> output_shape,
std::array<Idx, N> axis_sizes, std::size_t concat_axis) const {
using DimAcc = alpaka::Dim<TAcc>;
static_assert(DimAcc::value == Dim::value, "Accelerator and data dims must match");

constexpr std::size_t D = Dim::value;
auto elements = alpaka::uniformElementsND(acc, output_shape);

for (auto const& idx : elements) {
Idx concat_coord = idx[concat_axis];
// Get global thread index and total threads
auto const threadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
auto const threadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);

// Convert to linear thread index
Idx global_thread_idx = 0;
Idx stride = 1;
for (std::size_t d = 0; d < D; ++d) {
global_thread_idx += threadIdx[d] * stride;
stride *= threadExtent[d];
}

// Total number of output elements
Idx total_elements = 1;
for (std::size_t d = 0; d < D; ++d) {
total_elements *= output_shape[d];
}

// Grid-stride loop
for (Idx elem_idx = global_thread_idx; elem_idx < total_elements; elem_idx += threadExtent.prod()) {
// Convert linear index to multi-dimensional output index
Idx remaining = elem_idx;
alpaka::Vec<Dim, Idx> out_idx;
for (int d = D - 1; d >= 0; --d) {
out_idx[d] = remaining % output_shape[d];
remaining /= output_shape[d];
}

// Determine which input tensor this element comes from
Idx concat_coord = out_idx[concat_axis];
std::size_t chosen = 0;
Idx offset = 0;

// Find which input matrix this pixel belongs to
// Find the input tensor that contains this coordinate
for (std::size_t k = 0; k < N; ++k) {
Idx const sz = axis_sizes[k];
if (concat_coord < offset + sz) {
chosen = k;
break;
}

offset += sz;
}

// Compute input and output indexes
Idx input_idx = 0;
// Compute output linear index
Idx output_idx = 0;
for (std::size_t d = 0; d < D; ++d) {
Idx const out_coord = idx[d];
output_idx += out_coord * output_strides[d];
output_idx += out_idx[d] * output_strides[d];
}

Idx const in_coord = out_coord - offset * (d == concat_axis);
input_idx += in_coord * input_strides_vec[chosen][d];
// Compute input linear index (adjust for concat axis offset)
alpaka::Vec<Dim, Idx> in_idx = out_idx;
in_idx[concat_axis] = concat_coord - offset;

Idx input_idx = 0;
for (std::size_t d = 0; d < D; ++d) {
input_idx += in_idx[d] * input_strides_vec[chosen][d];
}

// Copy the element
output[output_idx] = input_ptrs[chosen][input_idx];
}
}
Expand Down
Loading