Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
name: CUDA CMake CI

on:
push:
branches: [ main ]
pull_request:
branches: [ main ]

jobs:
build:
runs-on: ubuntu-latest
container:
image: leonardeyer/cuda-builder:latest
steps:
- name: Checkout source
uses: actions/checkout@v4

- name: Configure with CMake
run: cmake -B build -S .

- name: Build
run: cmake --build build --parallel

- uses: actions/upload-artifact@v4
with:
name: mps
path: build/mps
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@
*.ptx
*.cubin
*.fatbin
.*
*__pycache__*
75 changes: 3 additions & 72 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -35,18 +35,7 @@ CPMAddPackage("gh:gabime/spdlog@1.8.2")
include_directories(include)

# mps
add_executable(mps
src/mps.cu

include/tensor.h
include/decomposition.h
include/permutation.h
include/Operators.hpp
include/mps.h
include/tensordot.h
include/cutensor_utils.h
include/reduction.h
)
add_executable(mps src/mps.cu)

target_link_libraries(mps PRIVATE
cuTENSOR::cuTENSOR
Expand All @@ -70,8 +59,7 @@ target_compile_options(cuda_compile_options INTERFACE

target_link_libraries(mps PRIVATE cuda_compile_options)

set_target_properties(mps PROPERTIES
CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(mps PROPERTIES CUDA_SEPARABLE_COMPILATION ON)


# test_mps
Expand All @@ -86,61 +74,4 @@ target_link_libraries(test_mps PRIVATE

target_compile_options(test_mps PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)

set_target_properties(test_mps PROPERTIES
CUDA_SEPARABLE_COMPILATION ON)

# Add the test executable
add_executable(test_tensordot tests/test_tensordot.cu)

# Link against CUTENSOR and CUDA libraries.
target_link_libraries(test_tensordot
cuTENSOR::cuTENSOR
CUDA::cudart
CUDA::cublas
spdlog::spdlog
)

set_target_properties(test_tensordot PROPERTIES
CUDA_SEPARABLE_COMPILATION ON)


add_executable(test_einsum tests/test_einsum.cu)

# Link against CUTENSOR and CUDA libraries.
target_link_libraries(test_einsum
cuTENSOR::cuTENSOR
CUDA::cudart
CUDA::cublas
spdlog::spdlog
)

set_target_properties(test_einsum PROPERTIES
CUDA_SEPARABLE_COMPILATION ON)

add_executable(test_permutation tests/test_permutation.cu)

# Link against CUTENSOR and CUDA libraries.
target_link_libraries(test_permutation
cuTENSOR::cuTENSOR
CUDA::cudart
CUDA::cublas
spdlog::spdlog
)

set_target_properties(test_permutation PROPERTIES
CUDA_SEPARABLE_COMPILATION ON)


add_executable(test_truncation tests/test_reduction.cu)

# Link against CUTENSOR and CUDA libraries.
target_link_libraries(test_truncation
cuTENSOR::cuTENSOR
CUDA::cudart
CUDA::cublas
spdlog::spdlog
cuda_compile_options
)

set_target_properties(test_truncation PROPERTIES
CUDA_SEPARABLE_COMPILATION ON)
set_target_properties(test_mps PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
20 changes: 20 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Use the NVIDIA CUDA 12.8.0 development image with Ubuntu 24.04
FROM nvidia/cuda:12.8.0-devel-ubuntu24.04

# Set non-interactive mode for apt
ENV DEBIAN_FRONTEND=noninteractive

# Install system dependencies
RUN apt-get update && apt-get install -y \
build-essential \
cmake \
git \
nvtop \
python3 \
python3-pip \
python3-dev \
libcutensor2 \
&& rm -rf /var/lib/apt/lists/*

# Install CuPy for CUDA 12.x
RUN python3 -m pip install cupy-cuda12x --break-system-packages
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ The main computational task involves decomposing a tensor of shape $(D_1, d, d,

Since exact SVD decompositions do not parallelize well on GPUs ([ref](https://arxiv.org/pdf/2212.09782)), I opted for the existing one-sided Jacobi-based implementation `cusolverDn<t>gesvdj` from [cuSolver](https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-gesvdj).

The implementation focused mainly on avoiding copying data to the host when possible and preallocating all the required memory / workspaces up front to aid getting the data ready for the decomposition. This involved using scratch tensors and custom out-of-place kernels to allow directly writing to the destination tensor memory. The truncation and normalization procedure was fused into a single kernel making use of cooperative groups to synchronize across blocks.
The implementation focuses mainly on avoiding copying data to the host when possible and pre-allocating all the required memory / workspaces up front to aid getting the data ready for the decomposition. This involves using scratch tensors and custom out-of-place kernels to allow directly writing to the destination tensor memory.

---
##### cuTensorNet
Expand Down
Loading