LeonardEyer · LeonardEyer · Feb 14, 2025 · Feb 14, 2025 · Feb 14, 2025 · Feb 14, 2025
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -0,0 +1,31 @@
+# Configures and builds the project with CMake inside a container on pushes and pull requests to main, then uploads the mps binary.
+name: CUDA CMake CI
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    container:
+      image: leonardeyer/cuda-builder:latest
+    steps:
+      - name: Checkout source
+        uses: actions/checkout@v4
+
+      - name: Configure with CMake
+        run: cmake -B build -S .
+
+      - name: Build
+        run: cmake --build build --parallel
+
+      - name: Upload mps binary
+        uses: actions/upload-artifact@v4
+        with:
+          name: mps
+          path: build/mps
+          # Fail the job instead of only warning when the binary is missing.
+          if-no-files-found: error
diff --git a/.gitignore b/.gitignore
@@ -4,3 +4,10 @@
 *.ptx
 *.cubin
 *.fatbin
+# Ignore hidden files and directories ...
+.*
+# ... but keep the repository's own dotfiles tracked; without these negations
+# the pattern above also matches .gitignore and the .github/ workflow directory.
+!.gitignore
+!.github/
+*__pycache__*
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -35,18 +35,7 @@ CPMAddPackage("gh:gabime/spdlog@1.8.2")
 include_directories(include)
 
 # mps
-add_executable(mps
-        src/mps.cu
-
-        include/tensor.h
-        include/decomposition.h
-        include/permutation.h
-        include/Operators.hpp
-        include/mps.h
-        include/tensordot.h
-        include/cutensor_utils.h
-        include/reduction.h
-)
+add_executable(mps src/mps.cu)  # only the CUDA source is listed; include/ is on the include path via include_directories(include)
 
 target_link_libraries(mps PRIVATE
         cuTENSOR::cuTENSOR
@@ -70,8 +59,7 @@ target_compile_options(cuda_compile_options INTERFACE
 
 target_link_libraries(mps PRIVATE cuda_compile_options)
 
-set_target_properties(mps PROPERTIES
-        CUDA_SEPARABLE_COMPILATION ON)
+set_target_properties(mps PROPERTIES CUDA_SEPARABLE_COMPILATION ON)  # turn on CUDA separable compilation for the mps target
 
 
 # test_mps
@@ -86,61 +74,4 @@ target_link_libraries(test_mps PRIVATE
 
 target_compile_options(test_mps PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:--extended-lambda>)
 
-set_target_properties(test_mps PROPERTIES
-        CUDA_SEPARABLE_COMPILATION ON)
-
-# Add the test executable
-add_executable(test_tensordot tests/test_tensordot.cu)
-
-# Link against CUTENSOR and CUDA libraries.
-target_link_libraries(test_tensordot
-        cuTENSOR::cuTENSOR
-        CUDA::cudart
-        CUDA::cublas
-        spdlog::spdlog
-)
-
-set_target_properties(test_tensordot PROPERTIES
-        CUDA_SEPARABLE_COMPILATION ON)
-
-
-add_executable(test_einsum tests/test_einsum.cu)
-
-# Link against CUTENSOR and CUDA libraries.
-target_link_libraries(test_einsum
-        cuTENSOR::cuTENSOR
-        CUDA::cudart
-        CUDA::cublas
-        spdlog::spdlog
-)
-
-set_target_properties(test_einsum PROPERTIES
-        CUDA_SEPARABLE_COMPILATION ON)
-
-add_executable(test_permutation tests/test_permutation.cu)
-
-# Link against CUTENSOR and CUDA libraries.
-target_link_libraries(test_permutation
-        cuTENSOR::cuTENSOR
-        CUDA::cudart
-        CUDA::cublas
-        spdlog::spdlog
-)
-
-set_target_properties(test_permutation PROPERTIES
-        CUDA_SEPARABLE_COMPILATION ON)
-
-
-add_executable(test_truncation tests/test_reduction.cu)
-
-# Link against CUTENSOR and CUDA libraries.
-target_link_libraries(test_truncation
-        cuTENSOR::cuTENSOR
-        CUDA::cudart
-        CUDA::cublas
-        spdlog::spdlog
-        cuda_compile_options
-)
-
-set_target_properties(test_truncation PROPERTIES
-        CUDA_SEPARABLE_COMPILATION ON)
+set_target_properties(test_mps PROPERTIES CUDA_SEPARABLE_COMPILATION ON)  # turn on CUDA separable compilation for the test_mps target
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,22 @@
+# Use the NVIDIA CUDA 12.8.0 development image with Ubuntu 24.04
+FROM nvidia/cuda:12.8.0-devel-ubuntu24.04
+
+# Set non-interactive mode for apt
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install system dependencies; the apt package lists are removed in the same layer to keep it small
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    cmake \
+    git \
+    nvtop \
+    python3 \
+    python3-pip \
+    python3-dev \
+    libcutensor2 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install CuPy for CUDA 12.x.
+# --no-cache-dir keeps pip's download cache out of the image layer.
+# --break-system-packages is passed because pip installs into the system Python here.
+RUN python3 -m pip install --no-cache-dir cupy-cuda12x --break-system-packages
diff --git a/README.md b/README.md
@@ -14,7 +14,7 @@ The main computational task involves decomposing a tensor of shape $(D_1, d, d,
 
 Since exact SVD decompositions do not parallelize well on GPUs ([ref](https://arxiv.org/pdf/2212.09782)) I opted for the existing one-sided Jacobi based implementation `cusolverDn<t>gesvdj` from [cuSolver](https://docs.nvidia.com/cuda/cusolver/index.html#cusolverdn-t-gesvdj).
 
-The implementation focused mainly on avoiding copying data to the host when possible and preallocating all the required memory / workspaces up front to aid getting the data ready for the decomposition. This involved using scractch tensors and custom out-of-place kernels to allow directly writing to the destination tensor memory. The truncation and normalization procedure was fused into a single kernel making use of cooperative groups to synchronize across blocks.
+The implementation focuses mainly on avoiding copying data to the host when possible and pre-allocating all the required memory / workspaces up front to aid getting the data ready for the decomposition. This involves using scratch tensors and custom out-of-place kernels to allow directly writing to the destination tensor memory.
 
 ---
 ##### cuTensorNet
-Original file line number
+Diff line change
@@ Expand Up / @@ -4,3 +4,5 @@ @@
     *.ptx
     *.cubin
     *.fatbin
+    .*
+    *__pycache__*