Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions CytnxBKNDCMakeLists.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,33 @@ if(USE_CUDA)
# -gencode=arch=compute_75,code=compute_75 ")
target_compile_definitions(cytnx PUBLIC UNI_GPU)
target_include_directories(cytnx PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
# CUDA 12+/13 may place Thrust/CUB headers under include/cccl.
# Build an ordered candidate list; the FIRST existing directory wins, so the
# toolkit target dir (most authoritative when defined) is probed before paths
# derived from CUDAToolkit_INCLUDE_DIRS.
set(_cytnx_cccl_candidates)
if(DEFINED CUDAToolkit_TARGET_DIR AND NOT "${CUDAToolkit_TARGET_DIR}" STREQUAL "")
list(APPEND _cytnx_cccl_candidates "${CUDAToolkit_TARGET_DIR}/include/cccl")
endif()
# Some toolkit layouts report include dirs nested below the install root
# (e.g. targets/<arch>/include), so also probe a few parent levels for an
# include/cccl sibling directory.
foreach(_cuda_inc IN LISTS CUDAToolkit_INCLUDE_DIRS)
list(APPEND _cytnx_cccl_candidates
"${_cuda_inc}/cccl"
"${_cuda_inc}/../include/cccl"
"${_cuda_inc}/../../include/cccl"
"${_cuda_inc}/../../../include/cccl")
endforeach()
list(REMOVE_DUPLICATES _cytnx_cccl_candidates)

# Normalize each candidate (collapsing the ../ segments) and keep the first
# one that actually exists on disk.
set(_cytnx_cccl_dir "")
foreach(_cccl_candidate IN LISTS _cytnx_cccl_candidates)
get_filename_component(_cccl_candidate_abs "${_cccl_candidate}" ABSOLUTE)
if(EXISTS "${_cccl_candidate_abs}")
set(_cytnx_cccl_dir "${_cccl_candidate_abs}")
break()
endif()
endforeach()
# Only extend the include path when a CCCL directory was found; older
# toolkits keep Thrust/CUB directly under include/ and need no extra entry.
if(NOT "${_cytnx_cccl_dir}" STREQUAL "")
target_include_directories(cytnx PRIVATE "${_cytnx_cccl_dir}")
message(STATUS "Detected CCCL headers at: ${_cytnx_cccl_dir}")
endif()

target_link_libraries(cytnx PUBLIC CUDA::toolkit)
target_link_libraries(cytnx PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse CUDA::curand CUDA::cusolver)
target_link_libraries(cytnx PUBLIC -lcudadevrt)
Expand Down
3,042 changes: 0 additions & 3,042 deletions src/backend/linalg_internal_cpu/Arithmetic_internal.cpp

This file was deleted.

805 changes: 0 additions & 805 deletions src/backend/linalg_internal_cpu/Arithmetic_internal.hpp

This file was deleted.

4 changes: 0 additions & 4 deletions src/backend/linalg_internal_cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ target_sources_local(cytnx
Norm_internal.hpp
Add_internal.hpp
iAdd_internal.hpp
Arithmetic_internal.hpp
iArithmetic_internal.hpp
Conj_inplace_internal.hpp
Cpr_internal.hpp
Diag_internal.hpp
Expand Down Expand Up @@ -46,8 +44,6 @@ target_sources_local(cytnx
memcpyTruncation.hpp

iAdd_internal.cpp
Arithmetic_internal.cpp
iArithmetic_internal.cpp
Conj_inplace_internal.cpp
Diag_internal.cpp
iDiv_internal.cpp
Expand Down
2,239 changes: 0 additions & 2,239 deletions src/backend/linalg_internal_cpu/iArithmetic_internal.cpp

This file was deleted.

755 changes: 0 additions & 755 deletions src/backend/linalg_internal_cpu/iArithmetic_internal.hpp

This file was deleted.

3 changes: 3 additions & 0 deletions src/backend/linalg_internal_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ target_sources_local(cytnx

cuAbs_internal.cu
cuAdd_internal.cu
cuSub_dispatch.cu
cuMul_dispatch.cu
cuDiv_dispatch.cu
cuGer_internal.cu
cuArithmetic_internal.cu
cuConj_inplace_internal.cu
Expand Down
274 changes: 274 additions & 0 deletions src/backend/linalg_internal_gpu/cuAdd_internal.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,280 @@ namespace cytnx {

namespace linalg_internal {

namespace {

// Widen an arbitrary real scalar to a cuDoubleComplex with zero imaginary part.
template <typename T>
__device__ inline cuDoubleComplex CuToComplexDouble(const T &value) {
  const cytnx_double re = static_cast<cytnx_double>(value);
  return make_cuDoubleComplex(re, 0.0);
}

// Already double-precision complex: pass through unchanged.
__device__ inline cuDoubleComplex CuToComplexDouble(const cuDoubleComplex &value) {
  return value;
}

// Promote single-precision complex to double precision.
__device__ inline cuDoubleComplex CuToComplexDouble(const cuComplex &value) {
  return cuComplexFloatToDouble(value);
}

// Convert an arbitrary real scalar to a cuComplex with zero imaginary part.
template <typename T>
__device__ inline cuComplex CuToComplexFloat(const T &value) {
  const cytnx_float re = static_cast<cytnx_float>(value);
  return make_cuFloatComplex(re, 0.0f);
}

// Already single-precision complex: pass through unchanged.
__device__ inline cuComplex CuToComplexFloat(const cuComplex &value) { return value; }

// Demote double-precision complex to single precision (narrowing casts).
__device__ inline cuComplex CuToComplexFloat(const cuDoubleComplex &value) {
  const cytnx_float re = static_cast<cytnx_float>(cuCreal(value));
  const cytnx_float im = static_cast<cytnx_float>(cuCimag(value));
  return make_cuFloatComplex(re, im);
}

// Add two (possibly mixed-type) operands, producing the promoted type TO.
// Complex outputs route through the cuComplex add intrinsics after widening
// both sides; real outputs use plain casted addition. Exactly one branch
// survives instantiation thanks to `if constexpr`.
template <typename TO, typename TL, typename TR>
__device__ inline TO CuAddDispatchOp(const TL &lhs, const TR &rhs) {
  if constexpr (std::is_same_v<TO, cuComplex>) {
    return cuCaddf(CuToComplexFloat(lhs), CuToComplexFloat(rhs));
  } else if constexpr (std::is_same_v<TO, cuDoubleComplex>) {
    return cuCadd(CuToComplexDouble(lhs), CuToComplexDouble(rhs));
  } else {
    return static_cast<TO>(lhs) + static_cast<TO>(rhs);
  }
}

// scalar + scalar: broadcast the single sum into out[0..n). One thread per
// output element; threads past the tail return early.
template <typename TO, typename TL, typename TR>
__global__ void cuAdd_dispatch_constconst_kernel(TO *out, const TL lhs, const cytnx_uint64 n,
                                                 const TR rhs) {
  const cytnx_uint64 gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= n) return;
  out[gid] = CuAddDispatchOp<TO>(lhs, rhs);
}

// scalar + tensor: broadcast lhs against each rhs element. One thread per
// output element; threads past the tail return early.
template <typename TO, typename TL, typename TR>
__global__ void cuAdd_dispatch_lconst_kernel(TO *out, const TL lhs, const cytnx_uint64 n,
                                             const TR *rhs) {
  const cytnx_uint64 gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= n) return;
  out[gid] = CuAddDispatchOp<TO>(lhs, rhs[gid]);
}

// tensor + scalar: broadcast rhs against each lhs element. One thread per
// output element; threads past the tail return early.
template <typename TO, typename TL, typename TR>
__global__ void cuAdd_dispatch_rconst_kernel(TO *out, const TL *lhs, const cytnx_uint64 n,
                                             const TR rhs) {
  const cytnx_uint64 gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= n) return;
  out[gid] = CuAddDispatchOp<TO>(lhs[gid], rhs);
}

// tensor + tensor, both contiguous with identical layout: elementwise add.
// One thread per output element; threads past the tail return early.
template <typename TO, typename TL, typename TR>
__global__ void cuAdd_dispatch_tn_kernel(TO *out, const TL *lhs, const cytnx_uint64 n,
                                         const TR *rhs) {
  const cytnx_uint64 gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= n) return;
  out[gid] = CuAddDispatchOp<TO>(lhs[gid], rhs[gid]);
}

// tensor + tensor where the operands are permuted relative to the contiguous
// output. Each thread unravels its flat output index into per-axis
// coordinates using `accu_shape` (the output's row-major strides), then
// remaps those coordinates through `invmapper_L`/`invmapper_R` and the
// per-operand strides `old_accu_shapeL`/`old_accu_shapeR` to locate the
// matching source elements.
//
// Launch contract: dynamic shared memory must be at least
// blockDim.x * shapesize * sizeof(cytnx_uint64) bytes — one coordinate
// scratch row per thread; no cross-thread sharing, so no barrier is needed.
template <typename TO, typename TL, typename TR>
__global__ void cuAdd_dispatch_tn_kernel_nonconti(
  TO *out, const TL *lhs, const cytnx_uint64 n, const TR *rhs, const cytnx_uint64 *accu_shape,
  const cytnx_uint64 *old_accu_shapeL, const cytnx_uint64 *old_accu_shapeR,
  const cytnx_uint64 *invmapper_L, const cytnx_uint64 *invmapper_R,
  const cytnx_uint64 shapesize) {
  // Per-thread coordinate scratch in dynamic shared memory.
  extern __shared__ cytnx_uint64 tmpv[];

  const cytnx_uint64 idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    cytnx_uint64 tmp = idx;
    const cytnx_uint64 offset = threadIdx.x * shapesize;  // this thread's scratch row
    cytnx_uint64 Lidx = 0, Ridx = 0;

    // Unravel the flat index into per-axis coordinates, most-significant axis first.
    for (cytnx_uint64 j = 0; j < shapesize; j++) {
      tmpv[offset + j] = tmp / accu_shape[j];
      tmp = tmp % accu_shape[j];
    }
    // Re-linearize the coordinates against each operand's original layout.
    for (cytnx_uint64 j = 0; j < shapesize; j++) {
      Lidx += tmpv[offset + invmapper_L[j]] * old_accu_shapeL[j];
      Ridx += tmpv[offset + invmapper_R[j]] * old_accu_shapeR[j];
    }
    out[idx] = CuAddDispatchOp<TO>(lhs[Lidx], rhs[Ridx]);
  }
}

template <typename TL, typename TR>
void cuAdd_dispatch_typed(boost::intrusive_ptr<Storage_base> &out,
boost::intrusive_ptr<Storage_base> &Lin,
boost::intrusive_ptr<Storage_base> &Rin,
const unsigned long long &len,
const std::vector<cytnx_uint64> &shape,
const std::vector<cytnx_uint64> &invmapper_L,
const std::vector<cytnx_uint64> &invmapper_R) {
using TO = Type_class::type_promote_gpu_t<TL, TR>;
cytnx_error_msg(out->dtype() != Type_class::cy_typeid_gpu_v<TO>,
"[cuAdd_dispatch] output dtype mismatch. got=%d expected=%d%s",
out->dtype(), Type_class::cy_typeid_gpu_v<TO>, "\n");

TO *_out = reinterpret_cast<TO *>(out->data());
const TL *_Lin = reinterpret_cast<const TL *>(Lin->data());
const TR *_Rin = reinterpret_cast<const TR *>(Rin->data());

cytnx_uint32 NBlocks = len / 512;
if (len % 512) NBlocks += 1;

if (Lin->size() == 1 and Rin->size() == 1) {
cuAdd_dispatch_constconst_kernel<<<NBlocks, 512>>>(_out, _Lin[0], len, _Rin[0]);
} else if (Lin->size() == 1) {
cuAdd_dispatch_lconst_kernel<<<NBlocks, 512>>>(_out, _Lin[0], len, _Rin);
} else if (Rin->size() == 1) {
cuAdd_dispatch_rconst_kernel<<<NBlocks, 512>>>(_out, _Lin, len, _Rin[0]);
} else {
if (shape.size() == 0) {
cuAdd_dispatch_tn_kernel<<<NBlocks, 512>>>(_out, _Lin, len, _Rin);
} else {
cytnx_uint64 *m_accu_shape = reinterpret_cast<cytnx_uint64 *>(
utils_internal::cuCalloc_gpu(shape.size(), sizeof(cytnx_uint64)));
cytnx_uint64 *m_old_accu_shapeL = reinterpret_cast<cytnx_uint64 *>(
utils_internal::cuCalloc_gpu(shape.size(), sizeof(cytnx_uint64)));
cytnx_uint64 *m_old_accu_shapeR = reinterpret_cast<cytnx_uint64 *>(
utils_internal::cuCalloc_gpu(shape.size(), sizeof(cytnx_uint64)));
cytnx_uint64 *m_invmapper_L = reinterpret_cast<cytnx_uint64 *>(
utils_internal::cuMalloc_gpu(invmapper_L.size() * sizeof(cytnx_uint64)));
cytnx_uint64 *m_invmapper_R = reinterpret_cast<cytnx_uint64 *>(
utils_internal::cuMalloc_gpu(invmapper_R.size() * sizeof(cytnx_uint64)));

checkCudaErrors(cudaMemcpy(m_invmapper_L, &invmapper_L[0],
sizeof(cytnx_uint64) * invmapper_L.size(),
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(m_invmapper_R, &invmapper_R[0],
sizeof(cytnx_uint64) * invmapper_R.size(),
cudaMemcpyHostToDevice));

cytnx_uint64 tmp1 = 1, tmp2 = 1, tmp3 = 1;
for (cytnx_uint64 i = 0; i < shape.size(); i++) {
m_accu_shape[shape.size() - 1 - i] = tmp1;
tmp1 *= shape[shape.size() - 1 - i];

m_old_accu_shapeL[shape.size() - 1 - i] = tmp2;
tmp2 *= shape[invmapper_L[shape.size() - 1 - i]];

m_old_accu_shapeR[shape.size() - 1 - i] = tmp3;
tmp3 *= shape[invmapper_R[shape.size() - 1 - i]];
}

cuAdd_dispatch_tn_kernel_nonconti<<<NBlocks, 512,
512 * shape.size() * sizeof(cytnx_uint64)>>>(
_out, _Lin, len, _Rin, m_accu_shape, m_old_accu_shapeL, m_old_accu_shapeR,
m_invmapper_L, m_invmapper_R, shape.size());

checkCudaErrors(cudaFree(m_accu_shape));
checkCudaErrors(cudaFree(m_old_accu_shapeL));
checkCudaErrors(cudaFree(m_old_accu_shapeR));
checkCudaErrors(cudaFree(m_invmapper_L));
checkCudaErrors(cudaFree(m_invmapper_R));
}
}
}

// Second stage of the dtype double dispatch: with the left-hand type TL
// fixed, resolve the right-hand operand's runtime dtype to a concrete TR
// and forward to the fully typed implementation.
template <typename TL>
void cuAdd_dispatch_rhs(boost::intrusive_ptr<Storage_base> &out,
                        boost::intrusive_ptr<Storage_base> &Lin,
                        boost::intrusive_ptr<Storage_base> &Rin,
                        const unsigned long long &len, const std::vector<cytnx_uint64> &shape,
                        const std::vector<cytnx_uint64> &invmapper_L,
                        const std::vector<cytnx_uint64> &invmapper_R) {
  const auto rdt = Rin->dtype();
  if (rdt == Type.ComplexDouble) {
    cuAdd_dispatch_typed<TL, cuDoubleComplex>(out, Lin, Rin, len, shape, invmapper_L,
                                              invmapper_R);
  } else if (rdt == Type.ComplexFloat) {
    cuAdd_dispatch_typed<TL, cuComplex>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Double) {
    cuAdd_dispatch_typed<TL, cytnx_double>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Float) {
    cuAdd_dispatch_typed<TL, cytnx_float>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Int64) {
    cuAdd_dispatch_typed<TL, cytnx_int64>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Uint64) {
    cuAdd_dispatch_typed<TL, cytnx_uint64>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Int32) {
    cuAdd_dispatch_typed<TL, cytnx_int32>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Uint32) {
    cuAdd_dispatch_typed<TL, cytnx_uint32>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Int16) {
    cuAdd_dispatch_typed<TL, cytnx_int16>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Uint16) {
    cuAdd_dispatch_typed<TL, cytnx_uint16>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Bool) {
    cuAdd_dispatch_typed<TL, cytnx_bool>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else {
    cytnx_error_msg(true, "[cuAdd_dispatch] unsupported rhs dtype: %d%s", Rin->dtype(), "\n");
  }
}

} // namespace

// Public entry point for GPU elementwise Add with runtime-dtype double
// dispatch. Verifies that `out` already carries the promoted dtype of the
// two operands, then resolves the left operand's dtype here; the right
// operand is resolved by cuAdd_dispatch_rhs.
void cuAdd_dispatch(boost::intrusive_ptr<Storage_base> &out,
                    boost::intrusive_ptr<Storage_base> &Lin,
                    boost::intrusive_ptr<Storage_base> &Rin, const unsigned long long &len,
                    const std::vector<cytnx_uint64> &shape,
                    const std::vector<cytnx_uint64> &invmapper_L,
                    const std::vector<cytnx_uint64> &invmapper_R) {
  const unsigned int expected_dtype = Type.type_promote(Lin->dtype(), Rin->dtype());
  cytnx_error_msg(out->dtype() != expected_dtype,
                  "[cuAdd_dispatch] output dtype mismatch. got=%d expected=%d%s", out->dtype(),
                  expected_dtype, "\n");

  const auto ldt = Lin->dtype();
  if (ldt == Type.ComplexDouble) {
    cuAdd_dispatch_rhs<cuDoubleComplex>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.ComplexFloat) {
    cuAdd_dispatch_rhs<cuComplex>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Double) {
    cuAdd_dispatch_rhs<cytnx_double>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Float) {
    cuAdd_dispatch_rhs<cytnx_float>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Int64) {
    cuAdd_dispatch_rhs<cytnx_int64>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Uint64) {
    cuAdd_dispatch_rhs<cytnx_uint64>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Int32) {
    cuAdd_dispatch_rhs<cytnx_int32>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Uint32) {
    cuAdd_dispatch_rhs<cytnx_uint32>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Int16) {
    cuAdd_dispatch_rhs<cytnx_int16>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Uint16) {
    cuAdd_dispatch_rhs<cytnx_uint16>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Bool) {
    cuAdd_dispatch_rhs<cytnx_bool>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else {
    cytnx_error_msg(true, "[cuAdd_dispatch] unsupported lhs dtype: %d%s", Lin->dtype(), "\n");
  }
}

//====================================================================
// generic R+R kernel

Expand Down
7 changes: 7 additions & 0 deletions src/backend/linalg_internal_gpu/cuAdd_internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ namespace cytnx {
namespace linalg_internal {

/// cuAdd
/// @brief GPU elementwise Add with runtime-dtype double dispatch.
///
/// Resolves the dtypes of both operands at runtime and launches the matching
/// typed kernel (scalar broadcast when an operand's size is 1; permuted
/// tensor/tensor handled via the shape/invmapper metadata).
///
/// @param out  pre-allocated output storage; its dtype must equal
///             Type.type_promote(Lin->dtype(), Rin->dtype()).
/// @param Lin  left operand storage (size 1 is broadcast as a scalar).
/// @param Rin  right operand storage (size 1 is broadcast as a scalar).
/// @param len  number of output elements.
/// @param shape        output shape; empty means both operands are contiguous
///                     with identical layout.
/// @param invmapper_L  inverse axis permutation for the left operand
///                     (used only on the non-contiguous path).
/// @param invmapper_R  inverse axis permutation for the right operand.
void cuAdd_dispatch(boost::intrusive_ptr<Storage_base> &out,
                    boost::intrusive_ptr<Storage_base> &Lin,
                    boost::intrusive_ptr<Storage_base> &Rin, const unsigned long long &len,
                    const std::vector<cytnx_uint64> &shape,
                    const std::vector<cytnx_uint64> &invmapper_L,
                    const std::vector<cytnx_uint64> &invmapper_R);

void cuAdd_internal_cdtcd(boost::intrusive_ptr<Storage_base> &out,
boost::intrusive_ptr<Storage_base> &Lin,
boost::intrusive_ptr<Storage_base> &Rin,
Expand Down
Loading
Loading