Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions CytnxBKNDCMakeLists.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,33 @@ if(USE_CUDA)
# -gencode=arch=compute_75,code=compute_75 ")
target_compile_definitions(cytnx PUBLIC UNI_GPU)
target_include_directories(cytnx PRIVATE ${CUDAToolkit_INCLUDE_DIRS})
# CUDA 12+/13 may place Thrust/CUB headers under include/cccl.
# Build an ordered candidate list; the FIRST existing directory wins, so the
# toolkit target dir (most authoritative when defined) is probed before paths
# derived from CUDAToolkit_INCLUDE_DIRS.
set(_cytnx_cccl_candidates)
if(DEFINED CUDAToolkit_TARGET_DIR AND NOT "${CUDAToolkit_TARGET_DIR}" STREQUAL "")
list(APPEND _cytnx_cccl_candidates "${CUDAToolkit_TARGET_DIR}/include/cccl")
endif()
# Some toolkit layouts report include dirs nested below the install root
# (e.g. targets/<arch>/include), so also probe a few parent levels for an
# include/cccl sibling directory.
foreach(_cuda_inc IN LISTS CUDAToolkit_INCLUDE_DIRS)
list(APPEND _cytnx_cccl_candidates
"${_cuda_inc}/cccl"
"${_cuda_inc}/../include/cccl"
"${_cuda_inc}/../../include/cccl"
"${_cuda_inc}/../../../include/cccl")
endforeach()
list(REMOVE_DUPLICATES _cytnx_cccl_candidates)

# Normalize each candidate (collapsing the ../ segments) and keep the first
# one that actually exists on disk.
set(_cytnx_cccl_dir "")
foreach(_cccl_candidate IN LISTS _cytnx_cccl_candidates)
get_filename_component(_cccl_candidate_abs "${_cccl_candidate}" ABSOLUTE)
if(EXISTS "${_cccl_candidate_abs}")
set(_cytnx_cccl_dir "${_cccl_candidate_abs}")
break()
endif()
endforeach()
# Only extend the include path when a CCCL directory was found; older
# toolkits keep Thrust/CUB directly under include/ and need no extra entry.
if(NOT "${_cytnx_cccl_dir}" STREQUAL "")
target_include_directories(cytnx PRIVATE "${_cytnx_cccl_dir}")
message(STATUS "Detected CCCL headers at: ${_cytnx_cccl_dir}")
endif()

target_link_libraries(cytnx PUBLIC CUDA::toolkit)
target_link_libraries(cytnx PUBLIC CUDA::cudart CUDA::cublas CUDA::cusparse CUDA::curand CUDA::cusolver)
target_link_libraries(cytnx PUBLIC -lcudadevrt)
Expand Down
3,042 changes: 0 additions & 3,042 deletions src/backend/linalg_internal_cpu/Arithmetic_internal.cpp

This file was deleted.

805 changes: 0 additions & 805 deletions src/backend/linalg_internal_cpu/Arithmetic_internal.hpp

This file was deleted.

4 changes: 0 additions & 4 deletions src/backend/linalg_internal_cpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@ target_sources_local(cytnx
Norm_internal.hpp
Add_internal.hpp
iAdd_internal.hpp
Arithmetic_internal.hpp
iArithmetic_internal.hpp
Conj_inplace_internal.hpp
Cpr_internal.hpp
Diag_internal.hpp
Expand Down Expand Up @@ -46,8 +44,6 @@ target_sources_local(cytnx
memcpyTruncation.hpp

iAdd_internal.cpp
Arithmetic_internal.cpp
iArithmetic_internal.cpp
Conj_inplace_internal.cpp
Diag_internal.cpp
iDiv_internal.cpp
Expand Down
2,239 changes: 0 additions & 2,239 deletions src/backend/linalg_internal_cpu/iArithmetic_internal.cpp

This file was deleted.

755 changes: 0 additions & 755 deletions src/backend/linalg_internal_cpu/iArithmetic_internal.hpp

This file was deleted.

3 changes: 3 additions & 0 deletions src/backend/linalg_internal_gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ target_sources_local(cytnx

cuAbs_internal.cu
cuAdd_internal.cu
cuSub_dispatch.cu
cuMul_dispatch.cu
cuDiv_dispatch.cu
cuGer_internal.cu
cuArithmetic_internal.cu
cuConj_inplace_internal.cu
Expand Down
274 changes: 274 additions & 0 deletions src/backend/linalg_internal_gpu/cuAdd_internal.cu
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,280 @@ namespace cytnx {

namespace linalg_internal {

namespace {

// Widen an arbitrary real scalar to a cuDoubleComplex with zero imaginary part.
template <typename T>
__device__ inline cuDoubleComplex CuToComplexDouble(const T &value) {
  const cytnx_double re = static_cast<cytnx_double>(value);
  return make_cuDoubleComplex(re, 0.0);
}

// Already double-precision complex: pass through unchanged.
__device__ inline cuDoubleComplex CuToComplexDouble(const cuDoubleComplex &value) {
  return value;
}

// Promote single-precision complex to double precision.
__device__ inline cuDoubleComplex CuToComplexDouble(const cuComplex &value) {
  return cuComplexFloatToDouble(value);
}

// Convert an arbitrary real scalar to a cuComplex with zero imaginary part.
template <typename T>
__device__ inline cuComplex CuToComplexFloat(const T &value) {
  const cytnx_float re = static_cast<cytnx_float>(value);
  return make_cuFloatComplex(re, 0.0f);
}

// Already single-precision complex: pass through unchanged.
__device__ inline cuComplex CuToComplexFloat(const cuComplex &value) { return value; }

// Demote double-precision complex to single precision (narrowing casts).
__device__ inline cuComplex CuToComplexFloat(const cuDoubleComplex &value) {
  const cytnx_float re = static_cast<cytnx_float>(cuCreal(value));
  const cytnx_float im = static_cast<cytnx_float>(cuCimag(value));
  return make_cuFloatComplex(re, im);
}

// Add two (possibly mixed-type) operands, producing the promoted type TO.
// Complex outputs route through the cuComplex add intrinsics after widening
// both sides; real outputs use plain casted addition. Exactly one branch
// survives instantiation thanks to `if constexpr`.
template <typename TO, typename TL, typename TR>
__device__ inline TO CuAddDispatchOp(const TL &lhs, const TR &rhs) {
  if constexpr (std::is_same_v<TO, cuComplex>) {
    return cuCaddf(CuToComplexFloat(lhs), CuToComplexFloat(rhs));
  } else if constexpr (std::is_same_v<TO, cuDoubleComplex>) {
    return cuCadd(CuToComplexDouble(lhs), CuToComplexDouble(rhs));
  } else {
    return static_cast<TO>(lhs) + static_cast<TO>(rhs);
  }
}

// scalar + scalar: broadcast the single sum into out[0..n). One thread per
// output element; threads past the tail return early.
template <typename TO, typename TL, typename TR>
__global__ void cuAdd_dispatch_constconst_kernel(TO *out, const TL lhs, const cytnx_uint64 n,
                                                 const TR rhs) {
  const cytnx_uint64 gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= n) return;
  out[gid] = CuAddDispatchOp<TO>(lhs, rhs);
}

// scalar + tensor: broadcast lhs against each rhs element. One thread per
// output element; threads past the tail return early.
template <typename TO, typename TL, typename TR>
__global__ void cuAdd_dispatch_lconst_kernel(TO *out, const TL lhs, const cytnx_uint64 n,
                                             const TR *rhs) {
  const cytnx_uint64 gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= n) return;
  out[gid] = CuAddDispatchOp<TO>(lhs, rhs[gid]);
}

// tensor + scalar: broadcast rhs against each lhs element. One thread per
// output element; threads past the tail return early.
template <typename TO, typename TL, typename TR>
__global__ void cuAdd_dispatch_rconst_kernel(TO *out, const TL *lhs, const cytnx_uint64 n,
                                             const TR rhs) {
  const cytnx_uint64 gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= n) return;
  out[gid] = CuAddDispatchOp<TO>(lhs[gid], rhs);
}

// tensor + tensor, both contiguous with identical layout: elementwise add.
// One thread per output element; threads past the tail return early.
template <typename TO, typename TL, typename TR>
__global__ void cuAdd_dispatch_tn_kernel(TO *out, const TL *lhs, const cytnx_uint64 n,
                                         const TR *rhs) {
  const cytnx_uint64 gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= n) return;
  out[gid] = CuAddDispatchOp<TO>(lhs[gid], rhs[gid]);
}

// tensor + tensor where the operands are permuted relative to the contiguous
// output. Each thread unravels its flat output index into per-axis
// coordinates using `accu_shape` (the output's row-major strides), then
// remaps those coordinates through `invmapper_L`/`invmapper_R` and the
// per-operand strides `old_accu_shapeL`/`old_accu_shapeR` to locate the
// matching source elements.
//
// Launch contract: dynamic shared memory must be at least
// blockDim.x * shapesize * sizeof(cytnx_uint64) bytes — one coordinate
// scratch row per thread; no cross-thread sharing, so no barrier is needed.
template <typename TO, typename TL, typename TR>
__global__ void cuAdd_dispatch_tn_kernel_nonconti(
  TO *out, const TL *lhs, const cytnx_uint64 n, const TR *rhs, const cytnx_uint64 *accu_shape,
  const cytnx_uint64 *old_accu_shapeL, const cytnx_uint64 *old_accu_shapeR,
  const cytnx_uint64 *invmapper_L, const cytnx_uint64 *invmapper_R,
  const cytnx_uint64 shapesize) {
  // Per-thread coordinate scratch in dynamic shared memory.
  extern __shared__ cytnx_uint64 tmpv[];

  const cytnx_uint64 idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    cytnx_uint64 tmp = idx;
    const cytnx_uint64 offset = threadIdx.x * shapesize;  // this thread's scratch row
    cytnx_uint64 Lidx = 0, Ridx = 0;

    // Unravel the flat index into per-axis coordinates, most-significant axis first.
    for (cytnx_uint64 j = 0; j < shapesize; j++) {
      tmpv[offset + j] = tmp / accu_shape[j];
      tmp = tmp % accu_shape[j];
    }
    // Re-linearize the coordinates against each operand's original layout.
    for (cytnx_uint64 j = 0; j < shapesize; j++) {
      Lidx += tmpv[offset + invmapper_L[j]] * old_accu_shapeL[j];
      Ridx += tmpv[offset + invmapper_R[j]] * old_accu_shapeR[j];
    }
    out[idx] = CuAddDispatchOp<TO>(lhs[Lidx], rhs[Ridx]);
  }
}

template <typename TL, typename TR>
void cuAdd_dispatch_typed(boost::intrusive_ptr<Storage_base> &out,
boost::intrusive_ptr<Storage_base> &Lin,
boost::intrusive_ptr<Storage_base> &Rin,
const unsigned long long &len,
const std::vector<cytnx_uint64> &shape,
const std::vector<cytnx_uint64> &invmapper_L,
const std::vector<cytnx_uint64> &invmapper_R) {
using TO = Type_class::type_promote_gpu_t<TL, TR>;
cytnx_error_msg(out->dtype() != Type_class::cy_typeid_gpu_v<TO>,
"[cuAdd_dispatch] output dtype mismatch. got=%d expected=%d%s",
out->dtype(), Type_class::cy_typeid_gpu_v<TO>, "\n");

TO *_out = reinterpret_cast<TO *>(out->data());
const TL *_Lin = reinterpret_cast<const TL *>(Lin->data());
const TR *_Rin = reinterpret_cast<const TR *>(Rin->data());

cytnx_uint32 NBlocks = len / 512;
if (len % 512) NBlocks += 1;

if (Lin->size() == 1 and Rin->size() == 1) {
cuAdd_dispatch_constconst_kernel<<<NBlocks, 512>>>(_out, _Lin[0], len, _Rin[0]);
} else if (Lin->size() == 1) {
cuAdd_dispatch_lconst_kernel<<<NBlocks, 512>>>(_out, _Lin[0], len, _Rin);
} else if (Rin->size() == 1) {
cuAdd_dispatch_rconst_kernel<<<NBlocks, 512>>>(_out, _Lin, len, _Rin[0]);
} else {
if (shape.size() == 0) {
cuAdd_dispatch_tn_kernel<<<NBlocks, 512>>>(_out, _Lin, len, _Rin);
} else {
cytnx_uint64 *m_accu_shape = reinterpret_cast<cytnx_uint64 *>(
utils_internal::cuCalloc_gpu(shape.size(), sizeof(cytnx_uint64)));
cytnx_uint64 *m_old_accu_shapeL = reinterpret_cast<cytnx_uint64 *>(
utils_internal::cuCalloc_gpu(shape.size(), sizeof(cytnx_uint64)));
cytnx_uint64 *m_old_accu_shapeR = reinterpret_cast<cytnx_uint64 *>(
utils_internal::cuCalloc_gpu(shape.size(), sizeof(cytnx_uint64)));
cytnx_uint64 *m_invmapper_L = reinterpret_cast<cytnx_uint64 *>(
utils_internal::cuMalloc_gpu(invmapper_L.size() * sizeof(cytnx_uint64)));
cytnx_uint64 *m_invmapper_R = reinterpret_cast<cytnx_uint64 *>(
utils_internal::cuMalloc_gpu(invmapper_R.size() * sizeof(cytnx_uint64)));

checkCudaErrors(cudaMemcpy(m_invmapper_L, &invmapper_L[0],
sizeof(cytnx_uint64) * invmapper_L.size(),
cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(m_invmapper_R, &invmapper_R[0],
sizeof(cytnx_uint64) * invmapper_R.size(),
cudaMemcpyHostToDevice));

cytnx_uint64 tmp1 = 1, tmp2 = 1, tmp3 = 1;
for (cytnx_uint64 i = 0; i < shape.size(); i++) {
m_accu_shape[shape.size() - 1 - i] = tmp1;
tmp1 *= shape[shape.size() - 1 - i];

m_old_accu_shapeL[shape.size() - 1 - i] = tmp2;
tmp2 *= shape[invmapper_L[shape.size() - 1 - i]];

m_old_accu_shapeR[shape.size() - 1 - i] = tmp3;
tmp3 *= shape[invmapper_R[shape.size() - 1 - i]];
}

cuAdd_dispatch_tn_kernel_nonconti<<<NBlocks, 512,
512 * shape.size() * sizeof(cytnx_uint64)>>>(
_out, _Lin, len, _Rin, m_accu_shape, m_old_accu_shapeL, m_old_accu_shapeR,
m_invmapper_L, m_invmapper_R, shape.size());

checkCudaErrors(cudaFree(m_accu_shape));
checkCudaErrors(cudaFree(m_old_accu_shapeL));
checkCudaErrors(cudaFree(m_old_accu_shapeR));
checkCudaErrors(cudaFree(m_invmapper_L));
checkCudaErrors(cudaFree(m_invmapper_R));
}
}
}

// Second stage of the dtype double dispatch: with the left-hand type TL
// fixed, resolve the right-hand operand's runtime dtype to a concrete TR
// and forward to the fully typed implementation.
template <typename TL>
void cuAdd_dispatch_rhs(boost::intrusive_ptr<Storage_base> &out,
                        boost::intrusive_ptr<Storage_base> &Lin,
                        boost::intrusive_ptr<Storage_base> &Rin,
                        const unsigned long long &len, const std::vector<cytnx_uint64> &shape,
                        const std::vector<cytnx_uint64> &invmapper_L,
                        const std::vector<cytnx_uint64> &invmapper_R) {
  const auto rdt = Rin->dtype();
  if (rdt == Type.ComplexDouble) {
    cuAdd_dispatch_typed<TL, cuDoubleComplex>(out, Lin, Rin, len, shape, invmapper_L,
                                              invmapper_R);
  } else if (rdt == Type.ComplexFloat) {
    cuAdd_dispatch_typed<TL, cuComplex>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Double) {
    cuAdd_dispatch_typed<TL, cytnx_double>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Float) {
    cuAdd_dispatch_typed<TL, cytnx_float>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Int64) {
    cuAdd_dispatch_typed<TL, cytnx_int64>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Uint64) {
    cuAdd_dispatch_typed<TL, cytnx_uint64>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Int32) {
    cuAdd_dispatch_typed<TL, cytnx_int32>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Uint32) {
    cuAdd_dispatch_typed<TL, cytnx_uint32>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Int16) {
    cuAdd_dispatch_typed<TL, cytnx_int16>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Uint16) {
    cuAdd_dispatch_typed<TL, cytnx_uint16>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (rdt == Type.Bool) {
    cuAdd_dispatch_typed<TL, cytnx_bool>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else {
    cytnx_error_msg(true, "[cuAdd_dispatch] unsupported rhs dtype: %d%s", Rin->dtype(), "\n");
  }
}

} // namespace

// Public entry point for GPU elementwise Add with runtime-dtype double
// dispatch. Verifies that `out` already carries the promoted dtype of the
// two operands, then resolves the left operand's dtype here; the right
// operand is resolved by cuAdd_dispatch_rhs.
void cuAdd_dispatch(boost::intrusive_ptr<Storage_base> &out,
                    boost::intrusive_ptr<Storage_base> &Lin,
                    boost::intrusive_ptr<Storage_base> &Rin, const unsigned long long &len,
                    const std::vector<cytnx_uint64> &shape,
                    const std::vector<cytnx_uint64> &invmapper_L,
                    const std::vector<cytnx_uint64> &invmapper_R) {
  const unsigned int expected_dtype = Type.type_promote(Lin->dtype(), Rin->dtype());
  cytnx_error_msg(out->dtype() != expected_dtype,
                  "[cuAdd_dispatch] output dtype mismatch. got=%d expected=%d%s", out->dtype(),
                  expected_dtype, "\n");

  const auto ldt = Lin->dtype();
  if (ldt == Type.ComplexDouble) {
    cuAdd_dispatch_rhs<cuDoubleComplex>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.ComplexFloat) {
    cuAdd_dispatch_rhs<cuComplex>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Double) {
    cuAdd_dispatch_rhs<cytnx_double>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Float) {
    cuAdd_dispatch_rhs<cytnx_float>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Int64) {
    cuAdd_dispatch_rhs<cytnx_int64>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Uint64) {
    cuAdd_dispatch_rhs<cytnx_uint64>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Int32) {
    cuAdd_dispatch_rhs<cytnx_int32>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Uint32) {
    cuAdd_dispatch_rhs<cytnx_uint32>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Int16) {
    cuAdd_dispatch_rhs<cytnx_int16>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Uint16) {
    cuAdd_dispatch_rhs<cytnx_uint16>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else if (ldt == Type.Bool) {
    cuAdd_dispatch_rhs<cytnx_bool>(out, Lin, Rin, len, shape, invmapper_L, invmapper_R);
  } else {
    cytnx_error_msg(true, "[cuAdd_dispatch] unsupported lhs dtype: %d%s", Lin->dtype(), "\n");
  }
}

//====================================================================
// generic R+R kernel

Expand Down
7 changes: 7 additions & 0 deletions src/backend/linalg_internal_gpu/cuAdd_internal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,13 @@ namespace cytnx {
namespace linalg_internal {

/// cuAdd
/// @brief GPU elementwise Add with runtime-dtype double dispatch.
///
/// Resolves the dtypes of both operands at runtime and launches the matching
/// typed kernel (scalar broadcast when an operand's size is 1; permuted
/// tensor/tensor handled via the shape/invmapper metadata).
///
/// @param out  pre-allocated output storage; its dtype must equal
///             Type.type_promote(Lin->dtype(), Rin->dtype()).
/// @param Lin  left operand storage (size 1 is broadcast as a scalar).
/// @param Rin  right operand storage (size 1 is broadcast as a scalar).
/// @param len  number of output elements.
/// @param shape        output shape; empty means both operands are contiguous
///                     with identical layout.
/// @param invmapper_L  inverse axis permutation for the left operand
///                     (used only on the non-contiguous path).
/// @param invmapper_R  inverse axis permutation for the right operand.
void cuAdd_dispatch(boost::intrusive_ptr<Storage_base> &out,
                    boost::intrusive_ptr<Storage_base> &Lin,
                    boost::intrusive_ptr<Storage_base> &Rin, const unsigned long long &len,
                    const std::vector<cytnx_uint64> &shape,
                    const std::vector<cytnx_uint64> &invmapper_L,
                    const std::vector<cytnx_uint64> &invmapper_R);

void cuAdd_internal_cdtcd(boost::intrusive_ptr<Storage_base> &out,
boost::intrusive_ptr<Storage_base> &Lin,
boost::intrusive_ptr<Storage_base> &Rin,
Expand Down
Loading
Loading