Wrapper for Blocksparse CuTensor code#3057
Conversation
… to make it a union type of CuTensorBS and AbstractArray?
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.diff --git a/lib/cutensor/src/blocksparse/interfaces.jl b/lib/cutensor/src/blocksparse/interfaces.jl
index c6eef0e5b..0a479ddf8 100644
--- a/lib/cutensor/src/blocksparse/interfaces.jl
+++ b/lib/cutensor/src/blocksparse/interfaces.jl
@@ -1,4 +1,4 @@
-## For now call contract in ITensor and rely on UnallocatedArrays to make
+## For now call contract in ITensor and rely on UnallocatedArrays to make
## C in a dry-run of the contraction.
# function Base.:(*)(A::CuTensorBS, B::CuTensorBs)
# tC = promote_type(eltype(A), eltype(B))
@@ -18,11 +18,13 @@
using LinearAlgebra
function LinearAlgebra.mul!(C::CuTensorBS, A::CuTensorBS, B::CuTensorBS, α::Number, β::Number)
- contract!(α,
- A, A.inds, CUTENSOR_OP_IDENTITY,
- B, B.inds, CUTENSOR_OP_IDENTITY,
- β,
- C, C.inds, CUTENSOR_OP_IDENTITY,
- CUTENSOR_OP_IDENTITY; jit=CUTENSOR_JIT_MODE_DEFAULT)
- return C
-end
\ No newline at end of file
+ contract!(
+ α,
+ A, A.inds, CUTENSOR_OP_IDENTITY,
+ B, B.inds, CUTENSOR_OP_IDENTITY,
+ β,
+ C, C.inds, CUTENSOR_OP_IDENTITY,
+ CUTENSOR_OP_IDENTITY; jit = CUTENSOR_JIT_MODE_DEFAULT
+ )
+ return C
+end
diff --git a/lib/cutensor/src/blocksparse/operations.jl b/lib/cutensor/src/blocksparse/operations.jl
index 19542e5de..0f98c92ef 100644
--- a/lib/cutensor/src/blocksparse/operations.jl
+++ b/lib/cutensor/src/blocksparse/operations.jl
@@ -9,23 +9,26 @@ function contract!(
@nospecialize(beta::Number),
@nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
opOut::cutensorOperator_t;
- jit::cutensorJitMode_t=JIT_MODE_NONE,
- workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
- algo::cutensorAlgo_t=ALGO_DEFAULT,
- compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing,
- plan::Union{CuTensorPlan, Nothing}=nothing)
+ jit::cutensorJitMode_t = JIT_MODE_NONE,
+ workspace::cutensorWorksizePreference_t = WORKSPACE_DEFAULT,
+ algo::cutensorAlgo_t = ALGO_DEFAULT,
+ compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing} = nothing,
+ plan::Union{CuTensorPlan, Nothing} = nothing
+ )
actual_plan = if plan === nothing
- plan_contraction(A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut;
- jit, workspace, algo, compute_type)
+ plan_contraction(
+ A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut;
+ jit, workspace, algo, compute_type
+ )
else
plan
end
contractBS!(actual_plan, alpha, nonzero_blocks(A), nonzero_blocks(B), beta, nonzero_blocks(C))
-
+
if plan === nothing
- CUDA.unsafe_free!(actual_plan)
+ CUDA.unsafe_free!(actual_plan)
end
return C
@@ -33,12 +36,14 @@ end
## This function assumes A, B, and C are Arrays of pointers to CuArrays.
## Please overwrite the `nonzero_blocks` function for your datatype to access this function from contract!
-function contractBS!(plan::CuTensorPlan,
- @nospecialize(alpha::Number),
- @nospecialize(A::AbstractArray),
- @nospecialize(B::AbstractArray),
- @nospecialize(beta::Number),
- @nospecialize(C::AbstractArray))
+function contractBS!(
+ plan::CuTensorPlan,
+ @nospecialize(alpha::Number),
+ @nospecialize(A::AbstractArray),
+ @nospecialize(B::AbstractArray),
+ @nospecialize(beta::Number),
+ @nospecialize(C::AbstractArray)
+ )
scalar_type = plan.scalar_type
# Extract GPU pointers from each CuArray block
@@ -46,11 +51,13 @@ function contractBS!(plan::CuTensorPlan,
A_ptrs = CuPtr{Cvoid}[pointer(block) for block in A]
B_ptrs = CuPtr{Cvoid}[pointer(block) for block in B]
C_ptrs = CuPtr{Cvoid}[pointer(block) for block in C]
-
- cutensorBlockSparseContract(handle(), plan,
- Ref{scalar_type}(alpha), A_ptrs, B_ptrs,
- Ref{scalar_type}(beta), C_ptrs, C_ptrs,
- plan.workspace, sizeof(plan.workspace), stream())
+
+ cutensorBlockSparseContract(
+ handle(), plan,
+ Ref{scalar_type}(alpha), A_ptrs, B_ptrs,
+ Ref{scalar_type}(beta), C_ptrs, C_ptrs,
+ plan.workspace, sizeof(plan.workspace), stream()
+ )
synchronize(stream())
return C
end
@@ -60,21 +67,22 @@ function plan_contraction(
@nospecialize(B), Binds::ModeType, opB::cutensorOperator_t,
@nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
opOut::cutensorOperator_t;
- jit::cutensorJitMode_t=JIT_MODE_NONE,
- workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
- algo::cutensorAlgo_t=ALGO_DEFAULT,
- compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing)
+ jit::cutensorJitMode_t = JIT_MODE_NONE,
+ workspace::cutensorWorksizePreference_t = WORKSPACE_DEFAULT,
+ algo::cutensorAlgo_t = ALGO_DEFAULT,
+ compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing} = nothing
+ )
!is_unary(opA) && throw(ArgumentError("opA must be a unary op!"))
!is_unary(opB) && throw(ArgumentError("opB must be a unary op!"))
!is_unary(opC) && throw(ArgumentError("opC must be a unary op!"))
!is_unary(opOut) && throw(ArgumentError("opOut must be a unary op!"))
-
+
descA = CuTensorBSDescriptor(A)
descB = CuTensorBSDescriptor(B)
descC = CuTensorBSDescriptor(C)
# for now, D must be identical to C (and thus, descD must be identical to descC)
-
+
modeA = collect(Cint, Ainds)
modeB = collect(Cint, Binds)
modeC = collect(Cint, Cinds)
@@ -87,17 +95,19 @@ function plan_contraction(
desc = Ref{cutensorOperationDescriptor_t}()
- cutensorCreateBlockSparseContraction(handle(),
- desc,
- descA, modeA, opA,
- descB, modeB, opB,
- descC, modeC, opC,
- descC, modeC, actual_compute_type)
+ cutensorCreateBlockSparseContraction(
+ handle(),
+ desc,
+ descA, modeA, opA,
+ descB, modeB, opB,
+ descC, modeC, opC,
+ descC, modeC, actual_compute_type
+ )
plan_pref = Ref{cutensorPlanPreference_t}()
cutensorCreatePlanPreference(handle(), plan_pref, algo, jit)
- plan = CuTensorPlan(desc[], plan_pref[]; workspacePref=workspace)
+ plan = CuTensorPlan(desc[], plan_pref[]; workspacePref = workspace)
# cutensorDestroyOperationDescriptor(desc[])
cutensorDestroyPlanPreference(plan_pref[])
return plan
diff --git a/lib/cutensor/src/blocksparse/types.jl b/lib/cutensor/src/blocksparse/types.jl
index 292dc4d00..41cbebdbd 100644
--- a/lib/cutensor/src/blocksparse/types.jl
+++ b/lib/cutensor/src/blocksparse/types.jl
@@ -12,20 +12,26 @@ mutable struct CuTensorBS{T, N}
## This expects a Vector{Tuple(Int)} right now
nonzero_block_coords
- function CuTensorBS{T, N}(nonzero_data::Vector{<:CuArray},
- blocks_per_mode::Vector{Int}, block_extents, nonzero_block_coords, inds::Vector) where {T<:Number, N}
+ function CuTensorBS{T, N}(
+ nonzero_data::Vector{<:CuArray},
+ blocks_per_mode::Vector{Int}, block_extents, nonzero_block_coords, inds::Vector
+ ) where {T <: Number, N}
CuArrayT = eltype(nonzero_data)
@assert eltype(CuArrayT) == T
# @assert ndims(CuArrayT) == N
@assert length(block_extents) == N
- new(nonzero_data, inds, blocks_per_mode, block_extents, nonzero_block_coords)
+ return new(nonzero_data, inds, blocks_per_mode, block_extents, nonzero_block_coords)
end
end
-function CuTensorBS(nonzero_data::Vector{<:CuArray{T}},
- blocks_per_mode, block_extents, nonzero_block_coords, inds::Vector) where {T<:Number}
- CuTensorBS{T,length(block_extents)}(nonzero_data,
- blocks_per_mode, block_extents, nonzero_block_coords, inds)
+function CuTensorBS(
+ nonzero_data::Vector{<:CuArray{T}},
+ blocks_per_mode, block_extents, nonzero_block_coords, inds::Vector
+ ) where {T <: Number}
+ return CuTensorBS{T, length(block_extents)}(
+ nonzero_data,
+ blocks_per_mode, block_extents, nonzero_block_coords, inds
+ )
end
# array interface
function Base.size(T::CuTensorBS)
@@ -39,8 +45,8 @@ Base.strides(T::CuTensorBS) = vcat([[st...] for st in strides.(T.nonzero_data)].
Base.eltype(T::CuTensorBS) = eltype(eltype(T.nonzero_data))
function block_extents(T::CuTensorBS)
- extents = Vector{Int64}()
-
+ extents = Vector{Int64}()
+
for ex in T.block_extents
extents = vcat(extents, ex...)
end
@@ -66,18 +72,21 @@ mutable struct CuTensorBSDescriptor
handle::cutensorBlockSparseTensorDescriptor_t
# inner constructor handles creation and finalizer of the descriptor
function CuTensorBSDescriptor(
- numModes,
- numNonZeroBlocks,
- numSectionsPerMode,
- extent,
- nonZeroCoordinates,
- stride,
- eltype)
+ numModes,
+ numNonZeroBlocks,
+ numSectionsPerMode,
+ extent,
+ nonZeroCoordinates,
+ stride,
+ eltype
+ )
desc = Ref{cuTENSOR.cutensorBlockSparseTensorDescriptor_t}()
- cutensorCreateBlockSparseTensorDescriptor(handle(), desc,
- numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates,
- stride, eltype)
+ cutensorCreateBlockSparseTensorDescriptor(
+ handle(), desc,
+ numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates,
+ stride, eltype
+ )
obj = new(desc[])
finalizer(unsafe_destroy!, obj)
@@ -86,12 +95,13 @@ mutable struct CuTensorBSDescriptor
end
function CuTensorBSDescriptor(
- numModes,
- numNonZeroBlocks,
- numSectionsPerMode,
- extent,
- nonZeroCoordinates,
- eltype)
+ numModes,
+ numNonZeroBlocks,
+ numSectionsPerMode,
+ extent,
+ nonZeroCoordinates,
+ eltype
+ )
return CuTensorBSDescriptor(numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates, C_NULL, eltype)
end
@@ -101,7 +111,7 @@ Base.show(io::IO, desc::CuTensorBSDescriptor) = @printf(io, "CuTensorBSDescripto
Base.unsafe_convert(::Type{cutensorBlockSparseTensorDescriptor_t}, obj::CuTensorBSDescriptor) = obj.handle
function unsafe_destroy!(obj::CuTensorBSDescriptor)
- cutensorDestroyBlockSparseTensorDescriptor(obj)
+ return cutensorDestroyBlockSparseTensorDescriptor(obj)
end
## Descriptor function for CuTensorBS type. Please overwrite for custom objects
@@ -110,11 +120,13 @@ function CuTensorBSDescriptor(A::CuTensorBS)
numNonZeroBlocks = Int64(length(A.nonzero_block_coords))
numSectionsPerMode = collect(Int32, A.blocks_per_mode)
extent = block_extents(A)
- nonZeroCoordinates = Int32.(vcat([[x...] for x in A.nonzero_block_coords]...) .- 1)
+ nonZeroCoordinates = Int32.(vcat([[x...] for x in A.nonzero_block_coords]...) .- 1)
st = strides(A)
- dataType = eltype(A)#convert(cuTENSOR.cutensorDataType_t, eltype(A))
+ dataType = eltype(A) #convert(cuTENSOR.cutensorDataType_t, eltype(A))
## Right now assume stride is NULL. I am not sure if stride works, need to discuss with cuTENSOR team.
- CuTensorBSDescriptor(numModes, numNonZeroBlocks,
- numSectionsPerMode, extent, nonZeroCoordinates, dataType)
+ return CuTensorBSDescriptor(
+ numModes, numNonZeroBlocks,
+ numSectionsPerMode, extent, nonZeroCoordinates, dataType
+ )
end
diff --git a/lib/cutensor/src/libcutensor.jl b/lib/cutensor/src/libcutensor.jl
index a73ab654a..c83a6728f 100644
--- a/lib/cutensor/src/libcutensor.jl
+++ b/lib/cutensor/src/libcutensor.jl
@@ -545,12 +545,12 @@ end
@gcsafe_ccall libcutensor.cutensorBlockSparseContract(handle::cutensorHandle_t,
plan::cutensorPlan_t,
alpha::Ptr{Cvoid},
- A::Ptr{Cvoid},
- B::Ptr{Cvoid},
+ A::Ptr{Cvoid},
+ B::Ptr{Cvoid},
beta::Ptr{Cvoid},
- C::Ptr{Cvoid},
- D::Ptr{Cvoid},
- workspace::CuPtr{Cvoid},
+ C::Ptr{Cvoid},
+ D::Ptr{Cvoid},
+ workspace::CuPtr{Cvoid},
workspaceSize::UInt64,
stream::cudaStream_t)::cutensorStatus_t
end
diff --git a/lib/cutensor/test/contractions.jl b/lib/cutensor/test/contractions.jl
index 636600a74..baf56949a 100644
--- a/lib/cutensor/test/contractions.jl
+++ b/lib/cutensor/test/contractions.jl
@@ -188,62 +188,73 @@ end
end
end
-eltypes_compact = [
- (Float32, Float32, Float32, Float32),
- (ComplexF32, ComplexF32, ComplexF32, Float32),
- (Float64, Float64, Float64, Float64),
- (ComplexF64, ComplexF64, ComplexF64, Float64)
-]
-@testset "Blocksparse Contraction" begin
- ## There are many unsupported types because this is a new functionality
- ## So I will test with Float32 and ComplexF32 only
- @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes_compact
- ## i = [20,20,25]
- ## k = [10,10,15]
- ## l = [30,30,35]
- ## A = Tensor(k,i,l)
- ## Nonzero blocks are
- ## [1,1,1], [1,1,3], [1,3,1], [1,3,3], [3,1,1], [3,1,3], [3,3,1], [3,3,3]
- A = Vector{CuArray{eltyA, 3}}()
- for k in [10,15]
- for i in [20,25]
- for l in [30,35]
- push!(A, CuArray(ones(eltyA, k,i,l)))
+ eltypes_compact = [
+ (Float32, Float32, Float32, Float32),
+ (ComplexF32, ComplexF32, ComplexF32, Float32),
+ (Float64, Float64, Float64, Float64),
+ (ComplexF64, ComplexF64, ComplexF64, Float64),
+ ]
+ @testset "Blocksparse Contraction" begin
+ ## There are many unsupported types because this is a new functionality
+ ## So I will test with Float32 and ComplexF32 only
+ @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes_compact
+ ## i = [20,20,25]
+ ## k = [10,10,15]
+ ## l = [30,30,35]
+ ## A = Tensor(k,i,l)
+ ## Nonzero blocks are
+ ## [1,1,1], [1,1,3], [1,3,1], [1,3,3], [3,1,1], [3,1,3], [3,3,1], [3,3,3]
+ A = Vector{CuArray{eltyA, 3}}()
+ for k in [10, 15]
+ for i in [20, 25]
+ for l in [30, 35]
+ push!(A, CuArray(ones(eltyA, k, i, l)))
+ end
end
end
- end
- ## B = Tensor(k,l)
- ## Nonzero blocks are
- ## [1,1], [2,3]
- B = Array{CuArray{eltyB, 2}}(
- [CuArray(randn(eltyB, 10, 30)),
- CuArray(randn(eltyB, 10, 35))])
-
- ## C = Tensor(i)
- ## Nonzero blocks are
- ## [1,], [3,]
- C = Vector{CuArray{eltyC, 1}}(
- [CuArray(zeros(eltyC, 20)),
- CuArray(zeros(eltyC, 25))]
- )
-
- cuTenA = cuTENSOR.CuTensorBS(A, [3,3,3],
- [(10,10,15), (20,20,25), (30,30,35)],
- [(1,1,1), (1,1,3), (1,3,1), (1,3,3), (3,1,1), (3,1,3), (3,3,1), (3,3,3)],
- [1,3,2])
- cuTenB = cuTENSOR.CuTensorBS(B, [3,3],
- [(10,10,15), (30,30,35)],
- [(1,1),(2,3)], [1,2], )
- cuTenC = cuTENSOR.CuTensorBS(C, [3],
- [(20,20,25)],[(1,),(3,)], [3])
-
- mul!(cuTenC, cuTenA, cuTenB, 1, 0)
- ## C[1] = A[1,1,1] * B[1,1]
- @test C[1] ≈ reshape(permutedims(A[1], (2,1,3)), (20, 10 * 30)) * reshape(B[1], (10 * 30))
- ## C[3] = A[1,3,1] * B[1,1]
- @test C[2] ≈ reshape(permutedims(A[3], (2,1,3)), (25, 10 * 30)) * reshape(B[1], (10 * 30))
+ ## B = Tensor(k,l)
+ ## Nonzero blocks are
+ ## [1,1], [2,3]
+ B = Array{CuArray{eltyB, 2}}(
+ [
+ CuArray(randn(eltyB, 10, 30)),
+ CuArray(randn(eltyB, 10, 35)),
+ ]
+ )
+
+ ## C = Tensor(i)
+ ## Nonzero blocks are
+ ## [1,], [3,]
+ C = Vector{CuArray{eltyC, 1}}(
+ [
+ CuArray(zeros(eltyC, 20)),
+ CuArray(zeros(eltyC, 25)),
+ ]
+ )
+
+ cuTenA = cuTENSOR.CuTensorBS(
+ A, [3, 3, 3],
+ [(10, 10, 15), (20, 20, 25), (30, 30, 35)],
+ [(1, 1, 1), (1, 1, 3), (1, 3, 1), (1, 3, 3), (3, 1, 1), (3, 1, 3), (3, 3, 1), (3, 3, 3)],
+ [1, 3, 2]
+ )
+ cuTenB = cuTENSOR.CuTensorBS(
+ B, [3, 3],
+ [(10, 10, 15), (30, 30, 35)],
+ [(1, 1), (2, 3)], [1, 2],
+ )
+ cuTenC = cuTENSOR.CuTensorBS(
+ C, [3],
+ [(20, 20, 25)], [(1,), (3,)], [3]
+ )
+
+ mul!(cuTenC, cuTenA, cuTenB, 1, 0)
+ ## C[1] = A[1,1,1] * B[1,1]
+ @test C[1] ≈ reshape(permutedims(A[1], (2, 1, 3)), (20, 10 * 30)) * reshape(B[1], (10 * 30))
+ ## C[3] = A[1,3,1] * B[1,1]
+ @test C[2] ≈ reshape(permutedims(A[3], (2, 1, 3)), (25, 10 * 30)) * reshape(B[1], (10 * 30))
+ end
end
-end
end |
|
There were some issues in Clang.jl's conversion of the cuTENSOR.h file into Julia wrapper functions. Specifically, I had a runtime issue when trying to convert arrays of CuArray into |
…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor
Codecov Report ❌ — Patch coverage details are reported below. Additional details and impacted files: @@ Coverage Diff @@
## master #3057 +/- ##
===========================================
+ Coverage 12.50% 88.88% +76.38%
===========================================
Files 145 151 +6
Lines 12819 13123 +304
===========================================
+ Hits 1603 11665 +10062
+ Misses 11216 1458 -9758 ☔ View full report in Codecov by Sentry. 🚀 New features to boost your workflow:
|
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: cda4a4e | Previous: f7b7929 | Ratio |
|---|---|---|---|
latency/precompile |
45048875701 ns |
45018748344 ns |
1.00 |
latency/ttfp |
12756533815 ns |
12770284486 ns |
1.00 |
latency/import |
3544150826 ns |
3541917719 ns |
1.00 |
integration/volumerhs |
9433450 ns |
9450947.5 ns |
1.00 |
integration/byval/slices=1 |
146355 ns |
146127 ns |
1.00 |
integration/byval/slices=3 |
423305 ns |
423159 ns |
1.00 |
integration/byval/reference |
143942 ns |
143932 ns |
1.00 |
integration/byval/slices=2 |
284712 ns |
284759.5 ns |
1.00 |
integration/cudadevrt |
102483 ns |
102551 ns |
1.00 |
kernel/indexing |
13304 ns |
13204 ns |
1.01 |
kernel/indexing_checked |
14034 ns |
13977 ns |
1.00 |
kernel/occupancy |
702.8074534161491 ns |
664.05625 ns |
1.06 |
kernel/launch |
2102.2 ns |
2163.9444444444443 ns |
0.97 |
kernel/rand |
14550 ns |
18131 ns |
0.80 |
array/reverse/1d |
18552 ns |
18471 ns |
1.00 |
array/reverse/2dL_inplace |
65960 ns |
65988 ns |
1.00 |
array/reverse/1dL |
69079 ns |
69022 ns |
1.00 |
array/reverse/2d |
20828 ns |
20733 ns |
1.00 |
array/reverse/1d_inplace |
8546.666666666666 ns |
8573 ns |
1.00 |
array/reverse/2d_inplace |
10124 ns |
10232 ns |
0.99 |
array/reverse/2dL |
72879.5 ns |
72825 ns |
1.00 |
array/reverse/1dL_inplace |
65879 ns |
65937 ns |
1.00 |
array/copy |
18782 ns |
18988 ns |
0.99 |
array/iteration/findall/int |
150513 ns |
150059 ns |
1.00 |
array/iteration/findall/bool |
133007 ns |
132365.5 ns |
1.00 |
array/iteration/findfirst/int |
84657 ns |
83639 ns |
1.01 |
array/iteration/findfirst/bool |
82233 ns |
81468 ns |
1.01 |
array/iteration/scalar |
69788 ns |
66443.5 ns |
1.05 |
array/iteration/logical |
202573.5 ns |
200236 ns |
1.01 |
array/iteration/findmin/1d |
89225 ns |
86614.5 ns |
1.03 |
array/iteration/findmin/2d |
117528.5 ns |
117241 ns |
1.00 |
array/reductions/reduce/Int64/1d |
43691 ns |
42766 ns |
1.02 |
array/reductions/reduce/Int64/dims=1 |
52975 ns |
52907 ns |
1.00 |
array/reductions/reduce/Int64/dims=2 |
60037 ns |
60231 ns |
1.00 |
array/reductions/reduce/Int64/dims=1L |
87823 ns |
87828 ns |
1.00 |
array/reductions/reduce/Int64/dims=2L |
85292 ns |
84956.5 ns |
1.00 |
array/reductions/reduce/Float32/1d |
35655.5 ns |
34964 ns |
1.02 |
array/reductions/reduce/Float32/dims=1 |
49676.5 ns |
40442.5 ns |
1.23 |
array/reductions/reduce/Float32/dims=2 |
57337 ns |
57125 ns |
1.00 |
array/reductions/reduce/Float32/dims=1L |
52144 ns |
52000 ns |
1.00 |
array/reductions/reduce/Float32/dims=2L |
70377 ns |
69982.5 ns |
1.01 |
array/reductions/mapreduce/Int64/1d |
43606 ns |
42509 ns |
1.03 |
array/reductions/mapreduce/Int64/dims=1 |
42928.5 ns |
42334 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=2 |
59796 ns |
59835 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=1L |
87781 ns |
87864 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2L |
85436 ns |
85164 ns |
1.00 |
array/reductions/mapreduce/Float32/1d |
35514 ns |
34719 ns |
1.02 |
array/reductions/mapreduce/Float32/dims=1 |
40267 ns |
45273 ns |
0.89 |
array/reductions/mapreduce/Float32/dims=2 |
57138 ns |
56959 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=1L |
51919 ns |
52179 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=2L |
69693 ns |
69729 ns |
1.00 |
array/broadcast |
20712 ns |
20464 ns |
1.01 |
array/copyto!/gpu_to_gpu |
11493 ns |
11261 ns |
1.02 |
array/copyto!/cpu_to_gpu |
216025 ns |
216266 ns |
1.00 |
array/copyto!/gpu_to_cpu |
283876 ns |
282685.5 ns |
1.00 |
array/accumulate/Int64/1d |
119314 ns |
119363 ns |
1.00 |
array/accumulate/Int64/dims=1 |
80714 ns |
80474 ns |
1.00 |
array/accumulate/Int64/dims=2 |
156644 ns |
157437.5 ns |
0.99 |
array/accumulate/Int64/dims=1L |
1706170 ns |
1706725 ns |
1.00 |
array/accumulate/Int64/dims=2L |
962126 ns |
962008 ns |
1.00 |
array/accumulate/Float32/1d |
101792 ns |
101483 ns |
1.00 |
array/accumulate/Float32/dims=1 |
77153 ns |
77247 ns |
1.00 |
array/accumulate/Float32/dims=2 |
144309 ns |
143932 ns |
1.00 |
array/accumulate/Float32/dims=1L |
1585720 ns |
1593993 ns |
0.99 |
array/accumulate/Float32/dims=2L |
658004 ns |
660832 ns |
1.00 |
array/construct |
1337.5 ns |
1332.6 ns |
1.00 |
array/random/randn/Float32 |
38584 ns |
38567.5 ns |
1.00 |
array/random/randn!/Float32 |
31635 ns |
31716 ns |
1.00 |
array/random/rand!/Int64 |
34319 ns |
34263.5 ns |
1.00 |
array/random/rand!/Float32 |
8566.5 ns |
8628 ns |
0.99 |
array/random/rand/Int64 |
37365 ns |
30788.5 ns |
1.21 |
array/random/rand/Float32 |
13203 ns |
13144 ns |
1.00 |
array/permutedims/4d |
52765 ns |
52096 ns |
1.01 |
array/permutedims/2d |
52682 ns |
52583 ns |
1.00 |
array/permutedims/3d |
53575 ns |
53461 ns |
1.00 |
array/sorting/1d |
2735420 ns |
2734388 ns |
1.00 |
array/sorting/by |
3304538.5 ns |
3327876 ns |
0.99 |
array/sorting/2d |
1068815 ns |
1072450 ns |
1.00 |
cuda/synchronization/stream/auto |
1031.4 ns |
1031.7 ns |
1.00 |
cuda/synchronization/stream/nonblocking |
7637 ns |
7628.4 ns |
1.00 |
cuda/synchronization/stream/blocking |
856.8205128205128 ns |
827.9 ns |
1.03 |
cuda/synchronization/context/auto |
1177.8 ns |
1165.1 ns |
1.01 |
cuda/synchronization/context/nonblocking |
6990.4 ns |
7638.9 ns |
0.92 |
cuda/synchronization/context/blocking |
934.3947368421053 ns |
925.0566037735849 ns |
1.01 |
This comment was automatically generated by workflow using github-action-benchmark.
|
Thanks very much for putting this together, I'm happy to help with the header issues if needed! |
…but the C++ code is still in flux)
|
@kshyatt I removed the extra code, made the functions that link to the library relatively agnostic (i.e. you are not forced to use CuTensorBS, but can opt in if you'd like), and added a unit test. If you could help with the Clang.jl issue, that would be amazing! |
Hi,
This is a wrapper type, together with supporting functions, for accessing the newly introduced block-sparse cuTENSOR backend. Right now the code is expert-level: users need to write a type that converts their objects into CuTensorBS types, or otherwise implement the low-level operations required by the cuTENSOR kernels. I am still writing a test, but the code is fully operational.
Thanks,
Karl