Skip to content

Wrapper for Blocksparse CuTensor code#3057

Open
kmp5VT wants to merge 12 commits into JuliaGPU:master from
kmp5VT:kmp5/feature/wrap_blocksparse_cutensor
Open

Wrapper for Blocksparse CuTensor code#3057
kmp5VT wants to merge 12 commits into JuliaGPU:master from
kmp5VT:kmp5/feature/wrap_blocksparse_cutensor

Conversation

@kmp5VT
Copy link
Contributor

@kmp5VT kmp5VT commented Mar 16, 2026

Hi,

This is a wrapper type and functions to access the newly introduced blocksparse cutensor backend. Right now the code is expert level, i.e. users need to write a type that converts their object to CuTensorBS types or can achieve the low-level operations required by cutensor kernels. I am still writing a test but the code is fully operational.

Thanks,
Karl

@github-actions
Copy link
Contributor

github-actions bot commented Mar 16, 2026

Your PR requires formatting changes to meet the project's style guidelines.
Please consider running Runic (git runic master) to apply these changes.

Click here to view the suggested changes.
diff --git a/lib/cutensor/src/blocksparse/interfaces.jl b/lib/cutensor/src/blocksparse/interfaces.jl
index c6eef0e5b..0a479ddf8 100644
--- a/lib/cutensor/src/blocksparse/interfaces.jl
+++ b/lib/cutensor/src/blocksparse/interfaces.jl
@@ -1,4 +1,4 @@
-## For now call contract in ITensor and rely on UnallocatedArrays to make 
+## For now call contract in ITensor and rely on UnallocatedArrays to make
 ## C in a dry-run of the contraction.
 # function Base.:(*)(A::CuTensorBS, B::CuTensorBs)
 #     tC = promote_type(eltype(A), eltype(B))
@@ -18,11 +18,13 @@
 using LinearAlgebra
 
 function LinearAlgebra.mul!(C::CuTensorBS, A::CuTensorBS, B::CuTensorBS, α::Number, β::Number)
-   contract!(α, 
-            A, A.inds, CUTENSOR_OP_IDENTITY,
-            B, B.inds, CUTENSOR_OP_IDENTITY, 
-            β,
-            C, C.inds, CUTENSOR_OP_IDENTITY, 
-            CUTENSOR_OP_IDENTITY; jit=CUTENSOR_JIT_MODE_DEFAULT)
-   return C
-end
\ No newline at end of file
+    contract!(
+        α,
+        A, A.inds, CUTENSOR_OP_IDENTITY,
+        B, B.inds, CUTENSOR_OP_IDENTITY,
+        β,
+        C, C.inds, CUTENSOR_OP_IDENTITY,
+        CUTENSOR_OP_IDENTITY; jit = CUTENSOR_JIT_MODE_DEFAULT
+    )
+    return C
+end
diff --git a/lib/cutensor/src/blocksparse/operations.jl b/lib/cutensor/src/blocksparse/operations.jl
index 19542e5de..0f98c92ef 100644
--- a/lib/cutensor/src/blocksparse/operations.jl
+++ b/lib/cutensor/src/blocksparse/operations.jl
@@ -9,23 +9,26 @@ function contract!(
         @nospecialize(beta::Number),
         @nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
         opOut::cutensorOperator_t;
-        jit::cutensorJitMode_t=JIT_MODE_NONE,
-        workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
-        algo::cutensorAlgo_t=ALGO_DEFAULT,
-        compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing,
-        plan::Union{CuTensorPlan, Nothing}=nothing)
+        jit::cutensorJitMode_t = JIT_MODE_NONE,
+        workspace::cutensorWorksizePreference_t = WORKSPACE_DEFAULT,
+        algo::cutensorAlgo_t = ALGO_DEFAULT,
+        compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing} = nothing,
+        plan::Union{CuTensorPlan, Nothing} = nothing
+    )
 
     actual_plan = if plan === nothing
-        plan_contraction(A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut;
-                         jit, workspace, algo, compute_type)
+        plan_contraction(
+            A, Ainds, opA, B, Binds, opB, C, Cinds, opC, opOut;
+            jit, workspace, algo, compute_type
+        )
     else
         plan
     end
 
     contractBS!(actual_plan, alpha, nonzero_blocks(A), nonzero_blocks(B), beta, nonzero_blocks(C))
-    
+
     if plan === nothing
-    CUDA.unsafe_free!(actual_plan)
+        CUDA.unsafe_free!(actual_plan)
     end
 
     return C
@@ -33,12 +36,14 @@ end
 
 ## This function assumes A, B, and C are Arrays of pointers to CuArrays.
 ## Please overwrite the `nonzero_blocks` function for your datatype to access this function from contract!
-function contractBS!(plan::CuTensorPlan,
-                   @nospecialize(alpha::Number),
-                   @nospecialize(A::AbstractArray),
-                   @nospecialize(B::AbstractArray),
-                   @nospecialize(beta::Number),
-                   @nospecialize(C::AbstractArray))
+function contractBS!(
+        plan::CuTensorPlan,
+        @nospecialize(alpha::Number),
+        @nospecialize(A::AbstractArray),
+        @nospecialize(B::AbstractArray),
+        @nospecialize(beta::Number),
+        @nospecialize(C::AbstractArray)
+    )
     scalar_type = plan.scalar_type
 
     # Extract GPU pointers from each CuArray block
@@ -46,11 +51,13 @@ function contractBS!(plan::CuTensorPlan,
     A_ptrs = CuPtr{Cvoid}[pointer(block) for block in A]
     B_ptrs = CuPtr{Cvoid}[pointer(block) for block in B]
     C_ptrs = CuPtr{Cvoid}[pointer(block) for block in C]
-    
-    cutensorBlockSparseContract(handle(), plan, 
-                                            Ref{scalar_type}(alpha), A_ptrs, B_ptrs, 
-                                            Ref{scalar_type}(beta),  C_ptrs, C_ptrs, 
-                                            plan.workspace, sizeof(plan.workspace), stream())
+
+    cutensorBlockSparseContract(
+        handle(), plan,
+        Ref{scalar_type}(alpha), A_ptrs, B_ptrs,
+        Ref{scalar_type}(beta), C_ptrs, C_ptrs,
+        plan.workspace, sizeof(plan.workspace), stream()
+    )
     synchronize(stream())
     return C
 end
@@ -60,21 +67,22 @@ function plan_contraction(
         @nospecialize(B), Binds::ModeType, opB::cutensorOperator_t,
         @nospecialize(C), Cinds::ModeType, opC::cutensorOperator_t,
         opOut::cutensorOperator_t;
-        jit::cutensorJitMode_t=JIT_MODE_NONE,
-        workspace::cutensorWorksizePreference_t=WORKSPACE_DEFAULT,
-        algo::cutensorAlgo_t=ALGO_DEFAULT,
-        compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing}=nothing)
+        jit::cutensorJitMode_t = JIT_MODE_NONE,
+        workspace::cutensorWorksizePreference_t = WORKSPACE_DEFAULT,
+        algo::cutensorAlgo_t = ALGO_DEFAULT,
+        compute_type::Union{DataType, cutensorComputeDescriptorEnum, Nothing} = nothing
+    )
 
     !is_unary(opA)    && throw(ArgumentError("opA must be a unary op!"))
     !is_unary(opB)    && throw(ArgumentError("opB must be a unary op!"))
     !is_unary(opC)    && throw(ArgumentError("opC must be a unary op!"))
     !is_unary(opOut)  && throw(ArgumentError("opOut must be a unary op!"))
-    
+
     descA = CuTensorBSDescriptor(A)
     descB = CuTensorBSDescriptor(B)
     descC = CuTensorBSDescriptor(C)
     # for now, D must be identical to C (and thus, descD must be identical to descC)
-    
+
     modeA = collect(Cint, Ainds)
     modeB = collect(Cint, Binds)
     modeC = collect(Cint, Cinds)
@@ -87,17 +95,19 @@ function plan_contraction(
 
 
     desc = Ref{cutensorOperationDescriptor_t}()
-    cutensorCreateBlockSparseContraction(handle(),
-    desc, 
-    descA, modeA, opA,
-    descB, modeB, opB,
-    descC, modeC, opC,
-    descC, modeC, actual_compute_type)
+    cutensorCreateBlockSparseContraction(
+        handle(),
+        desc,
+        descA, modeA, opA,
+        descB, modeB, opB,
+        descC, modeC, opC,
+        descC, modeC, actual_compute_type
+    )
 
     plan_pref = Ref{cutensorPlanPreference_t}()
     cutensorCreatePlanPreference(handle(), plan_pref, algo, jit)
 
-    plan = CuTensorPlan(desc[], plan_pref[]; workspacePref=workspace)
+    plan = CuTensorPlan(desc[], plan_pref[]; workspacePref = workspace)
     # cutensorDestroyOperationDescriptor(desc[])
     cutensorDestroyPlanPreference(plan_pref[])
     return plan
diff --git a/lib/cutensor/src/blocksparse/types.jl b/lib/cutensor/src/blocksparse/types.jl
index 292dc4d00..41cbebdbd 100644
--- a/lib/cutensor/src/blocksparse/types.jl
+++ b/lib/cutensor/src/blocksparse/types.jl
@@ -12,20 +12,26 @@ mutable struct CuTensorBS{T, N}
     ## This expects a Vector{Tuple(Int)} right now
     nonzero_block_coords
 
-    function CuTensorBS{T, N}(nonzero_data::Vector{<:CuArray}, 
-        blocks_per_mode::Vector{Int}, block_extents, nonzero_block_coords, inds::Vector) where {T<:Number, N}
+    function CuTensorBS{T, N}(
+            nonzero_data::Vector{<:CuArray},
+            blocks_per_mode::Vector{Int}, block_extents, nonzero_block_coords, inds::Vector
+        ) where {T <: Number, N}
         CuArrayT = eltype(nonzero_data)
         @assert eltype(CuArrayT) == T
         # @assert ndims(CuArrayT) == N
         @assert length(block_extents) == N
-        new(nonzero_data, inds, blocks_per_mode, block_extents, nonzero_block_coords)
+        return new(nonzero_data, inds, blocks_per_mode, block_extents, nonzero_block_coords)
     end
 end
 
-function CuTensorBS(nonzero_data::Vector{<:CuArray{T}}, 
-    blocks_per_mode, block_extents, nonzero_block_coords, inds::Vector) where {T<:Number}
-    CuTensorBS{T,length(block_extents)}(nonzero_data, 
-    blocks_per_mode, block_extents, nonzero_block_coords, inds)
+function CuTensorBS(
+        nonzero_data::Vector{<:CuArray{T}},
+        blocks_per_mode, block_extents, nonzero_block_coords, inds::Vector
+    ) where {T <: Number}
+    return CuTensorBS{T, length(block_extents)}(
+        nonzero_data,
+        blocks_per_mode, block_extents, nonzero_block_coords, inds
+    )
 end
 # array interface
 function Base.size(T::CuTensorBS)
@@ -39,8 +45,8 @@ Base.strides(T::CuTensorBS) = vcat([[st...] for st in strides.(T.nonzero_data)].
 Base.eltype(T::CuTensorBS) = eltype(eltype(T.nonzero_data))
 
 function block_extents(T::CuTensorBS)
-    extents = Vector{Int64}() 
-    
+    extents = Vector{Int64}()
+
     for ex in T.block_extents
         extents = vcat(extents, ex...)
     end
@@ -66,18 +72,21 @@ mutable struct CuTensorBSDescriptor
     handle::cutensorBlockSparseTensorDescriptor_t
     # inner constructor handles creation and finalizer of the descriptor
     function CuTensorBSDescriptor(
-        numModes,
-        numNonZeroBlocks,
-        numSectionsPerMode,
-        extent,
-        nonZeroCoordinates,
-        stride,
-        eltype)
+            numModes,
+            numNonZeroBlocks,
+            numSectionsPerMode,
+            extent,
+            nonZeroCoordinates,
+            stride,
+            eltype
+        )
 
         desc = Ref{cuTENSOR.cutensorBlockSparseTensorDescriptor_t}()
-        cutensorCreateBlockSparseTensorDescriptor(handle(), desc, 
-        numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates,
-        stride, eltype)
+        cutensorCreateBlockSparseTensorDescriptor(
+            handle(), desc,
+            numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates,
+            stride, eltype
+        )
 
         obj = new(desc[])
         finalizer(unsafe_destroy!, obj)
@@ -86,12 +95,13 @@ mutable struct CuTensorBSDescriptor
 end
 
 function CuTensorBSDescriptor(
-    numModes,
-    numNonZeroBlocks,
-    numSectionsPerMode,
-    extent,
-    nonZeroCoordinates,
-    eltype)
+        numModes,
+        numNonZeroBlocks,
+        numSectionsPerMode,
+        extent,
+        nonZeroCoordinates,
+        eltype
+    )
 
     return CuTensorBSDescriptor(numModes, numNonZeroBlocks, numSectionsPerMode, extent, nonZeroCoordinates, C_NULL, eltype)
 end
@@ -101,7 +111,7 @@ Base.show(io::IO, desc::CuTensorBSDescriptor) = @printf(io, "CuTensorBSDescripto
 Base.unsafe_convert(::Type{cutensorBlockSparseTensorDescriptor_t}, obj::CuTensorBSDescriptor) = obj.handle
 
 function unsafe_destroy!(obj::CuTensorBSDescriptor)
-    cutensorDestroyBlockSparseTensorDescriptor(obj)
+    return cutensorDestroyBlockSparseTensorDescriptor(obj)
 end
 
 ## Descriptor function for CuTensorBS type. Please overwrite for custom objects
@@ -110,11 +120,13 @@ function CuTensorBSDescriptor(A::CuTensorBS)
     numNonZeroBlocks = Int64(length(A.nonzero_block_coords))
     numSectionsPerMode = collect(Int32, A.blocks_per_mode)
     extent = block_extents(A)
-    nonZeroCoordinates =  Int32.(vcat([[x...] for x in A.nonzero_block_coords]...) .- 1)
+    nonZeroCoordinates = Int32.(vcat([[x...] for x in A.nonzero_block_coords]...) .- 1)
     st = strides(A)
-    dataType = eltype(A)#convert(cuTENSOR.cutensorDataType_t, eltype(A))
+    dataType = eltype(A) #convert(cuTENSOR.cutensorDataType_t, eltype(A))
 
     ## Right now assume stride is NULL. I am not sure if stride works, need to discuss with cuTENSOR team.
-    CuTensorBSDescriptor(numModes, numNonZeroBlocks, 
-    numSectionsPerMode, extent, nonZeroCoordinates, dataType)
+    return CuTensorBSDescriptor(
+        numModes, numNonZeroBlocks,
+        numSectionsPerMode, extent, nonZeroCoordinates, dataType
+    )
 end
diff --git a/lib/cutensor/src/libcutensor.jl b/lib/cutensor/src/libcutensor.jl
index a73ab654a..c83a6728f 100644
--- a/lib/cutensor/src/libcutensor.jl
+++ b/lib/cutensor/src/libcutensor.jl
@@ -545,12 +545,12 @@ end
     @gcsafe_ccall libcutensor.cutensorBlockSparseContract(handle::cutensorHandle_t,
                                                           plan::cutensorPlan_t,
                                                           alpha::Ptr{Cvoid},
-                                                          A::Ptr{Cvoid},
-                                                          B::Ptr{Cvoid},
+        A::Ptr{Cvoid},
+        B::Ptr{Cvoid},
                                                           beta::Ptr{Cvoid},
-                                                          C::Ptr{Cvoid},
-                                                          D::Ptr{Cvoid},
-                                                          workspace::CuPtr{Cvoid},
+        C::Ptr{Cvoid},
+        D::Ptr{Cvoid},
+        workspace::CuPtr{Cvoid},
                                                           workspaceSize::UInt64,
                                                           stream::cudaStream_t)::cutensorStatus_t
 end
diff --git a/lib/cutensor/test/contractions.jl b/lib/cutensor/test/contractions.jl
index 636600a74..baf56949a 100644
--- a/lib/cutensor/test/contractions.jl
+++ b/lib/cutensor/test/contractions.jl
@@ -188,62 +188,73 @@ end
     end
 end
 
-eltypes_compact = [
-    (Float32, Float32, Float32, Float32),
-    (ComplexF32, ComplexF32, ComplexF32, Float32),
-     (Float64, Float64, Float64, Float64),
-     (ComplexF64, ComplexF64, ComplexF64, Float64)
-]
-@testset "Blocksparse Contraction" begin
-    ## There are many unsupported types because this is a new functionality
-    ## So I will test with Float32 and ComplexF32 only
-    @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes_compact
-        ## i = [20,20,25]
-        ## k = [10,10,15]
-        ## l = [30,30,35]
-        ## A = Tensor(k,i,l)
-        ## Nonzero blocks are 
-        ## [1,1,1], [1,1,3], [1,3,1], [1,3,3], [3,1,1], [3,1,3], [3,3,1], [3,3,3]
-        A = Vector{CuArray{eltyA, 3}}()
-        for k in [10,15]
-            for i in [20,25]
-                for l in [30,35]
-                    push!(A, CuArray(ones(eltyA, k,i,l)))
+    eltypes_compact = [
+        (Float32, Float32, Float32, Float32),
+        (ComplexF32, ComplexF32, ComplexF32, Float32),
+        (Float64, Float64, Float64, Float64),
+        (ComplexF64, ComplexF64, ComplexF64, Float64),
+    ]
+    @testset "Blocksparse Contraction" begin
+        ## There are many unsupported types because this is a new functionality
+        ## So I will test with Float32 and ComplexF32 only
+        @testset for (eltyA, eltyB, eltyC, eltyCompute) in eltypes_compact
+            ## i = [20,20,25]
+            ## k = [10,10,15]
+            ## l = [30,30,35]
+            ## A = Tensor(k,i,l)
+            ## Nonzero blocks are
+            ## [1,1,1], [1,1,3], [1,3,1], [1,3,3], [3,1,1], [3,1,3], [3,3,1], [3,3,3]
+            A = Vector{CuArray{eltyA, 3}}()
+            for k in [10, 15]
+                for i in [20, 25]
+                    for l in [30, 35]
+                        push!(A, CuArray(ones(eltyA, k, i, l)))
+                    end
                 end
             end
-        end
 
-        ## B = Tensor(k,l)
-        ## Nonzero blocks are
-        ## [1,1], [2,3]
-        B = Array{CuArray{eltyB, 2}}(
-            [CuArray(randn(eltyB, 10, 30)),
-            CuArray(randn(eltyB, 10, 35))])
-
-        ## C = Tensor(i)
-        ## Nonzero blocks are 
-        ## [1,], [3,]
-        C = Vector{CuArray{eltyC, 1}}(
-            [CuArray(zeros(eltyC, 20)),
-            CuArray(zeros(eltyC, 25))]
-        )
-        
-        cuTenA = cuTENSOR.CuTensorBS(A, [3,3,3], 
-        [(10,10,15), (20,20,25),  (30,30,35)], 
-        [(1,1,1), (1,1,3), (1,3,1), (1,3,3), (3,1,1), (3,1,3), (3,3,1), (3,3,3)],
-        [1,3,2])
-        cuTenB = cuTENSOR.CuTensorBS(B, [3,3],
-        [(10,10,15), (30,30,35)],
-        [(1,1),(2,3)], [1,2], )
-        cuTenC = cuTENSOR.CuTensorBS(C, [3],
-        [(20,20,25)],[(1,),(3,)], [3])
-
-        mul!(cuTenC, cuTenA, cuTenB, 1, 0)
-        ## C[1] = A[1,1,1] * B[1,1]
-        @test C[1] ≈ reshape(permutedims(A[1], (2,1,3)), (20, 10 * 30)) * reshape(B[1], (10 * 30))
-        ## C[3] = A[1,3,1] * B[1,1]
-        @test C[2] ≈ reshape(permutedims(A[3], (2,1,3)), (25, 10 * 30)) * reshape(B[1], (10 * 30))
+            ## B = Tensor(k,l)
+            ## Nonzero blocks are
+            ## [1,1], [2,3]
+            B = Array{CuArray{eltyB, 2}}(
+                [
+                    CuArray(randn(eltyB, 10, 30)),
+                    CuArray(randn(eltyB, 10, 35)),
+                ]
+            )
+
+            ## C = Tensor(i)
+            ## Nonzero blocks are
+            ## [1,], [3,]
+            C = Vector{CuArray{eltyC, 1}}(
+                [
+                    CuArray(zeros(eltyC, 20)),
+                    CuArray(zeros(eltyC, 25)),
+                ]
+            )
+
+            cuTenA = cuTENSOR.CuTensorBS(
+                A, [3, 3, 3],
+                [(10, 10, 15), (20, 20, 25), (30, 30, 35)],
+                [(1, 1, 1), (1, 1, 3), (1, 3, 1), (1, 3, 3), (3, 1, 1), (3, 1, 3), (3, 3, 1), (3, 3, 3)],
+                [1, 3, 2]
+            )
+            cuTenB = cuTENSOR.CuTensorBS(
+                B, [3, 3],
+                [(10, 10, 15), (30, 30, 35)],
+                [(1, 1), (2, 3)], [1, 2],
+            )
+            cuTenC = cuTENSOR.CuTensorBS(
+                C, [3],
+                [(20, 20, 25)], [(1,), (3,)], [3]
+            )
+
+            mul!(cuTenC, cuTenA, cuTenB, 1, 0)
+            ## C[1] = A[1,1,1] * B[1,1]
+            @test C[1] ≈ reshape(permutedims(A[1], (2, 1, 3)), (20, 10 * 30)) * reshape(B[1], (10 * 30))
+            ## C[3] = A[1,3,1] * B[1,1]
+            @test C[2] ≈ reshape(permutedims(A[3], (2, 1, 3)), (25, 10 * 30)) * reshape(B[1], (10 * 30))
+        end
     end
-end
 
 end

@kmp5VT
Copy link
Contributor Author

kmp5VT commented Mar 16, 2026

There were some issues in Clang.jl's conversion of the cuTENSOR.h file into Julia wrapper functions. Specifically, I had a runtime issue when trying to convert arrays of CuArrays into Ptr{Ptr{Cvoid}}. I think this is because CUDA.jl does not expect an array of CuArrays, and so the Julia-side unsafe_convert failed. This is not yet ready to merge.

…mp5VT/CUDA.jl into kmp5/feature/wrap_blocksparse_cutensor
@codecov
Copy link

codecov bot commented Mar 17, 2026

Codecov Report

❌ Patch coverage is 0% with 88 lines in your changes missing coverage. Please review.
✅ Project coverage is 88.88%. Comparing base (f809025) to head (67013c8).
⚠️ Report is 1 commit behind head on master.

Files with missing lines Patch % Lines
lib/cutensor/src/blocksparse/types.jl 0.00% 48 Missing ⚠️
lib/cutensor/src/blocksparse/operations.jl 0.00% 37 Missing ⚠️
lib/cutensor/src/blocksparse/interfaces.jl 0.00% 3 Missing ⚠️
Additional details and impacted files
@@             Coverage Diff             @@
##           master    #3057       +/-   ##
===========================================
+ Coverage   12.50%   88.88%   +76.38%     
===========================================
  Files         145      151        +6     
  Lines       12819    13123      +304     
===========================================
+ Hits         1603    11665    +10062     
+ Misses      11216     1458     -9758     

☔ View full report in Codecov by Sentry.
📢 Have feedback on the report? Share it here.

🚀 New features to boost your workflow:
  • ❄️ Test Analytics: Detect flaky tests, report on failures, and find test suite problems.

Copy link
Contributor

@github-actions github-actions bot left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CUDA.jl Benchmarks

Details
Benchmark suite Current: cda4a4e Previous: f7b7929 Ratio
latency/precompile 45048875701 ns 45018748344 ns 1.00
latency/ttfp 12756533815 ns 12770284486 ns 1.00
latency/import 3544150826 ns 3541917719 ns 1.00
integration/volumerhs 9433450 ns 9450947.5 ns 1.00
integration/byval/slices=1 146355 ns 146127 ns 1.00
integration/byval/slices=3 423305 ns 423159 ns 1.00
integration/byval/reference 143942 ns 143932 ns 1.00
integration/byval/slices=2 284712 ns 284759.5 ns 1.00
integration/cudadevrt 102483 ns 102551 ns 1.00
kernel/indexing 13304 ns 13204 ns 1.01
kernel/indexing_checked 14034 ns 13977 ns 1.00
kernel/occupancy 702.8074534161491 ns 664.05625 ns 1.06
kernel/launch 2102.2 ns 2163.9444444444443 ns 0.97
kernel/rand 14550 ns 18131 ns 0.80
array/reverse/1d 18552 ns 18471 ns 1.00
array/reverse/2dL_inplace 65960 ns 65988 ns 1.00
array/reverse/1dL 69079 ns 69022 ns 1.00
array/reverse/2d 20828 ns 20733 ns 1.00
array/reverse/1d_inplace 8546.666666666666 ns 8573 ns 1.00
array/reverse/2d_inplace 10124 ns 10232 ns 0.99
array/reverse/2dL 72879.5 ns 72825 ns 1.00
array/reverse/1dL_inplace 65879 ns 65937 ns 1.00
array/copy 18782 ns 18988 ns 0.99
array/iteration/findall/int 150513 ns 150059 ns 1.00
array/iteration/findall/bool 133007 ns 132365.5 ns 1.00
array/iteration/findfirst/int 84657 ns 83639 ns 1.01
array/iteration/findfirst/bool 82233 ns 81468 ns 1.01
array/iteration/scalar 69788 ns 66443.5 ns 1.05
array/iteration/logical 202573.5 ns 200236 ns 1.01
array/iteration/findmin/1d 89225 ns 86614.5 ns 1.03
array/iteration/findmin/2d 117528.5 ns 117241 ns 1.00
array/reductions/reduce/Int64/1d 43691 ns 42766 ns 1.02
array/reductions/reduce/Int64/dims=1 52975 ns 52907 ns 1.00
array/reductions/reduce/Int64/dims=2 60037 ns 60231 ns 1.00
array/reductions/reduce/Int64/dims=1L 87823 ns 87828 ns 1.00
array/reductions/reduce/Int64/dims=2L 85292 ns 84956.5 ns 1.00
array/reductions/reduce/Float32/1d 35655.5 ns 34964 ns 1.02
array/reductions/reduce/Float32/dims=1 49676.5 ns 40442.5 ns 1.23
array/reductions/reduce/Float32/dims=2 57337 ns 57125 ns 1.00
array/reductions/reduce/Float32/dims=1L 52144 ns 52000 ns 1.00
array/reductions/reduce/Float32/dims=2L 70377 ns 69982.5 ns 1.01
array/reductions/mapreduce/Int64/1d 43606 ns 42509 ns 1.03
array/reductions/mapreduce/Int64/dims=1 42928.5 ns 42334 ns 1.01
array/reductions/mapreduce/Int64/dims=2 59796 ns 59835 ns 1.00
array/reductions/mapreduce/Int64/dims=1L 87781 ns 87864 ns 1.00
array/reductions/mapreduce/Int64/dims=2L 85436 ns 85164 ns 1.00
array/reductions/mapreduce/Float32/1d 35514 ns 34719 ns 1.02
array/reductions/mapreduce/Float32/dims=1 40267 ns 45273 ns 0.89
array/reductions/mapreduce/Float32/dims=2 57138 ns 56959 ns 1.00
array/reductions/mapreduce/Float32/dims=1L 51919 ns 52179 ns 1.00
array/reductions/mapreduce/Float32/dims=2L 69693 ns 69729 ns 1.00
array/broadcast 20712 ns 20464 ns 1.01
array/copyto!/gpu_to_gpu 11493 ns 11261 ns 1.02
array/copyto!/cpu_to_gpu 216025 ns 216266 ns 1.00
array/copyto!/gpu_to_cpu 283876 ns 282685.5 ns 1.00
array/accumulate/Int64/1d 119314 ns 119363 ns 1.00
array/accumulate/Int64/dims=1 80714 ns 80474 ns 1.00
array/accumulate/Int64/dims=2 156644 ns 157437.5 ns 0.99
array/accumulate/Int64/dims=1L 1706170 ns 1706725 ns 1.00
array/accumulate/Int64/dims=2L 962126 ns 962008 ns 1.00
array/accumulate/Float32/1d 101792 ns 101483 ns 1.00
array/accumulate/Float32/dims=1 77153 ns 77247 ns 1.00
array/accumulate/Float32/dims=2 144309 ns 143932 ns 1.00
array/accumulate/Float32/dims=1L 1585720 ns 1593993 ns 0.99
array/accumulate/Float32/dims=2L 658004 ns 660832 ns 1.00
array/construct 1337.5 ns 1332.6 ns 1.00
array/random/randn/Float32 38584 ns 38567.5 ns 1.00
array/random/randn!/Float32 31635 ns 31716 ns 1.00
array/random/rand!/Int64 34319 ns 34263.5 ns 1.00
array/random/rand!/Float32 8566.5 ns 8628 ns 0.99
array/random/rand/Int64 37365 ns 30788.5 ns 1.21
array/random/rand/Float32 13203 ns 13144 ns 1.00
array/permutedims/4d 52765 ns 52096 ns 1.01
array/permutedims/2d 52682 ns 52583 ns 1.00
array/permutedims/3d 53575 ns 53461 ns 1.00
array/sorting/1d 2735420 ns 2734388 ns 1.00
array/sorting/by 3304538.5 ns 3327876 ns 0.99
array/sorting/2d 1068815 ns 1072450 ns 1.00
cuda/synchronization/stream/auto 1031.4 ns 1031.7 ns 1.00
cuda/synchronization/stream/nonblocking 7637 ns 7628.4 ns 1.00
cuda/synchronization/stream/blocking 856.8205128205128 ns 827.9 ns 1.03
cuda/synchronization/context/auto 1177.8 ns 1165.1 ns 1.01
cuda/synchronization/context/nonblocking 6990.4 ns 7638.9 ns 0.92
cuda/synchronization/context/blocking 934.3947368421053 ns 925.0566037735849 ns 1.01

This comment was automatically generated by workflow using github-action-benchmark.

@kshyatt kshyatt self-requested a review March 17, 2026 10:52
@kshyatt
Copy link
Member

kshyatt commented Mar 17, 2026

Thanks very much for putting this together, I'm happy to help with the header issues if needed!

@kmp5VT
Copy link
Contributor Author

kmp5VT commented Mar 19, 2026

@kshyatt I removed the extra code, made the functions that linked to the library relatively agnostic (i.e. you are not forced to use CuTensorBS but can buy in if you'd like) and added a unit test. If you could help with the Clang.jl issue, that would be amazing!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

2 participants