Conversation
…xpensive external and internal batching.
|
Your PR requires formatting changes to meet the project's style guidelines. Click here to view the suggested changes.diff --git a/lib/cufft/fft.jl b/lib/cufft/fft.jl
index 773387f51..f6e390c6c 100644
--- a/lib/cufft/fft.jl
+++ b/lib/cufft/fft.jl
@@ -3,8 +3,8 @@
@reexport using AbstractFFTs
import AbstractFFTs: plan_fft, plan_fft!, plan_bfft, plan_bfft!, plan_ifft,
- plan_rfft, plan_brfft, plan_inv, normalization, fft, bfft, ifft, rfft, irfft,
- Plan, ScaledPlan
+ plan_rfft, plan_brfft, plan_inv, normalization, fft, bfft, ifft, rfft, irfft,
+ Plan, ScaledPlan
using LinearAlgebra
@@ -26,65 +26,69 @@ Base.:(*)(p::ScaledPlan, x::DenseCuArray) = rmul!(p.p * x, p.scale)
# N is the number of dimensions
mutable struct CuFFTPlan{T <: cufftNumber, S <: cufftNumber, K, inplace, N, R, B} <: Plan{S}
- # handle to Cuda low level plan. Note that this plan sometimes has lower dimensions
- # to handle more transform cases such as individual directions
- handle::cufftHandle
- ctx::CuContext
- stream::CuStream
- input_size::NTuple{N, Int} # Julia size of input array
- output_size::NTuple{N, Int} # Julia size of output array
- region::NTuple{R, Int}
- buffer::B # buffer for out-of-place complex-to-real FFT, or `nothing` if not needed
- pinv::ScaledPlan{T} # required by AbstractFFTs API, will be defined by AbstractFFTs if needed
-
- function CuFFTPlan{T, S, K, inplace, N, R, B}(handle::cufftHandle,
- input_size::NTuple{N, Int}, output_size::NTuple{N, Int},
- region::NTuple{R, Int}, buffer::B,
- ) where {T <: cufftNumber, S <: cufftNumber, K, inplace, N, R, B}
- abs(K) == 1 || throw(ArgumentError("FFT direction must be either -1 (forward) or +1 (inverse)"))
- inplace isa Bool || throw(ArgumentError("FFT inplace argument must be a Bool"))
- p = new{T, S, K, inplace, N, R, B}(handle, context(), stream(), input_size, output_size, region, buffer)
- finalizer(unsafe_free!, p)
- p
- end
+ # handle to Cuda low level plan. Note that this plan sometimes has lower dimensions
+ # to handle more transform cases such as individual directions
+ handle::cufftHandle
+ ctx::CuContext
+ stream::CuStream
+ input_size::NTuple{N, Int} # Julia size of input array
+ output_size::NTuple{N, Int} # Julia size of output array
+ region::NTuple{R, Int}
+ buffer::B # buffer for out-of-place complex-to-real FFT, or `nothing` if not needed
+ pinv::ScaledPlan{T} # required by AbstractFFTs API, will be defined by AbstractFFTs if needed
+
+ function CuFFTPlan{T, S, K, inplace, N, R, B}(
+ handle::cufftHandle,
+ input_size::NTuple{N, Int}, output_size::NTuple{N, Int},
+ region::NTuple{R, Int}, buffer::B,
+ ) where {T <: cufftNumber, S <: cufftNumber, K, inplace, N, R, B}
+ abs(K) == 1 || throw(ArgumentError("FFT direction must be either -1 (forward) or +1 (inverse)"))
+ inplace isa Bool || throw(ArgumentError("FFT inplace argument must be a Bool"))
+ p = new{T, S, K, inplace, N, R, B}(handle, context(), stream(), input_size, output_size, region, buffer)
+ finalizer(unsafe_free!, p)
+ return p
+ end
end
-function CuFFTPlan{T, S, K, inplace, N, R, B}(handle::cufftHandle, X::DenseCuArray{S, N},
- sizey::NTuple{N, Int}, region::NTuple{R, Int}, buffer::B,
-) where {T <: cufftNumber, S <: cufftNumber, K, inplace, N, R, B}
- CuFFTPlan{T, S, K, inplace, N, R, B}(handle, size(X), sizey, region, buffer)
+function CuFFTPlan{T, S, K, inplace, N, R, B}(
+ handle::cufftHandle, X::DenseCuArray{S, N},
+ sizey::NTuple{N, Int}, region::NTuple{R, Int}, buffer::B,
+ ) where {T <: cufftNumber, S <: cufftNumber, K, inplace, N, R, B}
+ return CuFFTPlan{T, S, K, inplace, N, R, B}(handle, size(X), sizey, region, buffer)
end
function CUDA.unsafe_free!(plan::CuFFTPlan)
- if plan.handle != C_NULL
- context!(plan.ctx; skip_destroyed = true) do
- cufftReleasePlan(plan.handle)
- end
- plan.handle = C_NULL
- end
- if !isnothing(plan.buffer)
- CUDA.unsafe_free!(plan.buffer)
- end
+ if plan.handle != C_NULL
+ context!(plan.ctx; skip_destroyed = true) do
+ cufftReleasePlan(plan.handle)
+ end
+ plan.handle = C_NULL
+ end
+ return if !isnothing(plan.buffer)
+ CUDA.unsafe_free!(plan.buffer)
+ end
end
function showfftdims(io, sz, T)
- if isempty(sz)
- print(io, "0-dimensional")
- elseif length(sz) == 1
- print(io, sz[1], "-element")
- else
- print(io, join(sz, "×"))
- end
- print(io, " CuArray of ", T)
+ if isempty(sz)
+ print(io, "0-dimensional")
+ elseif length(sz) == 1
+ print(io, sz[1], "-element")
+ else
+ print(io, join(sz, "×"))
+ end
+ return print(io, " CuArray of ", T)
end
function Base.show(io::IO, p::CuFFTPlan{T, S, K, inplace}) where {T, S, K, inplace}
- print(io, "CUFFT ",
- inplace ? "in-place " : "",
- S == T ? "$T " : "$(S)-to-$(T) ",
- K == CUFFT_FORWARD ? "forward " : "backward ",
- "plan for ")
- showfftdims(io, p.input_size, S)
+ print(
+ io, "CUFFT ",
+ inplace ? "in-place " : "",
+ S == T ? "$T " : "$(S)-to-$(T) ",
+ K == CUFFT_FORWARD ? "forward " : "backward ",
+ "plan for "
+ )
+ return showfftdims(io, p.input_size, S)
end
output_type(::CuFFTPlan{T, S}) where {T, S} = T
@@ -101,12 +105,12 @@ Base.size(p::CuFFTPlan) = p.input_size
# the one used in the current task. call this function before every API call that performs
# operations on a stream to ensure the plan is using the correct task-local stream.
@inline function update_stream(plan::CuFFTPlan)
- new_stream = stream()
- if plan.stream != new_stream
- plan.stream = new_stream
- cufftSetStream(plan, new_stream)
- end
- return
+ new_stream = stream()
+ if plan.stream != new_stream
+ plan.stream = new_stream
+ cufftSetStream(plan, new_stream)
+ end
+ return
end
@@ -115,19 +119,19 @@ end
# promote to a complex floating-point type (out-of-place only),
# so implementations only need Complex{Float} methods
for f in (:fft, :bfft, :ifft)
- pf = Symbol("plan_", f)
- @eval begin
- $f(x::DenseCuArray{<:Real}, region = 1:ndims(x)) = $f(complexfloat(x), region)
- $pf(x::DenseCuArray{<:Real}, region) = $pf(complexfloat(x), region)
- $f(x::DenseCuArray{<:Complex{<:Union{Integer, Rational}}}, region = 1:ndims(x)) = $f(complexfloat(x), region)
- $pf(x::DenseCuArray{<:Complex{<:Union{Integer, Rational}}}, region) = $pf(complexfloat(x), region)
- end
+ pf = Symbol("plan_", f)
+ @eval begin
+ $f(x::DenseCuArray{<:Real}, region = 1:ndims(x)) = $f(complexfloat(x), region)
+ $pf(x::DenseCuArray{<:Real}, region) = $pf(complexfloat(x), region)
+ $f(x::DenseCuArray{<:Complex{<:Union{Integer, Rational}}}, region = 1:ndims(x)) = $f(complexfloat(x), region)
+ $pf(x::DenseCuArray{<:Complex{<:Union{Integer, Rational}}}, region) = $pf(complexfloat(x), region)
+ end
end
rfft(x::DenseCuArray{<:Union{Integer, Rational}}, region = 1:ndims(x)) = rfft(realfloat(x), region)
plan_rfft(x::DenseCuArray{<:Real}, region) = plan_rfft(realfloat(x), region)
function irfft(x::DenseCuArray{<:Union{Real, Integer, Rational}}, d::Integer, region = 1:ndims(x))
- irfft(complexfloat(x), d, region)
+ return irfft(complexfloat(x), d, region)
end
"""
@@ -145,27 +149,27 @@ All other dimensions are external batch dimensions.
This size Tuple is only used to determine the best set of consecutive dimension to be used for internal batching.
"""
function get_batch_dims(region, sz)
- internal_batch_dims = ()
- external_batch_dims = ()
- previous_transform_dim = 0
- best_gap_size = 0
- # iterate through the transform dimensions and one extra dim beyond the size to cover the external batch dims
- for t in (region..., length(sz)+1)
- # calculate the product only of consecutively non-transformed sizes
- if (t > previous_transform_dim+1)
- gap_size = prod(sz[(previous_transform_dim+1):(t-1)])
- if (gap_size > best_gap_size)
- best_gap_size = gap_size
- # the previously best dims were not the best. Add them to the external list.
- external_batch_dims = (external_batch_dims..., internal_batch_dims...)
- internal_batch_dims = Tuple((previous_transform_dim+1):(t-1))
- else
- external_batch_dims = Tuple(external_batch_dims..., (previous_transform_dim+1):(t-1))
- end
- end
- previous_transform_dim = t
- end
- return internal_batch_dims, external_batch_dims
+ internal_batch_dims = ()
+ external_batch_dims = ()
+ previous_transform_dim = 0
+ best_gap_size = 0
+ # iterate through the transform dimensions and one extra dim beyond the size to cover the external batch dims
+ for t in (region..., length(sz) + 1)
+ # calculate the product only of consecutively non-transformed sizes
+ if (t > previous_transform_dim + 1)
+ gap_size = prod(sz[(previous_transform_dim + 1):(t - 1)])
+ if (gap_size > best_gap_size)
+ best_gap_size = gap_size
+ # the previously best dims were not the best. Add them to the external list.
+ external_batch_dims = (external_batch_dims..., internal_batch_dims...)
+ internal_batch_dims = Tuple((previous_transform_dim + 1):(t - 1))
+ else
+ external_batch_dims = Tuple(external_batch_dims..., (previous_transform_dim + 1):(t - 1))
+ end
+ end
+ previous_transform_dim = t
+ end
+ return internal_batch_dims, external_batch_dims
end
# retrieves the size to allocate even if the external batch dimensions do no transform
@@ -173,8 +177,8 @@ get_osz(osz, x) = ntuple((d)->(d>length(osz) ? size(x, d) : osz[d]), ndims(x))
# returns a view of the front part of the dimensions of the array up to md dimensions
function front_view(X, md)
- t = ntuple((d)->ifelse(d<=md, Colon(), 1), ndims(X))
- @view X[t...]
+ t = ntuple((d) -> ifelse(d <= md, Colon(), 1), ndims(X))
+ return @view X[t...]
end
ensure_raising(num::Number) = num
@@ -184,8 +188,8 @@ ensure_raising(num::Number) = num
ensure_raising(sequence::NTuple{1, Int}) = sequence
ensure_raising(sequence::NTuple{2, Int}) = (sequence[1] < sequence[2]) ? sequence : sequence[2:-1:1]
ensure_raising(sequence::NTuple{3, Int}) = (sequence[1] < sequence[2]) ?
- ((sequence[2]<sequence[3]) ? sequence : (sequence[1]<sequence[3]) ? sequence[[1,3,2]] : sequence[[3,1,2]]) :
- ((sequence[1]<sequence[3]) ? sequence[[2,1,3]] : (sequence[2]<sequence[3]) ? sequence[[2,3,1]] : sequence[[3,2,1]])
+ ((sequence[2] < sequence[3]) ? sequence : (sequence[1] < sequence[3]) ? sequence[[1, 3, 2]] : sequence[[3, 1, 2]]) :
+ ((sequence[1] < sequence[3]) ? sequence[[2, 1, 3]] : (sequence[2] < sequence[3]) ? sequence[[2, 3, 1]] : sequence[[3, 2, 1]])
function ensure_raising(sequence::NTuple)
throw(ArgumentError("only up to three transform dimensions are allowed in one plan"))
end
@@ -198,128 +202,134 @@ end
# inference of calls like plan_fft(X), which is translated by AbstractFFTs.jl into
# plan_fft(X, 1:ndims(X)).
for f in (:plan_fft!, :plan_bfft!, :plan_fft, :plan_bfft)
- @eval begin
- Base.@constprop :aggressive function $f(X::DenseCuArray{T, N}, region) where {T <: cufftComplexes, N}
+ @eval begin
+ Base.@constprop :aggressive function $f(X::DenseCuArray{T, N}, region) where {T <: cufftComplexes, N}
region = unique(region)
- R = length(region)
- region = NTuple{R, Int}(region)
+ R = length(region)
+ region = NTuple{R, Int}(region)
region = ensure_raising(region)
- $f(X, region)
- end
- end
+ $f(X, region)
+ end
+ end
end
# inplace complex
function plan_fft!(X::DenseCuArray{T, N}, region::NTuple{R, Int}) where {T <: cufftComplexes, N, R}
- K = CUFFT_FORWARD
- inplace = true
+ K = CUFFT_FORWARD
+ inplace = true
region = ensure_raising(Tuple(unique(region)))
- handle = cufftGetPlan(T, T, size(X), region)
+ handle = cufftGetPlan(T, T, size(X), region)
- CuFFTPlan{T, T, K, inplace, N, R, Nothing}(handle, X, size(X), region, nothing)
+ return CuFFTPlan{T, T, K, inplace, N, R, Nothing}(handle, X, size(X), region, nothing)
end
function plan_bfft!(X::DenseCuArray{T, N}, region::NTuple{R, Int}) where {T <: cufftComplexes, N, R}
- K = CUFFT_INVERSE
- inplace = true
+ K = CUFFT_INVERSE
+ inplace = true
region = ensure_raising(Tuple(unique(region)))
- handle = cufftGetPlan(T, T, size(X), region)
+ handle = cufftGetPlan(T, T, size(X), region)
- CuFFTPlan{T, T, K, inplace, N, R, Nothing}(handle, X, size(X), region, nothing)
+ return CuFFTPlan{T, T, K, inplace, N, R, Nothing}(handle, X, size(X), region, nothing)
end
# out-of-place complex
function plan_fft(X::DenseCuArray{T, N}, region::NTuple{R, Int}) where {T <: cufftComplexes, N, R}
- K = CUFFT_FORWARD
- inplace = false
+ K = CUFFT_FORWARD
+ inplace = false
region = ensure_raising(Tuple(unique(region)))
- handle = cufftGetPlan(T, T, size(X), region)
+ handle = cufftGetPlan(T, T, size(X), region)
- CuFFTPlan{T, T, K, inplace, N, R, Nothing}(handle, X, size(X), region, nothing)
+ return CuFFTPlan{T, T, K, inplace, N, R, Nothing}(handle, X, size(X), region, nothing)
end
function plan_bfft(X::DenseCuArray{T, N}, region::NTuple{R, Int}) where {T <: cufftComplexes, N, R}
- K = CUFFT_INVERSE
- inplace = false
+ K = CUFFT_INVERSE
+ inplace = false
region = ensure_raising(Tuple(unique(region)))
- handle = cufftGetPlan(T, T, size(X), region)
+ handle = cufftGetPlan(T, T, size(X), region)
- CuFFTPlan{T, T, K, inplace, N, R, Nothing}(handle, size(X), size(X), region, nothing)
+ return CuFFTPlan{T, T, K, inplace, N, R, Nothing}(handle, size(X), size(X), region, nothing)
end
# out-of-place real-to-complex
Base.@constprop :aggressive function plan_rfft(X::DenseCuArray{T, N}, region) where {T <: cufftReals, N}
- # for rfft we cannot sort the transform dimensions, since the meaning in fftw is that the first dimension in the list is reduced.
- # so we let the plan throw an error
- R = length(region)
- region = NTuple{R, Int}(region)
- plan_rfft(X, region)
+ # for rfft we cannot sort the transform dimensions, since the meaning in fftw is that the first dimension in the list is reduced.
+ # so we let the plan throw an error
+ R = length(region)
+ region = NTuple{R, Int}(region)
+ plan_rfft(X, region)
end
function plan_rfft(X::DenseCuArray{T, N}, region::NTuple{R, Int}) where {T <: cufftReals, N, R}
- K = CUFFT_FORWARD
- inplace = false
- # for rfft we cannot sort the transform dimensions, since the meaning in fftw is that the first dimension in the list is reduced.
- # so we let the plan throw an error
+ K = CUFFT_FORWARD
+ inplace = false
+ # for rfft we cannot sort the transform dimensions, since the meaning in fftw is that the first dimension in the list is reduced.
+ # so we let the plan throw an error
- handle = cufftGetPlan(complex(T), T, size(X), region)
+ handle = cufftGetPlan(complex(T), T, size(X), region)
- xdims = size(X)
- ydims = Base.setindex(xdims, div(xdims[region[1]], 2) + 1, region[1])
+ xdims = size(X)
+ ydims = Base.setindex(xdims, div(xdims[region[1]], 2) + 1, region[1])
- # The buffer is not needed for real-to-complex (`mul!`),
- # but it’s required for complex-to-real (`ldiv!`).
- buffer = CuArray{complex(T)}(undef, ydims...)
- B = typeof(buffer)
+ # The buffer is not needed for real-to-complex (`mul!`),
+ # but it’s required for complex-to-real (`ldiv!`).
+ buffer = CuArray{complex(T)}(undef, ydims...)
+ B = typeof(buffer)
- CuFFTPlan{complex(T), T, K, inplace, N, R, B}(handle, size(X), (ydims...,), region, buffer)
+ return CuFFTPlan{complex(T), T, K, inplace, N, R, B}(handle, size(X), (ydims...,), region, buffer)
end
# out-of-place complex-to-real
Base.@constprop :aggressive function plan_brfft(X::DenseCuArray{T, N}, d::Integer, region) where {T <: cufftComplexes, N}
- # for rfft we cannot sort the transform dimensions, since the meaning in fftw is that the first dimension in the list is reduced.
- # so we let the plan throw an error
- R = length(region)
- region = NTuple{R, Int}(region)
- plan_brfft(X, d, region)
+ # for rfft we cannot sort the transform dimensions, since the meaning in fftw is that the first dimension in the list is reduced.
+ # so we let the plan throw an error
+ R = length(region)
+ region = NTuple{R, Int}(region)
+ plan_brfft(X, d, region)
end
function plan_brfft(X::DenseCuArray{T, N}, d::Integer, region::NTuple{R, Int}) where {T <: cufftComplexes, N, R}
- K = CUFFT_INVERSE
- inplace = false
- # for rfft we cannot sort, since the meaning in fftw is that the first dimension in the list is reduced.
- # so we let the plan throw an error
+ K = CUFFT_INVERSE
+ inplace = false
+ # for rfft we cannot sort, since the meaning in fftw is that the first dimension in the list is reduced.
+ # so we let the plan throw an error
- xdims = size(X)
- ydims = Base.setindex(xdims, d, region[1])
- handle = cufftGetPlan(real(T), T, ydims, region)
+ xdims = size(X)
+ ydims = Base.setindex(xdims, d, region[1])
+ handle = cufftGetPlan(real(T), T, ydims, region)
- buffer = CuArray{T}(undef, size(X))
- B = typeof(buffer)
+ buffer = CuArray{T}(undef, size(X))
+ B = typeof(buffer)
- CuFFTPlan{real(T), T, K, inplace, N, R, B}(handle, size(X), ydims, region, buffer)
+ return CuFFTPlan{real(T), T, K, inplace, N, R, B}(handle, size(X), ydims, region, buffer)
end
# FIXME: plan_inv methods allocate needlessly (to provide type parameters)
# Perhaps use FakeArray types to avoid this.
-function plan_inv(p::CuFFTPlan{T, S, CUFFT_INVERSE, inplace, N, R, B}
-) where {T <: cufftNumber, S <: cufftNumber, inplace, N, R, B}
- handle = cufftGetPlan(S, T, p.output_size, p.region)
- ScaledPlan(CuFFTPlan{S, T, CUFFT_FORWARD, inplace, N, R, B}(handle, p.output_size, p.input_size, p.region, p.buffer),
- normalization(real(T), p.output_size, p.region))
+function plan_inv(
+ p::CuFFTPlan{T, S, CUFFT_INVERSE, inplace, N, R, B}
+ ) where {T <: cufftNumber, S <: cufftNumber, inplace, N, R, B}
+ handle = cufftGetPlan(S, T, p.output_size, p.region)
+ return ScaledPlan(
+ CuFFTPlan{S, T, CUFFT_FORWARD, inplace, N, R, B}(handle, p.output_size, p.input_size, p.region, p.buffer),
+ normalization(real(T), p.output_size, p.region)
+ )
end
-function plan_inv(p::CuFFTPlan{T, S, CUFFT_FORWARD, inplace, N, R, B}
-) where {T <: cufftNumber, S <: cufftNumber, inplace, N, R, B}
- handle = cufftGetPlan(S, T, p.input_size, p.region)
- ScaledPlan(CuFFTPlan{S, T, CUFFT_INVERSE, inplace, N, R, B}(handle, p.output_size, p.input_size, p.region, p.buffer),
- normalization(real(S), p.input_size, p.region))
+function plan_inv(
+ p::CuFFTPlan{T, S, CUFFT_FORWARD, inplace, N, R, B}
+ ) where {T <: cufftNumber, S <: cufftNumber, inplace, N, R, B}
+ handle = cufftGetPlan(S, T, p.input_size, p.region)
+ return ScaledPlan(
+ CuFFTPlan{S, T, CUFFT_INVERSE, inplace, N, R, B}(handle, p.output_size, p.input_size, p.region, p.buffer),
+ normalization(real(S), p.input_size, p.region)
+ )
end
@@ -330,143 +340,152 @@ end
# see # JuliaGPU/CuArrays.jl#345, NVIDIA/cuFFT#2714055.
function assert_applicable(p::CuFFTPlan{T, S}, X::DenseCuArray{S}) where {T, S}
- (size(X) == p.input_size) ||
- throw(ArgumentError("CuFFT plan applied to wrong-size input"))
+ return (size(X) == p.input_size) ||
+ throw(ArgumentError("CuFFT plan applied to wrong-size input"))
end
-function assert_applicable(p::CuFFTPlan{T, S, K, inplace}, X::DenseCuArray{S},
- Y::DenseCuArray{T}) where {T, S, K, inplace}
- assert_applicable(p, X)
- if size(Y) != p.output_size
- throw(ArgumentError("CuFFT plan applied to wrong-size output"))
- elseif inplace != (pointer(X) == pointer(Y))
- throw(ArgumentError(string("CuFFT ",
- inplace ? "in-place" : "out-of-place",
- " plan applied to ",
- inplace ? "out-of-place" : "in-place",
- " data")))
- end
+function assert_applicable(
+ p::CuFFTPlan{T, S, K, inplace}, X::DenseCuArray{S},
+ Y::DenseCuArray{T}
+ ) where {T, S, K, inplace}
+ assert_applicable(p, X)
+ return if size(Y) != p.output_size
+ throw(ArgumentError("CuFFT plan applied to wrong-size output"))
+ elseif inplace != (pointer(X) == pointer(Y))
+ throw(
+ ArgumentError(
+ string(
+ "CuFFT ",
+ inplace ? "in-place" : "out-of-place",
+ " plan applied to ",
+ inplace ? "out-of-place" : "in-place",
+ " data"
+ )
+ )
+ )
+ end
end
function unsafe_execute!(plan::CuFFTPlan{T, S, K, inplace}, x::DenseCuArray{S}, y::DenseCuArray{T}) where {T, S, K, inplace}
- update_stream(plan)
- cufftXtExec(plan, x, y, K)
+ update_stream(plan)
+ return cufftXtExec(plan, x, y, K)
end
# a version of unsafe_execute which applies the plan to each element of external batch dimensions not covered by the plan.
# Note that for plans, with external batch non-transform dimensions views are created for each of such elements.
# Such views each have lower dimensions and are then transformed by the lower dimension low-level Cuda plan.
function unsafe_execute_external_batches!(p::CuFFTPlan{T, S, K, inplace}, x, y) where {T, S, K, inplace}
- # For R2C/C2R transforms, dimension 1 should not be in external batches (alignment requirement)
- internal_batch_dims, external_batch_dims = get_batch_dims(p.region, p.output_size)
- if isempty(external_batch_dims)
- unsafe_execute!(p, x, y)
- else
- # flatten the memory as a view as otherwise the intput is not correctly interpreted as a contiguous Cuda memory
- external_batch_ids = [external_batch_dims...]
- batch_strides_x = (1, cumprod(size(x))...)[external_batch_ids]
- batch_strides_y = (1, cumprod(size(y))...)[external_batch_ids]
- did_skip_x = false
- to_skip_x = 0
- did_skip_y = false
- to_skip_y = 0
- for c in CartesianIndices(size(x)[external_batch_ids])
- batch_start_x = sum(batch_strides_x .* (Tuple(c) .- 1)) + 1
- batch_start_y = sum(batch_strides_y .* (Tuple(c) .- 1)) + 1
- if (eltype(x) <: Real && iseven(batch_start_x))
- did_skip_x = true
- if (to_skip_x == 0)
- to_skip_x = batch_start_x - 1
- end
- continue;
- end
- if (eltype(y) <: Real && iseven(batch_start_y))
- did_skip_y = true
- if (to_skip_y == 0)
- to_skip_y = batch_start_y - 1
- end
- continue;
- end
- vx = @view x[batch_start_x:end]
- vy = @view y[batch_start_y:end]
- unsafe_execute!(p, vx, vy)
- end
- # If there was at least one skip due to real Float32 alignment, we need to cyclicly rotate the whole array in place and run again
- if (did_skip_x || did_skip_y)
- extra_x_index = 0
- extra_y_index = 0
- if (did_skip_x)
- extra_y_index = 1
- circshift!((@view x[:]), -to_skip_x)
- end
- if (did_skip_y)
- extra_x_index = 1
- circshift!((@view y[:]), -to_skip_y)
- end
- for c in CartesianIndices(size(x)[external_batch_ids])
- batch_start_x = sum(batch_strides_x .* (Tuple(c) .- 1)) + 1 + extra_x_index
- batch_start_y = sum(batch_strides_y .* (Tuple(c) .- 1)) + 1 + extra_y_index
- if (eltype(x) <: Real && iseven(batch_start_x))
- continue;
- end
- if (eltype(y) <: Real && iseven(batch_start_y))
- continue;
- end
- vx = @view x[batch_start_x:end]
- vy = @view y[batch_start_y:end]
- unsafe_execute!(p, vx, vy)
- end
- if (did_skip_x)
- circshift!((@view x[:]), 1)
- end
- if (did_skip_y)
- circshift!((@view y[:]), 1)
- end
- end
- end
- return
+ # For R2C/C2R transforms, dimension 1 should not be in external batches (alignment requirement)
+ internal_batch_dims, external_batch_dims = get_batch_dims(p.region, p.output_size)
+ if isempty(external_batch_dims)
+ unsafe_execute!(p, x, y)
+ else
+        # flatten the memory as a view as otherwise the input is not correctly interpreted as a contiguous Cuda memory
+ external_batch_ids = [external_batch_dims...]
+ batch_strides_x = (1, cumprod(size(x))...)[external_batch_ids]
+ batch_strides_y = (1, cumprod(size(y))...)[external_batch_ids]
+ did_skip_x = false
+ to_skip_x = 0
+ did_skip_y = false
+ to_skip_y = 0
+ for c in CartesianIndices(size(x)[external_batch_ids])
+ batch_start_x = sum(batch_strides_x .* (Tuple(c) .- 1)) + 1
+ batch_start_y = sum(batch_strides_y .* (Tuple(c) .- 1)) + 1
+ if (eltype(x) <: Real && iseven(batch_start_x))
+ did_skip_x = true
+ if (to_skip_x == 0)
+ to_skip_x = batch_start_x - 1
+ end
+ continue
+ end
+ if (eltype(y) <: Real && iseven(batch_start_y))
+ did_skip_y = true
+ if (to_skip_y == 0)
+ to_skip_y = batch_start_y - 1
+ end
+ continue
+ end
+ vx = @view x[batch_start_x:end]
+ vy = @view y[batch_start_y:end]
+ unsafe_execute!(p, vx, vy)
+ end
+        # If there was at least one skip due to real Float32 alignment, we need to cyclically rotate the whole array in place and run again
+ if (did_skip_x || did_skip_y)
+ extra_x_index = 0
+ extra_y_index = 0
+ if (did_skip_x)
+ extra_y_index = 1
+ circshift!((@view x[:]), -to_skip_x)
+ end
+ if (did_skip_y)
+ extra_x_index = 1
+ circshift!((@view y[:]), -to_skip_y)
+ end
+ for c in CartesianIndices(size(x)[external_batch_ids])
+ batch_start_x = sum(batch_strides_x .* (Tuple(c) .- 1)) + 1 + extra_x_index
+ batch_start_y = sum(batch_strides_y .* (Tuple(c) .- 1)) + 1 + extra_y_index
+ if (eltype(x) <: Real && iseven(batch_start_x))
+ continue
+ end
+ if (eltype(y) <: Real && iseven(batch_start_y))
+ continue
+ end
+ vx = @view x[batch_start_x:end]
+ vy = @view y[batch_start_y:end]
+ unsafe_execute!(p, vx, vy)
+ end
+ if (did_skip_x)
+ circshift!((@view x[:]), 1)
+ end
+ if (did_skip_y)
+ circshift!((@view y[:]), 1)
+ end
+ end
+ end
+ return
end
## high-level integrations
-function LinearAlgebra.mul!(y::DenseCuArray{T}, p::CuFFTPlan{T, S, K, inplace}, x::DenseCuArray{S}
-) where {T, S, K, inplace}
- assert_applicable(p, x, y)
- if !inplace && T<:Real
- # Out-of-place complex-to-real FFT will always overwrite input x.
- # We copy the input x in an auxiliary buffer.
- z = p.buffer
- copyto!(z, x)
- else
- z = x
- end
- unsafe_execute_external_batches!(p, z, y)
- y
+function LinearAlgebra.mul!(
+ y::DenseCuArray{T}, p::CuFFTPlan{T, S, K, inplace}, x::DenseCuArray{S}
+ ) where {T, S, K, inplace}
+ assert_applicable(p, x, y)
+ if !inplace && T <: Real
+ # Out-of-place complex-to-real FFT will always overwrite input x.
+ # We copy the input x in an auxiliary buffer.
+ z = p.buffer
+ copyto!(z, x)
+ else
+ z = x
+ end
+ unsafe_execute_external_batches!(p, z, y)
+ return y
end
function Base.:(*)(p::CuFFTPlan{T, S, K, true}, x::DenseCuArray{S}) where {T, S, K}
- assert_applicable(p, x)
- unsafe_execute_external_batches!(p, x, x)
- x
+ assert_applicable(p, x)
+ unsafe_execute_external_batches!(p, x, x)
+ return x
end
function Base.:(*)(p::CuFFTPlan{T, S, K, false}, x::DenseCuArray{S1, M}) where {T, S, K, S1, M}
- if T<:Real
- # Out-of-place complex-to-real FFT will always overwrite input x.
- # We copy the input x in an auxiliary buffer.
- z = p.buffer
- copyto!(z, x)
- else
- if S1 != S
- # Convert to the expected input type.
- z = copy1(S, x)
- else
- z = x
- end
- end
- assert_applicable(p, z)
- y = CuArray{T, M}(undef, p.output_size)
- unsafe_execute_external_batches!(p, z, y)
- y
+ if T <: Real
+ # Out-of-place complex-to-real FFT will always overwrite input x.
+ # We copy the input x in an auxiliary buffer.
+ z = p.buffer
+ copyto!(z, x)
+ else
+ if S1 != S
+ # Convert to the expected input type.
+ z = copy1(S, x)
+ else
+ z = x
+ end
+ end
+ assert_applicable(p, z)
+ y = CuArray{T, M}(undef, p.output_size)
+ unsafe_execute_external_batches!(p, z, y)
+ return y
end
diff --git a/lib/cufft/wrappers.jl b/lib/cufft/wrappers.jl
index 82eb64d6e..7d0f9669f 100644
--- a/lib/cufft/wrappers.jl
+++ b/lib/cufft/wrappers.jl
@@ -1,14 +1,15 @@
# wrappers of low-level functionality
function cufftGetProperty(property::libraryPropertyType)
- value_ref = Ref{Cint}()
- cufftGetProperty(property, value_ref)
- value_ref[]
+ value_ref = Ref{Cint}()
+ cufftGetProperty(property, value_ref)
+ return value_ref[]
end
version() = VersionNumber(cufftGetProperty(CUDA.MAJOR_VERSION),
- cufftGetProperty(CUDA.MINOR_VERSION),
- cufftGetProperty(CUDA.PATCH_LEVEL))
+ cufftGetProperty(CUDA.MINOR_VERSION),
+ cufftGetProperty(CUDA.PATCH_LEVEL)
+)
"""
cufftMakePlan(output_type::Type{<:cufftNumber}, input_type::Type{<:cufftNumber}, input_size::Dims, region)
@@ -22,103 +23,107 @@ low level interface to the CUDA library CuFFT for the function cufftXtMakePlanMa
* `region`: dimensions of the array to transform
"""
function cufftMakePlan(output_type::Type{<:cufftNumber}, input_type::Type{<:cufftNumber}, input_size::Dims, region)
- if any(diff(collect(region)) .< 1)
- throw(ArgumentError("transformed dims for rfft-type transforms must be an increasing sequence"))
- end
- if any(region .< 1 .|| region .> length(input_size))
- throw(ArgumentError("transformed dims can only refer to valid dimensions"))
- end
- nrank = length(region)
-
- sz = ntuple((d) -> input_size[region[d]], nrank)
- csz = ntuple((d) -> (d==1) ? div(input_size[region[d]], 2) + 1 : input_size[region[d]], nrank)
-
- # all sizes which are not part of the dimensions specified by region are batch dimensions.
- num_internal_batches = prod(input_size) ÷ prod(sz)
- cdims = ntuple((d) -> (d==region[1]) ? div(input_size[d], 2) + 1 : input_size[d], length(input_size))
-
- # make the plan
- worksize_ref = Ref{Csize_t}()
- rsz = length(sz) > 1 ? rsz = reverse(sz) : sz
- if nrank > 3
- throw(ArgumentError("only up to three transform dimensions are allowed in one plan"))
- end
-
- # initialize the plan handle
- handle_ref = Ref{cufftHandle}()
- cufftCreate(handle_ref)
- handle = handle_ref[]
-
- if (region...,) == ((1:nrank)...,)
- # handle simple case, transforming the first nrank dimensions, ... simply! (for robustness)
- # arguments are: plan, rank, transform-sizes, inembed, istride, idist, itype, onembed, ostride, odist, otype, batch
- execution_type = promote_type(input_type, output_type)
- cufftXtMakePlanMany(handle, nrank, Clonglong[rsz...],
- C_NULL, 1, 1, convert(cudaDataType, input_type),
- C_NULL, 1, 1, convert(cudaDataType, output_type),
- num_internal_batches, worksize_ref, convert(cudaDataType, execution_type))
- else
- # reduce the array to the final transform direction.
- # This situation will be picked up in the application of the plan later.
- internal_batch_dims, external_batch_dims = get_batch_dims(region, input_size)
- # plan only for the internal dims and the external dims will be handled later
-
- # the internal batching is over the largest consecutive batch indices.
- # Since they are consecutive they can all be batched by a single batch_stride "i_dist".
- idist = prod((1, input_size...)[1:internal_batch_dims[1]])
-
- cdist = prod((1, cdims...)[1:internal_batch_dims[1]])
- istride = prod((1, input_size...)[1:region[1]])
-
- # inembed are the products of the sizes between the transform directions
- all_strides = cumprod((1, input_size...))
- if (nrank == 1)
- # For 1D transforms, inembed[0] must be >= n[0] (the transform size)
- # Using idist gives the embedded storage size which satisfies this requirement
- inembed = Clonglong[idist,]
- cnembed = Clonglong[cdist,]
- elseif (nrank == 2)
- # inembed[0] must be >= n[0] (the larger transform dim in C-order)
- # Using idist ensures sufficient storage is indicated
- inembed = Clonglong[idist, prod(input_size[region[1]:(region[2]-1)])]
- cnembed = Clonglong[cdist, prod(cdims[region[1]:(region[2]-1)])]
- elseif (nrank == 3)
- inembed = Clonglong[idist, prod(input_size[region[2]:(region[3]-1)]), prod(input_size[region[1]:(region[2]-1)])]
- cnembed = Clonglong[cdist, prod(cdims[region[2]:(region[3]-1)]), prod(cdims[region[1]:(region[2]-1)])]
- end
-
- # internal number of batches are product of the sizes at the internal_batch_dims
- num_internal_batches = prod(input_size[[internal_batch_dims...]])
-
- # in C-style notation:
- # 2D: input[ b * idist + (x * inembed[1] + y) * istride ]
- # 3D: input[ b * idist + ((x * inembed[1] + y) * inembed[2] + z) * istride ]
- # first transform dimension stride, assuming internal batching over outer dimensions
- # batching stride (datatype size dependent)
-
- ostride = istride
- # if input_type is real, the output will be half complex, so the output strides are modified
- if input_type <: Real
- odist = cdist
- onembed = cnembed
- else
- odist = idist
- onembed = inembed
- end
- if output_type <: Real
- idist = cdist
- inembed = cnembed
- end
-
- execution_type = promote_type(input_type, output_type)
-
- res = cufftXtMakePlanMany(handle, nrank, Clonglong[rsz...],
- inembed, istride, idist, convert(cudaDataType, input_type),
- onembed, ostride, odist, convert(cudaDataType, output_type),
- num_internal_batches, worksize_ref, convert(cudaDataType, execution_type))
- end
-
- handle, worksize_ref[]
+ if any(diff(collect(region)) .< 1)
+ throw(ArgumentError("transformed dims for rfft-type transforms must be an increasing sequence"))
+ end
+ if any(region .< 1 .|| region .> length(input_size))
+ throw(ArgumentError("transformed dims can only refer to valid dimensions"))
+ end
+ nrank = length(region)
+
+ sz = ntuple((d) -> input_size[region[d]], nrank)
+ csz = ntuple((d) -> (d == 1) ? div(input_size[region[d]], 2) + 1 : input_size[region[d]], nrank)
+
+ # all sizes which are not part of the dimensions specified by region are batch dimensions.
+ num_internal_batches = prod(input_size) ÷ prod(sz)
+ cdims = ntuple((d) -> (d == region[1]) ? div(input_size[d], 2) + 1 : input_size[d], length(input_size))
+
+ # make the plan
+ worksize_ref = Ref{Csize_t}()
+ rsz = length(sz) > 1 ? rsz = reverse(sz) : sz
+ if nrank > 3
+ throw(ArgumentError("only up to three transform dimensions are allowed in one plan"))
+ end
+
+ # initialize the plan handle
+ handle_ref = Ref{cufftHandle}()
+ cufftCreate(handle_ref)
+ handle = handle_ref[]
+
+ if (region...,) == ((1:nrank)...,)
+ # handle simple case, transforming the first nrank dimensions, ... simply! (for robustness)
+ # arguments are: plan, rank, transform-sizes, inembed, istride, idist, itype, onembed, ostride, odist, otype, batch
+ execution_type = promote_type(input_type, output_type)
+ cufftXtMakePlanMany(
+ handle, nrank, Clonglong[rsz...],
+ C_NULL, 1, 1, convert(cudaDataType, input_type),
+ C_NULL, 1, 1, convert(cudaDataType, output_type),
+ num_internal_batches, worksize_ref, convert(cudaDataType, execution_type)
+ )
+ else
+ # reduce the array to the final transform direction.
+ # This situation will be picked up in the application of the plan later.
+ internal_batch_dims, external_batch_dims = get_batch_dims(region, input_size)
+ # plan only for the internal dims and the external dims will be handled later
+
+ # the internal batching is over the largest consecutive batch indices.
+ # Since they are consecutive they can all be batched by a single batch_stride "i_dist".
+ idist = prod((1, input_size...)[1:internal_batch_dims[1]])
+
+ cdist = prod((1, cdims...)[1:internal_batch_dims[1]])
+ istride = prod((1, input_size...)[1:region[1]])
+
+ # inembed are the products of the sizes between the transform directions
+ all_strides = cumprod((1, input_size...))
+ if (nrank == 1)
+ # For 1D transforms, inembed[0] must be >= n[0] (the transform size)
+ # Using idist gives the embedded storage size which satisfies this requirement
+ inembed = Clonglong[idist]
+ cnembed = Clonglong[cdist]
+ elseif (nrank == 2)
+ # inembed[0] must be >= n[0] (the larger transform dim in C-order)
+ # Using idist ensures sufficient storage is indicated
+ inembed = Clonglong[idist, prod(input_size[region[1]:(region[2] - 1)])]
+ cnembed = Clonglong[cdist, prod(cdims[region[1]:(region[2] - 1)])]
+ elseif (nrank == 3)
+ inembed = Clonglong[idist, prod(input_size[region[2]:(region[3] - 1)]), prod(input_size[region[1]:(region[2] - 1)])]
+ cnembed = Clonglong[cdist, prod(cdims[region[2]:(region[3] - 1)]), prod(cdims[region[1]:(region[2] - 1)])]
+ end
+
+ # internal number of batches are product of the sizes at the internal_batch_dims
+ num_internal_batches = prod(input_size[[internal_batch_dims...]])
+
+ # in C-style notation:
+ # 2D: input[ b * idist + (x * inembed[1] + y) * istride ]
+ # 3D: input[ b * idist + ((x * inembed[1] + y) * inembed[2] + z) * istride ]
+ # first transform dimension stride, assuming internal batching over outer dimensions
+ # batching stride (datatype size dependent)
+
+ ostride = istride
+ # if input_type is real, the output will be half complex, so the output strides are modified
+ if input_type <: Real
+ odist = cdist
+ onembed = cnembed
+ else
+ odist = idist
+ onembed = inembed
+ end
+ if output_type <: Real
+ idist = cdist
+ inembed = cnembed
+ end
+
+ execution_type = promote_type(input_type, output_type)
+
+ res = cufftXtMakePlanMany(
+ handle, nrank, Clonglong[rsz...],
+ inembed, istride, idist, convert(cudaDataType, input_type),
+ onembed, ostride, odist, convert(cudaDataType, output_type),
+ num_internal_batches, worksize_ref, convert(cudaDataType, execution_type)
+ )
+ end
+
+ return handle, worksize_ref[]
end
@@ -126,30 +131,30 @@ end
const cufftHandleCacheKey = Tuple{CuContext, Type, Type, Dims, Any}
function handle_ctor((ctx, args...))
- context!(ctx) do
- # make the plan
- handle, worksize = cufftMakePlan(args...)
-
- # NOTE: we currently do not use the worksize to allocate our own workarea,
- # instead relying on the automatic allocation strategy.
- handle
- end
+ return context!(ctx) do
+ # make the plan
+ handle, worksize = cufftMakePlan(args...)
+
+ # NOTE: we currently do not use the worksize to allocate our own workarea,
+ # instead relying on the automatic allocation strategy.
+ handle
+ end
end
function handle_dtor((ctx, args...), handle)
- context!(ctx; skip_destroyed = true) do
- cufftDestroy(handle)
- end
+ return context!(ctx; skip_destroyed = true) do
+ cufftDestroy(handle)
+ end
end
const idle_handles = HandleCache{cufftHandleCacheKey, cufftHandle}(handle_ctor, handle_dtor)
function cufftGetPlan(args...)
- ctx = context()
- handle = pop!(idle_handles, (ctx, args...))
+ ctx = context()
+ handle = pop!(idle_handles, (ctx, args...))
- cufftSetStream(handle, stream())
+ cufftSetStream(handle, stream())
- return handle
+ return handle
end
function cufftReleasePlan(plan)
- push!(idle_handles, plan)
+ return push!(idle_handles, plan)
end
diff --git a/test/libraries/cufft.jl b/test/libraries/cufft.jl
index fa3cabaf5..37261be9a 100644
--- a/test/libraries/cufft.jl
+++ b/test/libraries/cufft.jl
@@ -157,7 +157,7 @@ function in_place(X::AbstractArray{T,N}) where {T <: Complex,N}
@test isapprox(Z, X, rtol = rtol(T), atol = atol(T))
end
-function batched(X::AbstractArray{T,N}, region) where {T <: Complex,N}
+function batched(X::AbstractArray{T, N}, region) where {T <: Complex, N}
fftw_X = fft(X,region)
d_X = CuArray(X)
p = plan_fft(d_X,region)
@@ -244,51 +244,51 @@ end
@testset "Batch 2D (in 3D)" begin
dims = (N1,N2,N3)
- for region in [(1,2),(2,3),(1,3),(3,1)]
+ for region in [(1, 2), (2, 3), (1, 3), (3, 1)]
X = rand(T, dims)
batched(X,region)
end
- # @test_throws ArgumentError batched(X,(3,1))
+ # @test_throws ArgumentError batched(X,(3,1))
end
@testset "Batch 2D (in 4D)" begin
dims = (N1,N2,N3,N4)
- for region in [(1,2),(1,3),(3,),(2,),(2,3)]
- X = rand(T, dims)
- batched(X,region)
- end
+ for region in [(1, 2), (1, 3), (3,), (2,), (2, 3)]
+ X = rand(T, dims)
+ batched(X, region)
+ end
- for region in [(1,4),(3,4)]
- X = rand(T, dims)
- if (T <: ComplexF16)
- # for odd dimensions covering axes of or between transform axes
- # there is a rather unspecific CUFFTError: "CUFFT_INCOMPLETE_PARAMETER_LIST" thrown. This should be caught elsewhere to give more specific hint
- # on how to avoid this. Maybe as of Cuda 13 this issue is removed?
- @test_throws CUFFTError batched(X, region)
- else
- batched(X,region)
+ for region in [(1, 4), (3, 4)]
+ X = rand(T, dims)
+ if (T <: ComplexF16)
+ # for odd dimensions covering axes of or between transform axes
+ # there is a rather unspecific CUFFTError: "CUFFT_INCOMPLETE_PARAMETER_LIST" thrown. This should be caught elsewhere to give more specific hint
+ # on how to avoid this. Maybe as of Cuda 13 this issue is removed?
+ @test_throws CUFFTError batched(X, region)
+ else
+ batched(X, region)
+ end
end
- end
-end
+ end
-@testset "Batch 3D (in 5D)" begin
- dims = (N1,N2,N3,N4,N5)
- for region in [(1,2,3),(2,3,5)]
+ @testset "Batch 3D (in 5D)" begin
+ dims = (N1, N2, N3, N4, N5)
+ for region in [(1, 2, 3), (2, 3, 5)]
X = rand(T, dims)
batched(X,region)
end
- for region in [(1,2,4),(2,3,4),(3,4,5)]
+ for region in [(1, 2, 4), (2, 3, 4), (3, 4, 5)]
X = rand(T, dims)
- if (T <: ComplexF16)
- # for odd dimensions covering axes of or between transform axes
- # there is a rather unspecific CUFFTError: "CUFFT_INCOMPLETE_PARAMETER_LIST" thrown. This should be caught elsewhere to give more specific hint
- # on how to avoid this. Maybe as of Cuda 13 this issue is removed?
- @test_throws CUFFTError batched(X,region)
- else
- batched(X,region)
- end
+ if (T <: ComplexF16)
+ # for odd dimensions covering axes of or between transform axes
+ # there is a rather unspecific CUFFTError: "CUFFT_INCOMPLETE_PARAMETER_LIST" thrown. This should be caught elsewhere to give more specific hint
+ # on how to avoid this. Maybe as of Cuda 13 this issue is removed?
+ @test_throws CUFFTError batched(X, region)
+ else
+ batched(X, region)
+ end
end
end
@@ -342,12 +342,12 @@ end
@testset "ensure_raising" begin
- @test (1,2,3) == CUDA.CUFFT.ensure_raising((1,2,3))
- @test (1,2,3) == CUDA.CUFFT.ensure_raising((2,1,3))
- @test (1,2,3) == CUDA.CUFFT.ensure_raising((1,3,2))
- @test (1,2,3) == CUDA.CUFFT.ensure_raising((3,1,2))
- @test (1,2,3) == CUDA.CUFFT.ensure_raising((2,3,1))
- @test (10,20,30) == CUDA.CUFFT.ensure_raising((30,20,10))
+ @test (1, 2, 3) == CUDA.CUFFT.ensure_raising((1, 2, 3))
+ @test (1, 2, 3) == CUDA.CUFFT.ensure_raising((2, 1, 3))
+ @test (1, 2, 3) == CUDA.CUFFT.ensure_raising((1, 3, 2))
+ @test (1, 2, 3) == CUDA.CUFFT.ensure_raising((3, 1, 2))
+ @test (1, 2, 3) == CUDA.CUFFT.ensure_raising((2, 3, 1))
+ @test (10, 20, 30) == CUDA.CUFFT.ensure_raising((30, 20, 10))
end
@testset for T in [Float16, Float32, Float64]
@@ -385,30 +385,30 @@ end
X = rand(T, dims)
- # This should only throw an error for rfft type transforms:
+ # This should only throw an error for rfft type transforms:
@test_throws ArgumentError batched(X,(3,1))
end
@testset "Batch 2D (in 4D)" begin
dims = (N1,N2,N3,N4)
- for region in [(1,2),(1,3),(2,3)]
+ for region in [(1, 2), (1, 3), (2, 3)]
X = rand(T, dims)
batched(X,region)
end
- for region in [(2,4),(1,4),(3,4)]
- if (T <: Float16)
- # for odd dimensions covering axes of or between transform axes
- # there is a rather unspecific CUFFTError: "CUFFT_INCOMPLETE_PARAMETER_LIST" thrown. This should be caught elsewhere to give more specific hint
- # on how to avoid this. Maybe as of Cuda 13 this issue is removed?
- X = rand(T, dims);
- @test_throws CUFFTError batched(X,region)
- else
- batched(X,region)
- end
+ for region in [(2, 4), (1, 4), (3, 4)]
+ if (T <: Float16)
+ # for odd dimensions covering axes of or between transform axes
+ # there is a rather unspecific CUFFTError: "CUFFT_INCOMPLETE_PARAMETER_LIST" thrown. This should be caught elsewhere to give more specific hint
+ # on how to avoid this. Maybe as of Cuda 13 this issue is removed?
+ X = rand(T, dims)
+ @test_throws CUFFTError batched(X, region)
+ else
+ batched(X, region)
+ end
end
- X = rand(T, dims)
- @test_throws ArgumentError batched(X,(3,1))
+ X = rand(T, dims)
+ @test_throws ArgumentError batched(X, (3, 1))
end
@testset "3D" begin |
There was a problem hiding this comment.
CUDA.jl Benchmarks
Details
| Benchmark suite | Current: 4f0148b | Previous: f7b7929 | Ratio |
|---|---|---|---|
latency/precompile |
44914330045.5 ns |
45018748344 ns |
1.00 |
latency/ttfp |
12761270340 ns |
12770284486 ns |
1.00 |
latency/import |
3544756834 ns |
3541917719 ns |
1.00 |
integration/volumerhs |
9439312 ns |
9450947.5 ns |
1.00 |
integration/byval/slices=1 |
145893 ns |
146127 ns |
1.00 |
integration/byval/slices=3 |
423210 ns |
423159 ns |
1.00 |
integration/byval/reference |
143988 ns |
143932 ns |
1.00 |
integration/byval/slices=2 |
284786 ns |
284759.5 ns |
1.00 |
integration/cudadevrt |
102555 ns |
102551 ns |
1.00 |
kernel/indexing |
13248 ns |
13204 ns |
1.00 |
kernel/indexing_checked |
14156 ns |
13977 ns |
1.01 |
kernel/occupancy |
697.675 ns |
664.05625 ns |
1.05 |
kernel/launch |
2153.3333333333335 ns |
2163.9444444444443 ns |
1.00 |
kernel/rand |
16713 ns |
18131 ns |
0.92 |
array/reverse/1d |
18460 ns |
18471 ns |
1.00 |
array/reverse/2dL_inplace |
65937 ns |
65988 ns |
1.00 |
array/reverse/1dL |
68991 ns |
69022 ns |
1.00 |
array/reverse/2d |
20716 ns |
20733 ns |
1.00 |
array/reverse/1d_inplace |
10273.666666666666 ns |
8573 ns |
1.20 |
array/reverse/2d_inplace |
10513 ns |
10232 ns |
1.03 |
array/reverse/2dL |
72859 ns |
72825 ns |
1.00 |
array/reverse/1dL_inplace |
65949 ns |
65937 ns |
1.00 |
array/copy |
18699 ns |
18988 ns |
0.98 |
array/iteration/findall/int |
150125.5 ns |
150059 ns |
1.00 |
array/iteration/findall/bool |
132095 ns |
132365.5 ns |
1.00 |
array/iteration/findfirst/int |
83445 ns |
83639 ns |
1.00 |
array/iteration/findfirst/bool |
81374 ns |
81468 ns |
1.00 |
array/iteration/scalar |
68598 ns |
66443.5 ns |
1.03 |
array/iteration/logical |
206181 ns |
200236 ns |
1.03 |
array/iteration/findmin/1d |
88901 ns |
86614.5 ns |
1.03 |
array/iteration/findmin/2d |
117453.5 ns |
117241 ns |
1.00 |
array/reductions/reduce/Int64/1d |
42922 ns |
42766 ns |
1.00 |
array/reductions/reduce/Int64/dims=1 |
44043.5 ns |
52907 ns |
0.83 |
array/reductions/reduce/Int64/dims=2 |
59759 ns |
60231 ns |
0.99 |
array/reductions/reduce/Int64/dims=1L |
87702 ns |
87828 ns |
1.00 |
array/reductions/reduce/Int64/dims=2L |
84702 ns |
84956.5 ns |
1.00 |
array/reductions/reduce/Float32/1d |
34967 ns |
34964 ns |
1.00 |
array/reductions/reduce/Float32/dims=1 |
49282.5 ns |
40442.5 ns |
1.22 |
array/reductions/reduce/Float32/dims=2 |
57050.5 ns |
57125 ns |
1.00 |
array/reductions/reduce/Float32/dims=1L |
52100 ns |
52000 ns |
1.00 |
array/reductions/reduce/Float32/dims=2L |
69613 ns |
69982.5 ns |
0.99 |
array/reductions/mapreduce/Int64/1d |
43257.5 ns |
42509 ns |
1.02 |
array/reductions/mapreduce/Int64/dims=1 |
42737 ns |
42334 ns |
1.01 |
array/reductions/mapreduce/Int64/dims=2 |
59688 ns |
59835 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=1L |
87707 ns |
87864 ns |
1.00 |
array/reductions/mapreduce/Int64/dims=2L |
85018 ns |
85164 ns |
1.00 |
array/reductions/mapreduce/Float32/1d |
34634 ns |
34719 ns |
1.00 |
array/reductions/mapreduce/Float32/dims=1 |
40447.5 ns |
45273 ns |
0.89 |
array/reductions/mapreduce/Float32/dims=2 |
56655 ns |
56959 ns |
0.99 |
array/reductions/mapreduce/Float32/dims=1L |
51783 ns |
52179 ns |
0.99 |
array/reductions/mapreduce/Float32/dims=2L |
69462.5 ns |
69729 ns |
1.00 |
array/broadcast |
20660 ns |
20464 ns |
1.01 |
array/copyto!/gpu_to_gpu |
11207 ns |
11261 ns |
1.00 |
array/copyto!/cpu_to_gpu |
215049 ns |
216266 ns |
0.99 |
array/copyto!/gpu_to_cpu |
284996.5 ns |
282685.5 ns |
1.01 |
array/accumulate/Int64/1d |
118721 ns |
119363 ns |
0.99 |
array/accumulate/Int64/dims=1 |
79981.5 ns |
80474 ns |
0.99 |
array/accumulate/Int64/dims=2 |
157035 ns |
157437.5 ns |
1.00 |
array/accumulate/Int64/dims=1L |
1706343 ns |
1706725 ns |
1.00 |
array/accumulate/Int64/dims=2L |
961502.5 ns |
962008 ns |
1.00 |
array/accumulate/Float32/1d |
101269 ns |
101483 ns |
1.00 |
array/accumulate/Float32/dims=1 |
76864.5 ns |
77247 ns |
1.00 |
array/accumulate/Float32/dims=2 |
143796 ns |
143932 ns |
1.00 |
array/accumulate/Float32/dims=1L |
1594511.5 ns |
1593993 ns |
1.00 |
array/accumulate/Float32/dims=2L |
660393 ns |
660832 ns |
1.00 |
array/construct |
1285 ns |
1332.6 ns |
0.96 |
array/random/randn/Float32 |
39043 ns |
38567.5 ns |
1.01 |
array/random/randn!/Float32 |
31501 ns |
31716 ns |
0.99 |
array/random/rand!/Int64 |
34294 ns |
34263.5 ns |
1.00 |
array/random/rand!/Float32 |
8695.333333333334 ns |
8628 ns |
1.01 |
array/random/rand/Int64 |
37230 ns |
30788.5 ns |
1.21 |
array/random/rand/Float32 |
13347 ns |
13144 ns |
1.02 |
array/permutedims/4d |
52317 ns |
52096 ns |
1.00 |
array/permutedims/2d |
52613.5 ns |
52583 ns |
1.00 |
array/permutedims/3d |
53375.5 ns |
53461 ns |
1.00 |
array/sorting/1d |
2735141 ns |
2734388 ns |
1.00 |
array/sorting/by |
3314264.5 ns |
3327876 ns |
1.00 |
array/sorting/2d |
1068839 ns |
1072450 ns |
1.00 |
cuda/synchronization/stream/auto |
1069.2 ns |
1031.7 ns |
1.04 |
cuda/synchronization/stream/nonblocking |
7549.1 ns |
7628.4 ns |
0.99 |
cuda/synchronization/stream/blocking |
836.8414634146342 ns |
827.9 ns |
1.01 |
cuda/synchronization/context/auto |
1213.8 ns |
1165.1 ns |
1.04 |
cuda/synchronization/context/nonblocking |
7339.299999999999 ns |
7638.9 ns |
0.96 |
cuda/synchronization/context/blocking |
939.8235294117648 ns |
925.0566037735849 ns |
1.02 |
This comment was automatically generated by workflow using github-action-benchmark.
|
@maleadt, is there a way to make it pass the buildkite/cuda-dot-jl check? |
|
Failures seem related. Sorting NTuples from Base is only available as of Julia 1.12 |
Thanks! Great catch. So I will fix this and push again. |
That seems to have worked but other packages (sparse) seem to fail using the CUDA 13.2 configuration. |
In a use case, a problem with FFTs showed up that had previously surfaced a number of times. This led to a reworking of the FFT implementations. Here is the example that caused massive problems:
With the previous code, the latter command (the y-direction FFT) is insanely slow (~15 seconds), as opposed to 0.1 sec for the x-direction.
The reason is the for loop over the "trailing" batch dimensions and the resulting ~1 million kernel launches, which is 100x slower than internal batching. I tried some approaches using streams, but without success, which led to rewriting the planning algorithm.
I find it quite essential to have a decently fast y-transform, since in many implementations of ND-FFTs (or, in our case, the CZT algorithm in
FourierTools.jl) transforms are consecutively applied along the various directions. In addition, there previously were unnecessary restrictions on the possible combinations of transform directions, and the dimensions always had to be ordered from low to high.
In the current version:
`Real` input types, in which the external batch dimensions lead to an `idist` stride of uneven size, require the application of two in-place `circshift!` calls on the input array, batching over every second element. Maybe this problem, which throws an alignment error, existed even previously but never showed up or was never reported.

P.S.: Sorry for having updated the dependencies without proper testing of the whole
CUDA.jl. You can simply revert the changes to `Project.toml`.