diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index c944641..1f9e012 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -13,7 +13,7 @@ jobs: fail-fast: false matrix: version: - - '1.1' + - 'min' - '1' # - 'nightly' os: @@ -22,7 +22,7 @@ jobs: - x64 steps: - uses: actions/checkout@v2 - - uses: julia-actions/setup-julia@v1 + - uses: julia-actions/setup-julia@v2 with: version: ${{ matrix.version }} arch: ${{ matrix.arch }} @@ -41,6 +41,7 @@ jobs: - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 - uses: julia-actions/julia-processcoverage@v1 - - uses: codecov/codecov-action@v1 + - uses: codecov/codecov-action@v5 with: file: lcov.info + token: ${{ secrets.CODECOV_TOKEN }} diff --git a/.github/workflows/TagBot.yml b/.github/workflows/TagBot.yml index 9880dd0..aabc045 100644 --- a/.github/workflows/TagBot.yml +++ b/.github/workflows/TagBot.yml @@ -28,8 +28,5 @@ jobs: steps: - uses: JuliaRegistries/TagBot@v1 with: - token: ${{ secrets.GITHUB_TOKEN }} - # Edit the following line to reflect the actual name of the GitHub Secret containing your private key - ssh: ${{ secrets.DOCUMENTER_KEY }} - # ssh: ${{ secrets.NAME_OF_MY_SSH_PRIVATE_KEY_SECRET }} + token: ${{ secrets.TAGBOT_TOKEN }} registry: HolyLab/HolyLabRegistry \ No newline at end of file diff --git a/Project.toml b/Project.toml index f6c4e53..af92025 100644 --- a/Project.toml +++ b/Project.toml @@ -15,9 +15,9 @@ SharedArrays = "1a1011a3-84de-559e-8e89-a11a2f7dc383" CUDA = "3,4,5" ImageCore = "0.8.1, 0.9, 0.10" Primes = "0.4, 0.5" -RegisterCore = "0.2" -RegisterMismatchCommon = "0.2.2" -julia = "1" +RegisterCore = "0.2, 1" +RegisterMismatchCommon = "0.2.2, 1" +julia = "1.10" [extras] CenterIndexedArrays = "46a7138f-0d70-54e1-8ada-fb8296f91f24" diff --git a/src/RegisterMismatchCuda.jl b/src/RegisterMismatchCuda.jl index d4abd85..db93d65 100644 --- a/src/RegisterMismatchCuda.jl +++ b/src/RegisterMismatchCuda.jl @@ -35,38 +35,38 @@ The major types and functions exported are: """ RegisterMismatchCuda -mutable struct CuRCpair{T<:AbstractFloat,N} - R::CuArray{T,N} - C::CuArray{Complex{T},N} +mutable struct CuRCpair{T <: AbstractFloat, N} + R::CuArray{T, N} + C::CuArray{Complex{T}, N} rng::NTuple{N} end -function CuRCpair{T}(::UndefInitializer, realsize::Dims{N}) where {T<:AbstractFloat,N} +function CuRCpair{T}(::UndefInitializer, realsize::Dims{N}) where {T <: AbstractFloat, N} csize = [realsize...] - csize[1] = realsize[1]>>1 + 1 - C = CuArray{Complex{T},N}(undef, csize...) + csize[1] = realsize[1] >> 1 + 1 + C = CuArray{Complex{T}, N}(undef, csize...) csize[1] *= 2 R = reshape(reinterpret(T, vec(C)), (csize...,)::Dims{N}) - rng = map(n->1:n,realsize) - CuRCpair{T,N}(R, C, rng) + rng = map(n -> 1:n, realsize) + return CuRCpair{T, N}(R, C, rng) end -function CuRCpair(A::Array{T}) where {T<:AbstractFloat} +function CuRCpair(A::Array{T}) where {T <: AbstractFloat} P = CuRCpair(eltype(A), size(A)) copyto!(P.R, A) - P + return P end -function plan_fft_pair(P::CuRCpair{T,N}) where {T,N} +function plan_fft_pair(P::CuRCpair{T, N}) where {T, N} fwd = CUFFT.plan_rfft(P.R[P.rng...]) inv = CUFFT.plan_inv(fwd) - fwd, inv + return fwd, inv end -mutable struct NanCorrFFTs{T<:AbstractFloat,N} - I0::CuRCpair{T,N} - I1::CuRCpair{T,N} - I2::CuRCpair{T,N} +mutable struct NanCorrFFTs{T <: AbstractFloat, N} + I0::CuRCpair{T, N} + I1::CuRCpair{T, N} + I2::CuRCpair{T, N} end """ @@ -75,51 +75,51 @@ end Prepare for FFT-based mismatch computations over domains of size `aperture_width`, computing the mismatch up to shifts of size `maxshift`. """ -mutable struct CMStorage{T<:AbstractFloat,N} +mutable struct CMStorage{T <: AbstractFloat, N} aperture_width::Vector{Float64} maxshift::Vector{Int} getindexes::Vector{UnitRange{Int}} # indexes for pulling padded data, in source-coordinates setindexes::Vector{UnitRange{Int}} # indexes for pushing fixed data, in source-coordinates - fixed::NanCorrFFTs{T,N} - moving::NanCorrFFTs{T,N} - num::CuRCpair{T,N} - denom::CuRCpair{T,N} - numhost::Array{T,N} - denomhost::Array{T,N} + fixed::NanCorrFFTs{T, N} + moving::NanCorrFFTs{T, N} + num::CuRCpair{T, N} + denom::CuRCpair{T, N} + numhost::Array{T, N} + denomhost::Array{T, N} # the next two store the result of calling plan_fft! and plan_ifft! fftfunc::Any ifftfunc::Any shiftindices::Vector{Vector{Int}} # indices for performing fftshift & snipping from -maxshift:maxshift end -function CMStorage{T,N}(::UndefInitializer, aperture_width::NTuple{N,<:Real}, maxshift::Dims{N}) where {T<:AbstractFloat,N} - blocksize = map(x->ceil(Int,x), aperture_width) +function CMStorage{T, N}(::UndefInitializer, aperture_width::NTuple{N, <:Real}, maxshift::Dims{N}) where {T <: AbstractFloat, N} + blocksize = map(x -> ceil(Int, x), aperture_width) padsz = padsize(blocksize, maxshift) getindexes = padranges(blocksize, maxshift) - setindexes = UnitRange{Int}[(1:blocksize[i]).+maxshift[i] for i = 1:length(blocksize)] - fixed = NanCorrFFTs(CuRCpair{T}(undef, padsz), CuRCpair{T}(undef, padsz), CuRCpair{T}(undef, padsz)) + setindexes = UnitRange{Int}[(1:blocksize[i]) .+ maxshift[i] for i in 1:length(blocksize)] + fixed = NanCorrFFTs(CuRCpair{T}(undef, padsz), CuRCpair{T}(undef, padsz), CuRCpair{T}(undef, padsz)) moving = NanCorrFFTs(CuRCpair{T}(undef, padsz), CuRCpair{T}(undef, padsz), CuRCpair{T}(undef, padsz)) num = CuRCpair{T}(undef, padsz) denom = CuRCpair{T}(undef, padsz) - mmsz = map(x->2x+1, (maxshift...,)) + mmsz = map(x -> 2x + 1, (maxshift...,)) numhost, denomhost = Array{T}(undef, mmsz), Array{T}(undef, mmsz) fftfunc, ifftfunc = plan_fft_pair(num) maxshiftv = [maxshift...] - shiftindices = Vector{Int}[ [padsz[i].+(-maxshift[i]+1:0); 1:maxshift[i]+1] for i = 1:length(maxshift) ] - CMStorage{T,N}(Float64[aperture_width...], maxshiftv, getindexes, setindexes, fixed, moving, num, denom, numhost, denomhost, fftfunc, ifftfunc, shiftindices) + shiftindices = Vector{Int}[ [padsz[i] .+ ((-maxshift[i] + 1):0); 1:(maxshift[i] + 1)] for i in 1:length(maxshift) ] + return CMStorage{T, N}(Float64[aperture_width...], maxshiftv, getindexes, setindexes, fixed, moving, num, denom, numhost, denomhost, fftfunc, ifftfunc, shiftindices) end -CMStorage{T}(::UndefInitializer, aperture_width::NTuple{N,<:Real}, maxshift::Dims{N}) where {T<:AbstractFloat,N} = - CMStorage{T,N}(undef, aperture_width, maxshift) +CMStorage{T}(::UndefInitializer, aperture_width::NTuple{N, <:Real}, maxshift::Dims{N}) where {T <: AbstractFloat, N} = + CMStorage{T, N}(undef, aperture_width, maxshift) # Note: display doesn't do anything, but here for compatibility with the CPU version -CMStorage{T}(::UndefInitializer, blocksize::NTuple{N,<:Real}, maxshift::Dims{N}; display=false) where {T<:Real,N} = CMStorage{T}(undef, blocksize, maxshift) -CMStorage{T,N}(::UndefInitializer, blocksize::NTuple{N,<:Real}, maxshift::Dims{N}; display=false) where {T<:Real,N} = CMStorage{T,N}(undef, blocksize, maxshift) +CMStorage{T}(::UndefInitializer, blocksize::NTuple{N, <:Real}, maxshift::Dims{N}; display = false) where {T <: Real, N} = CMStorage{T}(undef, blocksize, maxshift) +CMStorage{T, N}(::UndefInitializer, blocksize::NTuple{N, <:Real}, maxshift::Dims{N}; display = false) where {T <: Real, N} = CMStorage{T, N}(undef, blocksize, maxshift) CUDA.context(cms::CMStorage) = context(cms.num.C) -eltype(cms::CMStorage{T,N}) where {T,N} = T - ndims(cms::CMStorage{T,N}) where {T,N} = N +eltype(cms::CMStorage{T, N}) where {T, N} = T +ndims(cms::CMStorage{T, N}) where {T, N} = N # Some tools from Images sdims(A::CuArray) = ndims(A) @@ -140,22 +140,22 @@ normalization scheme (`:intensity` or `:pixels`). This operation is synchronous with respect to the host. """ -function mismatch(::Type{T}, fixed::AbstractArray, moving::AbstractArray, maxshift::DimsLike; normalization = :intensity) where T<:Real +function mismatch(::Type{T}, fixed::AbstractArray, moving::AbstractArray, maxshift::DimsLike; normalization = :intensity) where {T <: Real} assertsamesize(fixed, moving) - d_fixed = CuArray{T}(fixed) + d_fixed = CuArray{T}(fixed) d_moving = CuArray{T}(moving) - mm = mismatch(d_fixed, d_moving, maxshift, normalization=normalization) - mm + mm = mismatch(d_fixed, d_moving, maxshift, normalization = normalization) + return mm end -function mismatch(fixed::CuArray{T}, moving::CuArray{T}, maxshift::DimsLike; normalization = :intensity) where T +function mismatch(fixed::CuArray{T}, moving::CuArray{T}, maxshift::DimsLike; normalization = :intensity) where {T} assertsamesize(fixed, moving) nd = ndims(fixed) cms = CMStorage{T}(undef, size(fixed), maxshift) mm = MismatchArray(T, (2 .* maxshift .+ 1)...) fillfixed!(cms, fixed) - mismatch!(mm, cms, moving, normalization=normalization) - mm + mismatch!(mm, cms, moving, normalization = normalization) + return mm end """ @@ -183,39 +183,43 @@ in a rectangular grid, you can use an `N`-dimensional array-of-tuples (or array-of-vectors) or an `N+1`-dimensional array with the center positions specified along the first dimension. See `aperture_grid`. """ -function mismatch_apertures(::Type{T}, - fixed::AbstractArray, - moving::AbstractArray, - aperture_centers::AbstractArray, - aperture_width::WidthLike, - maxshift::DimsLike; - kwargs...) where T +function mismatch_apertures( + ::Type{T}, + fixed::AbstractArray, + moving::AbstractArray, + aperture_centers::AbstractArray, + aperture_width::WidthLike, + maxshift::DimsLike; + kwargs... + ) where {T} assertsamesize(fixed, moving) - d_fixed = CuArray{T}(sdata(fixed)) + d_fixed = CuArray{T}(sdata(fixed)) d_moving = CuArray{T}(moving) mms = mismatch_apertures(d_fixed, d_moving, aperture_centers, aperture_width, maxshift; kwargs...) - mms + return mms end # only difference here relative to RegisterMismatch is the lack of the # FFTW keywords -function mismatch_apertures(fixed::CuArray{T}, - moving::CuArray, - aperture_centers::AbstractArray, - aperture_width::WidthLike, - maxshift::DimsLike; - normalization = :pixels, - kwargs...) where T +function mismatch_apertures( + fixed::CuArray{T}, + moving::CuArray, + aperture_centers::AbstractArray, + aperture_width::WidthLike, + maxshift::DimsLike; + normalization = :pixels, + kwargs... + ) where {T} nd = sdims(fixed) - assertsamesize(fixed,moving) + assertsamesize(fixed, moving) (length(aperture_width) == nd && length(maxshift) == nd) || error("Dimensionality mismatch") mms = allocate_mmarrays(T, aperture_centers, maxshift) cms = CMStorage{T}(undef, aperture_width, maxshift; kwargs...) - mismatch_apertures!(mms, fixed, moving, aperture_centers, cms; normalization=normalization) - mms + mismatch_apertures!(mms, fixed, moving, aperture_centers, cms; normalization = normalization) + return mms end -function fillfixed!(cms::CMStorage{T}, fixed::CuArray; f_indexes = ntuple(i->1:size(fixed,i), ndims(fixed))) where T +function fillfixed!(cms::CMStorage{T}, fixed::CuArray; f_indexes = ntuple(i -> 1:size(fixed, i), ndims(fixed))) where {T} ctx = context(cms) context(fixed) == ctx || error("Fixed and cms must be on the same context") nd = ndims(cms) @@ -227,17 +231,17 @@ function fillfixed!(cms::CMStorage{T}, fixed::CuArray; f_indexes = ntuple(i->1:s fill!(paddedf, NaN) dstindexes = Array{UnitRange{Int}}(undef, nd) srcindexes = Array{UnitRange{Int}}(undef, nd) - for idim = 1:nd + for idim in 1:nd tmp = f_indexes[idim] - i1 = first(tmp) >= 1 ? 1 : 2-first(tmp) - i2 = last(tmp) <= size(fixed, idim) ? length(tmp) : length(tmp)-(last(tmp)-size(fixed, idim)) + i1 = first(tmp) >= 1 ? 1 : 2 - first(tmp) + i2 = last(tmp) <= size(fixed, idim) ? length(tmp) : length(tmp) - (last(tmp) - size(fixed, idim)) srcindexes[idim] = tmp[i1]:tmp[i2] dstindexes[idim] = cms.setindexes[idim][i1]:cms.setindexes[idim][i2] end copyto!(paddedf, CartesianIndices(Tuple(dstindexes)), fixed, CartesianIndices(Tuple(srcindexes))) #This conversion may be inefficient. # Prepare the components of the convolution - threadspb = calculate_threads(size(paddedf), attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)÷2) - nblocks = ceil.(Int, size(paddedf)./ threadspb) + threadspb = calculate_threads(size(paddedf), attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) ÷ 2) + nblocks = ceil.(Int, size(paddedf) ./ threadspb) @cuda blocks = nblocks threads = threadspb kernel_conv_components!(paddedf, cms.fixed.I2.R, cms.fixed.I0.R) synchronize() # Compute FFTs @@ -245,7 +249,7 @@ function fillfixed!(cms::CMStorage{T}, fixed::CuArray; f_indexes = ntuple(i->1:s for item in (obj.I0, obj.I1, obj.I2) copyto!(item.C, cms.fftfunc * item.R[item.rng...]) end - obj + return obj end """ @@ -253,7 +257,7 @@ end computes the mismatch as a function of shift, storing the result in `mm`. The `fixed` image has been prepared in `cms`, a `CMStorage` object. """ -function mismatch!(mm::MismatchArray, cms::CMStorage{T}, moving::CuArray; normalization = :intensity, m_offset = ntuple(i->0, ndims(cms))) where T +function mismatch!(mm::MismatchArray, cms::CMStorage{T}, moving::CuArray; normalization = :intensity, m_offset = ntuple(i -> 0, ndims(cms))) where {T} ctx = context(cms) context(moving) == ctx || error("Fixed and cms must be on the same context") activate(ctx) @@ -261,10 +265,10 @@ function mismatch!(mm::MismatchArray, cms::CMStorage{T}, moving::CuArray; normal checksize_maxshift(mm, cms.maxshift) nd = ndims(cms) paddedm = cms.moving.I1.R - get!(paddedm, moving, ntuple(d->cms.getindexes[d].+m_offset[d], nd), T(NaN)) + get!(paddedm, moving, ntuple(d -> cms.getindexes[d] .+ m_offset[d], nd), T(NaN)) # Prepare the components of the convolution - threadspb = calculate_threads(size(paddedm), attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)÷2) - nblocks = ceil.(Int, size(paddedm)./ threadspb) + threadspb = calculate_threads(size(paddedm), attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) ÷ 2) + nblocks = ceil.(Int, size(paddedm) ./ threadspb) @cuda blocks = nblocks threads = threadspb kernel_conv_components!(paddedm, cms.moving.I2.R, cms.moving.I0.R) synchronize() # Compute FFTs @@ -275,11 +279,13 @@ function mismatch!(mm::MismatchArray, cms::CMStorage{T}, moving::CuArray; normal # Perform the convolution in fourier space d_numC = cms.num.C d_denomC = cms.denom.C - args = (cms.fixed.I1.C, cms.fixed.I2.C, cms.fixed.I0.C, - cms.moving.I1.C, cms.moving.I2.C, cms.moving.I0.C, - cms.num.C, cms.denom.C) - threadspb = calculate_threads(size(d_numC), attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK)÷2) - nblocks = ceil.(Int, size(d_numC)./ threadspb) + args = ( + cms.fixed.I1.C, cms.fixed.I2.C, cms.fixed.I0.C, + cms.moving.I1.C, cms.moving.I2.C, cms.moving.I0.C, + cms.num.C, cms.denom.C, + ) + threadspb = calculate_threads(size(d_numC), attribute(dev, CUDA.DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK) ÷ 2) + nblocks = ceil.(Int, size(d_numC) ./ threadspb) if normalization == :intensity @cuda blocks = nblocks threads = threadspb kernel_calcNumDenom_intensity!(args...) elseif normalization == :pixels @@ -292,11 +298,11 @@ function mismatch!(mm::MismatchArray, cms::CMStorage{T}, moving::CuArray; normal # Compute the IFFTs d_num = cms.num.R d_denom = cms.denom.R - rng = CartesianIndices(cms.num.rng); + rng = CartesianIndices(cms.num.rng) copyto!(d_num, rng, cms.ifftfunc * d_numC, rng) copyto!(d_denom, rng, cms.ifftfunc * d_denomC, rng) # Copy result to host - copyto!(mm, (view(Array(d_num), cms.shiftindices...), view(Array(d_denom), cms.shiftindices...))) + return copyto!(mm, (view(Array(d_num), cms.shiftindices...), view(Array(d_denom), cms.shiftindices...))) end """ @@ -308,36 +314,36 @@ in `cms`, a `CMStorage` object. The results are stored in `mms`, an Array-of-MismatchArrays which must have length equal to the number of aperture centers. """ -function mismatch_apertures!(mms, fixed, moving, aperture_centers, cms; normalization=:pixels) +function mismatch_apertures!(mms, fixed, moving, aperture_centers, cms; normalization = :pixels) assertsamesize(fixed, moving) N = ndims(cms) - for (mm,center) in zip(mms, each_point(aperture_centers)) + for (mm, center) in zip(mms, each_point(aperture_centers)) rng = aperture_range(center, cms.aperture_width) - fillfixed!(cms, fixed; f_indexes=rng) - offset = [first(rng[d])-1 for d = 1:N] - mismatch!(mm, cms, moving; normalization=normalization, m_offset=offset) + fillfixed!(cms, fixed; f_indexes = rng) + offset = [first(rng[d]) - 1 for d in 1:N] + mismatch!(mm, cms, moving; normalization = normalization, m_offset = offset) end synchronize() - mms + return mms end ### Utilities function assertsamesize(A::AbstractArray, B::AbstractArray) - size(A,1) == size(B,1) && size(A,2) == size(B,2) && size(A,3) == size(B,3) || error("Arrays are not the same size") + return size(A, 1) == size(B, 1) && size(A, 2) == size(B, 2) && size(A, 3) == size(B, 3) || error("Arrays are not the same size") end ### Deprecations -@deprecate CuRCpair(realtype::Type{T}, realsize) where {T<:AbstractFloat} CuRCpair{T}(undef, realsize) +@deprecate CuRCpair(realtype::Type{T}, realsize) where {T <: AbstractFloat} CuRCpair{T}(undef, realsize) -function CMStorage{T}(::UndefInitializer, aperture_width::WidthLike, maxshift::DimsLike; kwargs...) where {T<:Real} +function CMStorage{T}(::UndefInitializer, aperture_width::WidthLike, maxshift::DimsLike; kwargs...) where {T <: Real} Base.depwarn("CMStorage with aperture_width::$(typeof(aperture_width)) and maxshift::$(typeof(maxshift)) is deprecated, use tuples instead", :CMStorage) (N = length(aperture_width)) == length(maxshift) || error("Dimensionality mismatch") - return CMStorage{T,N}(undef, (aperture_width...,), (maxshift...,); kwargs...) + return CMStorage{T, N}(undef, (aperture_width...,), (maxshift...,); kwargs...) end -@deprecate CMStorage(::Type{T}, aperture_width::WidthLike, maxshift::DimsLike; kwargs...) where {T<:AbstractFloat} CMStorage{T}(undef, aperture_width, maxshift; kwargs...) +@deprecate CMStorage(::Type{T}, aperture_width::WidthLike, maxshift::DimsLike; kwargs...) where {T <: AbstractFloat} CMStorage{T}(undef, aperture_width, maxshift; kwargs...) end # module diff --git a/src/kernels.jl b/src/kernels.jl index 2af71ee..e650269 100644 --- a/src/kernels.jl +++ b/src/kernels.jl @@ -1,17 +1,21 @@ -cucartesianindex(a::AbstractVector) = CartesianIndex((blockIdx().x-1) * blockDim().x + threadIdx().x) -cucartesianindex(a::AbstractMatrix) = CartesianIndex((blockIdx().x-1) * blockDim().x + threadIdx().x, - (blockIdx().y-1) * blockDim().y + threadIdx().y) -cucartesianindex(a::AbstractArray{<:Any,3}) = CartesianIndex((blockIdx().x-1) * blockDim().x + threadIdx().x, - (blockIdx().y-1) * blockDim().y + threadIdx().y, - (blockIdx().z-1) * blockDim().z + threadIdx().z) +cucartesianindex(a::AbstractVector) = CartesianIndex((blockIdx().x - 1) * blockDim().x + threadIdx().x) +cucartesianindex(a::AbstractMatrix) = CartesianIndex( + (blockIdx().x - 1) * blockDim().x + threadIdx().x, + (blockIdx().y - 1) * blockDim().y + threadIdx().y +) +cucartesianindex(a::AbstractArray{<:Any, 3}) = CartesianIndex( + (blockIdx().x - 1) * blockDim().x + threadIdx().x, + (blockIdx().y - 1) * blockDim().y + threadIdx().y, + (blockIdx().z - 1) * blockDim().z + threadIdx().z +) -function calculate_threads(sz::Dims{N}, maxthreadspb) where N +function calculate_threads(sz::Dims{N}, maxthreadspb) where {N} threads = ones(Int, N) L = prod(sz) iterdim = Iterators.cycle(1:N) dim, state = iterate(iterdim) for (b, p) in factor(maxthreadspb) - for i = 1:p + for i in 1:p P = prod(threads) P >= L && return (threads...,)::Dims{N} threads[dim] *= b @@ -25,7 +29,7 @@ end Pitch must be expressed in elements, not bytes! width (x) corresponds to the fastest dimension (the same as pitch), depth (z) to the slowest The array-dimension parameters must be the same for all 3 arrays. =# -function kernel_conv_components!(A::T, A2::T, thetaA::T) where T +function kernel_conv_components!(A::T, A2::T, thetaA::T) where {T} Tel = eltype(T) i = cucartesianindex(A) if checkbounds(Bool, A, i) @@ -36,15 +40,17 @@ function kernel_conv_components!(A::T, A2::T, thetaA::T) where T A[i] = local_A end thetaA[i] = local_thetaA ? Tel(0) : Tel(1) - A2[i] = local_A * local_A; + A2[i] = local_A * local_A end return nothing end # CUDA kernel_calcNumDenom: case INTENSITY, compute numerator and denominator before ifftn -function kernel_calcNumDenom_intensity!(f_fft::T, f2_fft::T, thetaf_fft::T, - m_fft::T, m2_fft::T, thetam_fft::T, - numerator_fft::T, denominator_fft::T) where T +function kernel_calcNumDenom_intensity!( + f_fft::T, f2_fft::T, thetaf_fft::T, + m_fft::T, m2_fft::T, thetam_fft::T, + numerator_fft::T, denominator_fft::T + ) where {T} i = cucartesianindex(f_fft) if checkbounds(Bool, f_fft, i) c1 = f2_fft[i]' * thetam_fft[i] @@ -59,18 +65,20 @@ function kernel_calcNumDenom_intensity!(f_fft::T, f2_fft::T, thetaf_fft::T, end # CUDA kernel_calcNumDenom: case PIXELS, compute numerator and denominator before ifftn -function kernel_calcNumDenom_pixels!(f_fft::T, f2_fft::T, thetaf_fft::T, - m_fft::T, m2_fft::T, thetam_fft::T, - numerator_fft::T, denominator_fft::T) where T +function kernel_calcNumDenom_pixels!( + f_fft::T, f2_fft::T, thetaf_fft::T, + m_fft::T, m2_fft::T, thetam_fft::T, + numerator_fft::T, denominator_fft::T + ) where {T} i = cucartesianindex(f_fft) if checkbounds(Bool, f_fft, i) - c1 = -2 * f_fft[i]' - c1 = c1 * m_fft[i] - c2 = thetaf_fft[i]' * m2_fft[i] - c1 = c1 + c2 - c2 = f2_fft[i]' * thetam_fft[i] - numerator_fft[i] = c1 + c2; - denominator_fft[i] = thetaf_fft[i]' * thetam_fft[i] + c1 = -2 * f_fft[i]' + c1 = c1 * m_fft[i] + c2 = thetaf_fft[i]' * m2_fft[i] + c1 = c1 + c2 + c2 = f2_fft[i]' * thetam_fft[i] + numerator_fft[i] = c1 + c2 + denominator_fft[i] = thetaf_fft[i]' * thetam_fft[i] end return nothing end diff --git a/test/runtests.jl b/test/runtests.jl index c6b816b..2fcde5d 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,12 +3,18 @@ using CUDA, RegisterCore, CenterIndexedArrays import RegisterMismatchCuda RM = RegisterMismatchCuda -accuracy = 1e-5 +accuracy = 1.0e-5 + +if !CUDA.functional() + @warn "No functional CUDA device found; skipping tests." + exit(0) +end devlist = CuDevice[] -map(dev->capability(dev) >= v"2.0" ? push!(devlist,dev) : nothing, devices()) +map(dev -> capability(dev) >= v"2.0" ? push!(devlist, dev) : nothing, devices()) if isempty(devlist) - error("There is no CUDA device having capability bigger than version 2.0.") + @warn "No CUDA device with capability >= 2.0 found; skipping tests." + exit(0) end @testset "kernel_conv_components" begin @@ -45,16 +51,16 @@ end end @testset "kernel_calcNumDenom" begin - function run_cpu(f::Function, f_fft, f2_fft, thetaf_fft, m_fft,m2_fft, thetam_fft) + function run_cpu(f::Function, f_fft, f2_fft, thetaf_fft, m_fft, m2_fft, thetam_fft) numerator_fft = similar(f_fft) denominator_fft = similar(f_fft) for i in CartesianIndices(f_fft) - f(i, f_fft, f2_fft, thetaf_fft, m_fft,m2_fft, thetam_fft, numerator_fft, denominator_fft) + f(i, f_fft, f2_fft, thetaf_fft, m_fft, m2_fft, thetam_fft, numerator_fft, denominator_fft) end numerator_fft, denominator_fft end - function run_gpu(kernel::Function, f_fft, f2_fft, thetaf_fft, m_fft,m2_fft, thetam_fft) + function run_gpu(kernel::Function, f_fft, f2_fft, thetaf_fft, m_fft, m2_fft, thetam_fft) d_f = CuArray(f_fft) d_f2 = CuArray(f2_fft) d_tf = CuArray(thetaf_fft) @@ -73,7 +79,7 @@ end Array(d_numerator), Array(d_denominator) end - function calcNumDenom_intensity!(i, f_fft, f2_fft, thetaf_fft, m_fft,m2_fft, thetam_fft, numerator_fft, denominator_fft) + function calcNumDenom_intensity!(i, f_fft, f2_fft, thetaf_fft, m_fft, m2_fft, thetam_fft, numerator_fft, denominator_fft) if checkbounds(Bool, f_fft, i) c1 = f2_fft[i]' * thetam_fft[i] c2 = thetaf_fft[i]' * m2_fft[i] @@ -85,34 +91,34 @@ end end end - function calcNumDenom_pixels!(i, f_fft, f2_fft, thetaf_fft, m_fft,m2_fft, thetam_fft, numerator_fft, denominator_fft) + function calcNumDenom_pixels!(i, f_fft, f2_fft, thetaf_fft, m_fft, m2_fft, thetam_fft, numerator_fft, denominator_fft) if checkbounds(Bool, f_fft, i) c1 = -2 * f_fft[i]' c1 = c1 * m_fft[i] c2 = thetaf_fft[i]' * m2_fft[i] c1 = c1 + c2 c2 = f2_fft[i]' * thetam_fft[i] - numerator_fft[i] = c1 + c2; + numerator_fft[i] = c1 + c2 denominator_fft[i] = thetaf_fft[i]' * thetam_fft[i] end end for dev in devlist device!(dev) do - A = rand(Complex{Float64},3,3) - B = rand(Complex{Float64},3,3) - C = rand(Complex{Float64},3,3) - D = rand(Complex{Float64},3,3) - E = rand(Complex{Float64},3,3) - F = rand(Complex{Float64},3,3) - num_cpu, denom_cpu = run_cpu(calcNumDenom_intensity!,A,B,C,D,E,F) - num_gpu, denom_gpu = run_gpu(RM.kernel_calcNumDenom_intensity!,A,B,C,D,E,F) - @test ≈(num_cpu, num_gpu, atol=accuracy) - @test ≈(denom_cpu, denom_gpu, atol=accuracy) - num_cpu, denom_cpu = run_cpu(calcNumDenom_pixels!,A,B,C,D,E,F) - num_gpu, denom_gpu = run_gpu(RM.kernel_calcNumDenom_pixels!,A,B,C,D,E,F) - @test ≈(num_cpu, num_gpu, atol=accuracy) - @test ≈(denom_cpu, denom_gpu, atol=accuracy) + A = rand(Complex{Float64}, 3, 3) + B = rand(Complex{Float64}, 3, 3) + C = rand(Complex{Float64}, 3, 3) + D = rand(Complex{Float64}, 3, 3) + E = rand(Complex{Float64}, 3, 3) + F = rand(Complex{Float64}, 3, 3) + num_cpu, denom_cpu = run_cpu(calcNumDenom_intensity!, A, B, C, D, E, F) + num_gpu, denom_gpu = run_gpu(RM.kernel_calcNumDenom_intensity!, A, B, C, D, E, F) + @test ≈(num_cpu, num_gpu, atol = accuracy) + @test ≈(denom_cpu, denom_gpu, atol = accuracy) + num_cpu, denom_cpu = run_cpu(calcNumDenom_pixels!, A, B, C, D, E, F) + num_gpu, denom_gpu = run_gpu(RM.kernel_calcNumDenom_pixels!, A, B, C, D, E, F) + @test ≈(num_cpu, num_gpu, atol = accuracy) + @test ≈(denom_cpu, denom_gpu, atol = accuracy) end end end @@ -120,18 +126,18 @@ end @testset "Mismatch 2D test" begin for dev in devlist device!(dev) do - A = zeros(5,5) - A[3,3] = 3 - B = zeros(5,5) - B[4,5] = 3 - maxshift = (2,3) + A = zeros(5, 5) + A[3, 3] = 3 + B = zeros(5, 5) + B[4, 5] = 3 + maxshift = (2, 3) mm = RM.mismatch(A, A, maxshift) num, denom = RM.separate(mm) RM.truncatenoise!(mm, 0.01) - @test indmin_mismatch(mm, 0.01) == CartesianIndex((0,0)) + @test argmin_mismatch(mm, 0.01) == CartesianIndex((0, 0)) mm = RM.mismatch(A, B, maxshift) RM.truncatenoise!(mm, 0.01) - @test indmin_mismatch(mm, 0.01) == CartesianIndex((1,2)) + @test argmin_mismatch(mm, 0.01) == CartesianIndex((1, 2)) # Testing on more complex objects # img = rand(map(UInt8,0:255), 256, 256) @@ -139,13 +145,13 @@ end # fixed = map(Float32, img[rng...]) # moving = map(Float32, img[rng[1].+13, rng[2].-8]) # maxshift = (20, 20) - img = rand(map(UInt8,0:255), 60, 60) + img = rand(map(UInt8, 0:255), 60, 60) rng = Any[1:40, 10:50] fixed = map(Float32, img[rng...]) - moving = map(Float32, img[rng[1].+6, rng[2].-8]) + moving = map(Float32, img[rng[1] .+ 6, rng[2] .- 8]) maxshift = (10, 10) mm = RM.mismatch(fixed, moving, maxshift) - @test indmin_mismatch(mm, 0.01) == CartesianIndex((-6,8)) + @test argmin_mismatch(mm, 0.01) == CartesianIndex((-6, 8)) end end end @@ -154,19 +160,19 @@ end for dev in devlist device!(dev) do # Test 3d similarly - Apad = parent(padarray(reshape(1:80*6, 10, 8, 6), Fill(0, (4,3,2)))) - Bpad = parent(padarray(rand(1:80*6, 10, 8, 6), Fill(0, (4,3,2)))) - mm = RM.mismatch(Apad, Bpad, (4,3,2)) + Apad = parent(padarray(reshape(1:(80 * 6), 10, 8, 6), Fill(0, (4, 3, 2)))) + Bpad = parent(padarray(rand(1:(80 * 6), 10, 8, 6), Fill(0, (4, 3, 2)))) + mm = RM.mismatch(Apad, Bpad, (4, 3, 2)) num, denom = RegisterCore.separate(mm) mmref = CenterIndexedArray{Float64}(undef, 9, 7, 5) - for k=-2:2, j = -3:3, i = -4:4 - Bshift = circshift(Bpad,-[i,j,k]) - df = Apad-Bshift - mmref[i,j,k] = sum(df.^2) + for k in -2:2, j in -3:3, i in -4:4 + Bshift = circshift(Bpad, -[i, j, k]) + df = Apad - Bshift + mmref[i, j, k] = sum(df .^ 2) end - nrm = sum(Apad.^2)+sum(Bpad.^2) - @test ≈(mmref.data, num.data, atol=accuracy*nrm) - @test ≈(fill(nrm,size(denom)), denom.data, atol=accuracy*nrm) + nrm = sum(Apad .^ 2) + sum(Bpad .^ 2) + @test ≈(mmref.data, num.data, atol = accuracy * nrm) + @test ≈(fill(nrm, size(denom)), denom.data, atol = accuracy * nrm) end end end @@ -174,31 +180,31 @@ end @testset "Mismatch apertures 2D test" begin for dev in devlist device!(dev) do - for imsz in ((15,16), (14,17)) - for maxshift in ((4,3), (3,2)) - for gridsize in ((3,3), (2,1), (2,3), (2,2), (1,3)) + for imsz in ((15, 16), (14, 17)) + for maxshift in ((4, 3), (3, 2)) + for gridsize in ((3, 3), (2, 1), (2, 3), (2, 2), (1, 3)) Apad = parent(padarray(reshape(1:prod(imsz), imsz[1], imsz[2]), Fill(0, maxshift, maxshift))) Bpad = parent(padarray(rand(1:20, imsz[1], imsz[2]), Fill(0, maxshift, maxshift))) # intensity normalization - mms = RM.mismatch_apertures(Float64, Apad, Bpad, gridsize, maxshift, normalization=:intensity, display=false) + mms = RM.mismatch_apertures(Float64, Apad, Bpad, gridsize, maxshift, normalization = :intensity, display = false) nums, denoms = RegisterCore.separate(mms) num = sum(nums) denom = sum(denoms) mm = CenterIndexedArray{Float64}(undef, (2 .* maxshift .+ 1)...) - for j = -maxshift[2]:maxshift[2], i = -maxshift[1]:maxshift[1] - Bshift = circshift(Bpad,-[i,j]) - df = Apad-Bshift - mm[i,j] = sum(df.^2) + for j in -maxshift[2]:maxshift[2], i in -maxshift[1]:maxshift[1] + Bshift = circshift(Bpad, -[i, j]) + df = Apad - Bshift + mm[i, j] = sum(df .^ 2) end - nrm = sum(Apad.^2)+sum(Bpad.^2) - @test ≈(mm.data, num.data, atol=accuracy*nrm) - @test ≈(fill(nrm,size(denom)), denom.data, atol=accuracy*nrm) + nrm = sum(Apad .^ 2) + sum(Bpad .^ 2) + @test ≈(mm.data, num.data, atol = accuracy * nrm) + @test ≈(fill(nrm, size(denom)), denom.data, atol = accuracy * nrm) # pixel normalization - mms = RM.mismatch_apertures(Float64, Apad, Bpad, gridsize, maxshift, normalization=:pixels, display=false) + mms = RM.mismatch_apertures(Float64, Apad, Bpad, gridsize, maxshift, normalization = :pixels, display = false) _, denoms = RegisterCore.separate(mms) denom = sum(denoms) - n = Vector{Int}[size(Apad,i).-abs.(-maxshift[i]:maxshift[i]) for i = 1:2] - @test ≈(denom.data, n[1].*n[2]', atol=accuracy*maximum(denom)) + n = Vector{Int}[size(Apad, i) .- abs.(-maxshift[i]:maxshift[i]) for i in 1:2] + @test ≈(denom.data, n[1] .* n[2]', atol = accuracy * maximum(denom)) end end end @@ -210,21 +216,21 @@ end for dev in devlist device!(dev) do # Test 3d similarly - Apad = parent(padarray(reshape(1:80*6, 10, 8, 6), Fill(0, (4,3,2)))) - Bpad = parent(padarray(rand(1:80*6, 10, 8, 6), Fill(0, (4,3,2)))) - mms = RM.mismatch_apertures(Apad, Bpad, (2,3,2), (4,3,2), normalization=:intensity, display=false) + Apad = parent(padarray(reshape(1:(80 * 6), 10, 8, 6), Fill(0, (4, 3, 2)))) + Bpad = parent(padarray(rand(1:(80 * 6), 10, 8, 6), Fill(0, (4, 3, 2)))) + mms = RM.mismatch_apertures(Apad, Bpad, (2, 3, 2), (4, 3, 2), normalization = :intensity, display = false) nums, denoms = RegisterCore.separate(mms) num = sum(nums) denom = sum(denoms) mmref = CenterIndexedArray{Float64}(undef, 9, 7, 5) - for k=-2:2, j = -3:3, i = -4:4 - Bshift = circshift(Bpad,-[i,j,k]) - df = Apad-Bshift - mmref[i,j,k] = sum(df.^2) + for k in -2:2, j in -3:3, i in -4:4 + Bshift = circshift(Bpad, -[i, j, k]) + df = Apad - Bshift + mmref[i, j, k] = sum(df .^ 2) end - nrm = sum(Apad.^2)+sum(Bpad.^2) - @test ≈(mmref.data, num.data, atol=accuracy*nrm) - @test ≈(fill(sum(Apad.^2)+sum(Bpad.^2),size(denom)), denom.data, atol=accuracy*nrm) + nrm = sum(Apad .^ 2) + sum(Bpad .^ 2) + @test ≈(mmref.data, num.data, atol = accuracy * nrm) + @test ≈(fill(sum(Apad .^ 2) + sum(Bpad .^ 2), size(denom)), denom.data, atol = accuracy * nrm) end end end