Slow when blocks and threads not specified

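Hello,

Here is an MWE following this post on the [forum](https://discourse.julialang.org/t/choosing-between-kernelabstractions-acceleratedkernels-parallelstencils-or-just-cuda-jl/135833/10). The same kernel is noticeably slower when `@parallel` is called without explicit blocks and threads (about 73 μs per call versus about 44 μs in the output below).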

```julia
using ParallelStencil
@init_parallel_stencil(CUDA, Float64, 2, inbounds=true)
import CUDA
using BenchmarkTools

# Grid extents along the first and second array dimension.
const Nx = 2048
const Ny = 512

# Two explicit launch configurations that are compared against the launch
# ParallelStencil chooses on its own.
#
# They are `const` because `time_step_a!` and `time_step_b!` read them as
# globals: a non-const global is untyped, so every call that uses it is
# dispatched dynamically on the host. That is likely the source of the extra
# host allocations reported for the explicit variants (31 per call versus 4 for
# the default) and it is unrelated to the kernel itself.
#
# `.÷` truncates, so the block counts cover the whole grid only when Nx and Ny
# are multiples of the corresponding thread-block extents (true for the values
# used here).
const threads_a = (16, 16)
const blocks_a = (Nx, Ny) .÷ threads_a
const threads_b = (32, 8)
const blocks_b = (Nx, Ny) .÷ threads_b

# Kernel over the index pair (i, j): scales ρ[i, j] in place by
# exp(-(j - 1) * 0.01 / 5), i.e. the factor depends only on the second index
# and equals 1 for j == 1.
# NOTE(review): "HSS" is not defined in this snippet — presumably a hydrostatic
# steady state; confirm with the author.
@parallel_indices (i, j) function initialize_to_HSS!(ρ)
    offset = j - 1
    ρ[i, j] = ρ[i, j] * exp(-offset * 0.01 / 5.0)
    return nothing
end

# Kernel over the index pair (i, j): copies ρ[i, j] into both slices
# σ[i, j, 1] and σ[i, j, 2]. The third index of σ is written explicitly here
# and is not one of the parallel indices.
@parallel_indices (i, j) function compute_stress!(σ, ρ)
    value = ρ[i, j]
    σ[i, j, 1] = value
    σ[i, j, 2] = value
    return nothing
end

# Launches `compute_stress!` without passing any launch configuration, so the
# number of blocks and threads is left to `@parallel`.
# NOTE(review): σ is three-dimensional (Nx, Ny, 2) while the kernel only uses
# the indices (i, j) — check whether the automatically derived launch also
# spans the third dimension of σ and therefore repeats the work; TODO confirm
# against the ParallelStencil documentation.
time_step!(ρ, σ) = @parallel compute_stress!(σ, ρ)

# Launches `compute_stress!` with the launch configuration taken from the
# globals `blocks_a` and `threads_a`.
time_step_a!(ρ, σ) = @parallel blocks_a threads_a compute_stress!(σ, ρ)

# Launches `compute_stress!` with the launch configuration taken from the
# globals `blocks_b` and `threads_b`.
time_step_b!(ρ, σ) = @parallel blocks_b threads_b compute_stress!(σ, ρ)

# Work arrays: ρ is Nx×Ny and starts as all ones, σ is Nx×Ny×2 and starts as
# all zeros. ρ is then rescaled once by the initialization kernel.
ρ = @ones(Nx, Ny)
σ = @zeros(Nx, Ny, 2)
@parallel initialize_to_HSS!(ρ)

# Run every variant 100 times before any measurement, synchronizing after each
# batch, so that first-call compilation does not end up in the timings below.
println(" Warm-up")
for step! in (time_step!, time_step_a!, time_step_b!)
    CUDA.@sync for _ in 1:100
        step!(ρ, σ)
    end
end

# Measurement 1: 1000 calls per variant, timed with CUDA.@time. Each loop is
# wrapped in CUDA.@sync, so the timed expression includes the synchronization
# that follows the loop. The three sections differ only in the variant called.
println("   CUDA.@time   ")
println(" With default threads and blocks")
# Variant that passes no launch configuration to @parallel.
CUDA.@time CUDA.@sync for _= 1:1000
    time_step!(ρ, σ)
end
println(" With thread = $threads_a , block = $blocks_a :")
# Variant using blocks_a / threads_a.
CUDA.@time CUDA.@sync for _= 1:1000
    time_step_a!(ρ, σ)
end
println(" With thread = $threads_b , block = $blocks_b :")
# Variant using blocks_b / threads_b.
CUDA.@time CUDA.@sync for _= 1:1000
    time_step_b!(ρ, σ)
end

# Measurement 2: the same 1000-call loops, timed with @btime.
# NOTE(review): unlike the CUDA.@time section, these loops are not wrapped in
# CUDA.@sync, so nothing here waits explicitly for the GPU — presumably the
# reported time is dominated by the launch queue filling up; confirm that this
# measures what is intended.
# NOTE(review): ρ and σ are globals and are not interpolated with `$` in the
# benchmarked expressions; consider `$ρ` / `$σ` if host overhead matters.
println("   @btime   ")
println(" With default threads and blocks")
@btime for _= 1:1000
    time_step!(ρ, σ)
end
println(" With thread = $threads_a , block = $blocks_a :")
@btime for _= 1:1000
    time_step_a!(ρ, σ)
end
println(" With thread = $threads_b , block = $blocks_b :")
@btime for _= 1:1000
    time_step_b!(ρ, σ)
end

# Measurement 3: @benchmark on a single call of each variant; the resulting
# trial (timing statistics and histogram) is shown with display.
# NOTE(review): as in the @btime section, the benchmarked call is not wrapped in
# CUDA.@sync and ρ / σ are not interpolated with `$` — confirm this is intended.
println("   Benchmarck   ")
println(" With default threads and blocks")
display(@benchmark( time_step!(ρ, σ) ))
println(" With thread = $threads_a , block = $blocks_a :")
display(@benchmark( time_step_a!(ρ, σ) ))
println(" With thread = $threads_b , block = $blocks_b :")
display(@benchmark( time_step_b!(ρ, σ) ))


```

Output:

```text
julia --optimize=3 MWE.jl 
 Warm-up
   CUDA.@time   
 With default threads and blocks
  0.076369 seconds (4.00 k CPU allocations: 62.562 KiB)
 With thread = (16, 16) , block = (128, 32) :
  0.047538 seconds (31.00 k CPU allocations: 1.129 MiB)
 With thread = (32, 8) , block = (64, 64) :
  0.046857 seconds (31.00 k CPU allocations: 1.129 MiB)
   @btime   
 With default threads and blocks
  73.066 ms (4000 allocations: 62.50 KiB)
 With thread = (16, 16) , block = (128, 32) :
  44.145 ms (31000 allocations: 1.13 MiB)
 With thread = (32, 8) , block = (64, 64) :
  43.864 ms (31000 allocations: 1.13 MiB)
   Benchmarck   
 With default threads and blocks
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  71.331 μs … 145.453 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     73.171 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   73.273 μs ±   1.024 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

                   ▁▃▄▅▄▇▇█▆▇▇▇▅▅▄▄▃▁▁▁                         
  ▂▁▂▂▂▂▂▃▂▃▃▄▄▅▆▇██████████████████████▆▇▆▅▄▅▅▄▄▃▃▃▃▃▃▃▃▃▂▃▂▂ ▅
  71.3 μs         Histogram: frequency by time         75.4 μs <

 Memory estimate: 64 bytes, allocs estimate: 4.
 With thread = (16, 16) , block = (128, 32) :
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  42.911 μs … 170.523 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     44.450 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   44.567 μs ±   1.525 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

               ▁▂▄▄▆█▇█▆▅▅▃▂▂                                   
  ▂▁▂▂▂▂▂▃▃▄▅▆▇████████████████▆▆▆▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂ ▄
  42.9 μs         Histogram: frequency by time         47.3 μs <

 Memory estimate: 1.16 KiB, allocs estimate: 31.
 With thread = (32, 8) , block = (64, 64) :
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  42.511 μs … 120.992 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     44.110 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   44.190 μs ±   1.141 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

                   ▁▂▁▄▃▆▆▅▇█▆▇▆▄▆▃▄▃▂▂▁                        
  ▂▂▁▂▂▂▂▂▂▃▃▄▅▅▆▆▆███████████████████████▆▇▆▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂ ▅
  42.5 μs         Histogram: frequency by time         45.9 μs <

 Memory estimate: 1.16 KiB, allocs estimate: 31.
```

Thank you,
Best

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Slow when blocks and threads not specified #208

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Slow when blocks and threads not specified #208

Description

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions