Skip to content

Slow when blocks and threads not specified #208

@Lu-Dumoulin

Description

@Lu-Dumoulin

Hello,
Here is an MWE, following this post on the forum:

using ParallelStencil
@init_parallel_stencil(CUDA, Float64, 2, inbounds=true)
import CUDA
using BenchmarkTools

# Problem size: extent of the arrays along their first and second dimension.
const Nx::Int = 2048
const Ny::Int = 512

# Two explicit launch configurations. The number of blocks per dimension is the
# array extent divided by the threads per block; both divisions are exact for
# the sizes above. These tuples are read as globals inside `time_step_a!` and
# `time_step_b!`, so they are declared `const` to keep those reads concretely typed.
const threads_a = (16, 16)
const blocks_a = (Nx, Ny) .÷ threads_a
const threads_b = (32, 8)
const blocks_b = (Nx, Ny) .÷ threads_b

# Scale `ρ` in place by a factor that decays exponentially with the second index:
# entry (i, j) is multiplied by exp(-(j - 1) * 0.01 / 5), so the factor is 1 for
# j = 1. The indices (i, j) are supplied by `@parallel_indices`.
@parallel_indices (i, j) function initialize_to_HSS!(ρ)
    decay = exp(-(j - 1) * 0.01 / 5.0)
    ρ[i, j] *= decay
    return nothing
end

# Copy ρ[i, j] into both σ[i, j, 1] and σ[i, j, 2].
# The indices (i, j) are supplied by `@parallel_indices`.
@parallel_indices (i, j) function compute_stress!(σ, ρ)
    ρij = ρ[i, j]
    σ[i, j, 1] = ρij
    σ[i, j, 2] = ρij
    return nothing
end

"""
    time_step!(ρ, σ)

Launch `compute_stress!(σ, ρ)` through `@parallel` without an explicit launch
configuration, i.e. no blocks or threads are passed to `@parallel`.
"""
time_step!(ρ, σ) = @parallel compute_stress!(σ, ρ)

"""
    time_step_a!(ρ, σ)

Launch `compute_stress!(σ, ρ)` through `@parallel`, passing the script-level
`blocks_a` and `threads_a` as the explicit launch configuration.
"""
time_step_a!(ρ, σ) = @parallel blocks_a threads_a compute_stress!(σ, ρ)

"""
    time_step_b!(ρ, σ)

Launch `compute_stress!(σ, ρ)` through `@parallel`, passing the script-level
`blocks_b` and `threads_b` as the explicit launch configuration.
"""
time_step_b!(ρ, σ) = @parallel blocks_b threads_b compute_stress!(σ, ρ)

# Allocate the two fields with ParallelStencil's allocation macros:
# ρ is Nx × Ny filled with ones, σ is Nx × Ny × 2 filled with zeros.
ρ = @ones(Nx, Ny)
σ = @zeros(Nx, Ny, 2)

# Apply the j-dependent scaling to ρ once, before any timing.
@parallel initialize_to_HSS!(ρ)

# Run each of the three launch variants 100 times before measuring anything,
# waiting for the device after each batch.
println(" Warm-up")
CUDA.@sync for _ in 1:100
    time_step!(ρ, σ)
end
CUDA.@sync for _ in 1:100
    time_step_a!(ρ, σ)
end
CUDA.@sync for _ in 1:100
    time_step_b!(ρ, σ)
end

# Time 1000 launches of each variant with `CUDA.@time`; every batch is wrapped
# in `CUDA.@sync` so the device has finished before the measurement ends.
println("   CUDA.@time   ")

# Variant 1: no blocks/threads passed to `@parallel`.
println(" With default threads and blocks")
CUDA.@time CUDA.@sync for _ in 1:1000
    time_step!(ρ, σ)
end

# Variant 2: explicit configuration `blocks_a` / `threads_a`.
println(" With thread = $threads_a , block = $blocks_a :")
CUDA.@time CUDA.@sync for _ in 1:1000
    time_step_a!(ρ, σ)
end

# Variant 3: explicit configuration `blocks_b` / `threads_b`.
println(" With thread = $threads_b , block = $blocks_b :")
CUDA.@time CUDA.@sync for _ in 1:1000
    time_step_b!(ρ, σ)
end

# BenchmarkTools timing of 1000 launches per variant.
#
# Each batch is wrapped in `CUDA.@sync` so the measurement waits for the device
# to finish, as the `CUDA.@time` measurements on the same loops do.
# `ρ` and `σ` are interpolated with `$` so that BenchmarkTools does not time
# access to untyped global variables.
println("   @btime   ")

# Variant 1: no blocks/threads passed to `@parallel`.
println(" With default threads and blocks")
@btime CUDA.@sync for _ in 1:1000
    time_step!($ρ, $σ)
end

# Variant 2: explicit configuration `blocks_a` / `threads_a`.
println(" With thread = $threads_a , block = $blocks_a :")
@btime CUDA.@sync for _ in 1:1000
    time_step_a!($ρ, $σ)
end

# Variant 3: explicit configuration `blocks_b` / `threads_b`.
println(" With thread = $threads_b , block = $blocks_b :")
@btime CUDA.@sync for _ in 1:1000
    time_step_b!($ρ, $σ)
end

# Full BenchmarkTools statistics for a single launch of each variant.
#
# The call is wrapped in `CUDA.@sync` so each sample waits for the device to
# finish, and `ρ` / `σ` are interpolated with `$` so that access to untyped
# globals is not part of the measurement.
println("   Benchmarck   ")

# Variant 1: no blocks/threads passed to `@parallel`.
println(" With default threads and blocks")
display(@benchmark CUDA.@sync time_step!($ρ, $σ))

# Variant 2: explicit configuration `blocks_a` / `threads_a`.
println(" With thread = $threads_a , block = $blocks_a :")
display(@benchmark CUDA.@sync time_step_a!($ρ, $σ))

# Variant 3: explicit configuration `blocks_b` / `threads_b`.
println(" With thread = $threads_b , block = $blocks_b :")
display(@benchmark CUDA.@sync time_step_b!($ρ, $σ))


############### OUTPUT
julia --optimize=3 MWE.jl 
 Warm-up
   CUDA.@time   
 With default threads and blocks
  0.076369 seconds (4.00 k CPU allocations: 62.562 KiB)
 With thread = (16, 16) , block = (128, 32) :
  0.047538 seconds (31.00 k CPU allocations: 1.129 MiB)
 With thread = (32, 8) , block = (64, 64) :
  0.046857 seconds (31.00 k CPU allocations: 1.129 MiB)
   @btime   
 With default threads and blocks
  73.066 ms (4000 allocations: 62.50 KiB)
 With thread = (16, 16) , block = (128, 32) :
  44.145 ms (31000 allocations: 1.13 MiB)
 With thread = (32, 8) , block = (64, 64) :
  43.864 ms (31000 allocations: 1.13 MiB)
   Benchmarck   
 With default threads and blocks
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  71.331 μs … 145.453 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     73.171 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   73.273 μs ±   1.024 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

                   ▁▃▄▅▄▇▇█▆▇▇▇▅▅▄▄▃▁▁▁                         
  ▂▁▂▂▂▂▂▃▂▃▃▄▄▅▆▇██████████████████████▆▇▆▅▄▅▅▄▄▃▃▃▃▃▃▃▃▃▂▃▂▂ ▅
  71.3 μs         Histogram: frequency by time         75.4 μs <

 Memory estimate: 64 bytes, allocs estimate: 4.
 With thread = (16, 16) , block = (128, 32) :
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  42.911 μs … 170.523 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     44.450 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   44.567 μs ±   1.525 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

               ▁▂▄▄▆█▇█▆▅▅▃▂▂                                   
  ▂▁▂▂▂▂▂▃▃▄▅▆▇████████████████▆▆▆▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂ ▄
  42.9 μs         Histogram: frequency by time         47.3 μs <

 Memory estimate: 1.16 KiB, allocs estimate: 31.
 With thread = (32, 8) , block = (64, 64) :
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
 Range (min … max):  42.511 μs … 120.992 μs  ┊ GC (min … max): 0.00% … 0.00%
 Time  (median):     44.110 μs               ┊ GC (median):    0.00%
 Time  (mean ± σ):   44.190 μs ±   1.141 μs  ┊ GC (mean ± σ):  0.00% ± 0.00%

                   ▁▂▁▄▃▆▆▅▇█▆▇▆▄▆▃▄▃▂▂▁                        
  ▂▂▁▂▂▂▂▂▂▃▃▄▅▅▆▆▆███████████████████████▆▇▆▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂ ▅
  42.5 μs         Histogram: frequency by time         45.9 μs <

 Memory estimate: 1.16 KiB, allocs estimate: 31.

Thank you,
Best

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions