using ParallelStencil
@init_parallel_stencil(CUDA, Float64, 2, inbounds=true)
import CUDA
const Nx::Int = 2048
const Ny::Int = 512
threads_a = (16, 16)
blocks_a = (Nx, Ny) .÷ threads_a
threads_b = (32, 8)
blocks_b = (Nx, Ny) .÷ threads_b
@parallel_indices (i ,j) function initialize_to_HSS!(ρ)
ρ[i,j] *= exp(-(j-1)*Float64(0.01)/Float64(5))
return nothing
end
@parallel_indices (i ,j) function compute_stress!(σ, ρ)
σ[i,j,1] = ρ[i,j]
σ[i,j,2] = ρ[i,j]
return nothing
end
function time_step!(ρ, σ)
@parallel compute_stress!(σ, ρ)
end
function time_step_a!(ρ, σ)
@parallel blocks_a threads_a compute_stress!(σ, ρ)
end
function time_step_b!(ρ, σ)
@parallel blocks_b threads_b compute_stress!(σ, ρ)
end
ρ = @ones(Nx, Ny)
σ = @zeros(Nx, Ny, 2)
@parallel initialize_to_HSS!(ρ)
println(" Warm-up")
CUDA.@sync for _= 1:100
time_step!(ρ, σ)
end
CUDA.@sync for _= 1:100
time_step_a!(ρ, σ)
end
CUDA.@sync for _= 1:100
time_step_b!(ρ, σ)
end
println(" CUDA.@time ")
println(" With default threads and blocks")
CUDA.@time CUDA.@sync for _= 1:1000
time_step!(ρ, σ)
end
println(" With thread = $threads_a , block = $blocks_a :")
CUDA.@time CUDA.@sync for _= 1:1000
time_step_a!(ρ, σ)
end
println(" With thread = $threads_b , block = $blocks_b :")
CUDA.@time CUDA.@sync for _= 1:1000
time_step_b!(ρ, σ)
end
println(" @btime ")
println(" With default threads and blocks")
@btime for _= 1:1000
time_step!(ρ, σ)
end
println(" With thread = $threads_a , block = $blocks_a :")
@btime for _= 1:1000
time_step_a!(ρ, σ)
end
println(" With thread = $threads_b , block = $blocks_b :")
@btime for _= 1:1000
time_step_b!(ρ, σ)
end
println(" Benchmarck ")
println(" With default threads and blocks")
display(@benchmark( time_step!(ρ, σ) ))
println(" With thread = $threads_a , block = $blocks_a :")
display(@benchmark( time_step_a!(ρ, σ) ))
println(" With thread = $threads_b , block = $blocks_b :")
display(@benchmark( time_step_b!(ρ, σ) ))
############### OUTPUT
julia --optimize=3 MWE.jl
Warm-up
CUDA.@time
With default threads and blocks
0.076369 seconds (4.00 k CPU allocations: 62.562 KiB)
With thread = (16, 16) , block = (128, 32) :
0.047538 seconds (31.00 k CPU allocations: 1.129 MiB)
With thread = (32, 8) , block = (64, 64) :
0.046857 seconds (31.00 k CPU allocations: 1.129 MiB)
@btime
With default threads and blocks
73.066 ms (4000 allocations: 62.50 KiB)
With thread = (16, 16) , block = (128, 32) :
44.145 ms (31000 allocations: 1.13 MiB)
With thread = (32, 8) , block = (64, 64) :
43.864 ms (31000 allocations: 1.13 MiB)
Benchmarck
With default threads and blocks
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
Range (min … max): 71.331 μs … 145.453 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 73.171 μs ┊ GC (median): 0.00%
Time (mean ± σ): 73.273 μs ± 1.024 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁▃▄▅▄▇▇█▆▇▇▇▅▅▄▄▃▁▁▁
▂▁▂▂▂▂▂▃▂▃▃▄▄▅▆▇██████████████████████▆▇▆▅▄▅▅▄▄▃▃▃▃▃▃▃▃▃▂▃▂▂ ▅
71.3 μs Histogram: frequency by time 75.4 μs <
Memory estimate: 64 bytes, allocs estimate: 4.
With thread = (16, 16) , block = (128, 32) :
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
Range (min … max): 42.911 μs … 170.523 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 44.450 μs ┊ GC (median): 0.00%
Time (mean ± σ): 44.567 μs ± 1.525 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁▂▄▄▆█▇█▆▅▅▃▂▂
▂▁▂▂▂▂▂▃▃▄▅▆▇████████████████▆▆▆▅▄▄▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂ ▄
42.9 μs Histogram: frequency by time 47.3 μs <
Memory estimate: 1.16 KiB, allocs estimate: 31.
With thread = (32, 8) , block = (64, 64) :
BenchmarkTools.Trial: 10000 samples with 1 evaluation per sample.
Range (min … max): 42.511 μs … 120.992 μs ┊ GC (min … max): 0.00% … 0.00%
Time (median): 44.110 μs ┊ GC (median): 0.00%
Time (mean ± σ): 44.190 μs ± 1.141 μs ┊ GC (mean ± σ): 0.00% ± 0.00%
▁▂▁▄▃▆▆▅▇█▆▇▆▄▆▃▄▃▂▂▁
▂▂▁▂▂▂▂▂▂▃▃▄▅▅▆▆▆███████████████████████▆▇▆▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂ ▅
42.5 μs Histogram: frequency by time 45.9 μs <
Memory estimate: 1.16 KiB, allocs estimate: 31.
Hello,
Here is a MWE following this post on the forum
Thank you,
Best