From e7daf1360d998d717bdf6c43230f1886a337a978 Mon Sep 17 00:00:00 2001 From: Aleksei Zvontsov <72526361+AZEY4@users.noreply.github.com> Date: Fri, 21 Jun 2024 17:57:28 +0200 Subject: [PATCH 1/5] Trivial implementation of LU decomposition using data flow tasks --- Project.toml | 4 ++++ benchmark/benchmarks.jl | 28 +++++++++++++----------- src/hmatrix.jl | 21 ++++++++++++++++++ src/lu.jl | 45 ++++++++++++++++++++++++++++++++++---- src/utils.jl | 2 +- test/lu_test_3.jl | 48 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 130 insertions(+), 18 deletions(-) create mode 100644 test/lu_test_3.jl diff --git a/Project.toml b/Project.toml index da1c32b..8c4831d 100644 --- a/Project.toml +++ b/Project.toml @@ -3,7 +3,11 @@ uuid = "8646bddf-ab1c-4fa7-9c51-ba187d647618" version = "0.2.8" [deps] +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" +DataFlowTasks = "d1549cb6-e9f4-42f8-98cc-ffc8d067ff5b" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index cc5da1f..a313eda 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -104,19 +104,21 @@ for (name, (K, p)) in kernels y = zeros($N) ) samples = 1 evals = 50 # LU factorization. The assemble is considered in a setup-phase. - SUITE[name]["LU threads=$threads"] = - @benchmarkable lu!(H, $comp; threads = $threads) setup = ( - H = assemble_hmatrix( - $K, - $Xclt, - $Xclt; - adm = $adm, - comp = $comp, - threads = true, - distributed = false, - global_index = $p, - ) - ) samples = 4 evals = 1 + for dataflowtasks in (true, false) + SUITE[name]["LU threads=$threads dataflowtasks=$(threads && dataflowtasks)"] = + @benchmarkable lu!(H, $comp; threads=$threads, dataflowtasks=$(threads && dataflowtasks)) setup = ( + H = assemble_hmatrix( + $K, + $Xclt, + $Xclt; + adm = $adm, + comp = $comp, + threads = true, + distributed = false, + global_index = $p, + ) + ) samples = 4 evals = 1 + end end end end diff --git a/src/hmatrix.jl b/src/hmatrix.jl index a47cfd3..d068821 100644 --- a/src/hmatrix.jl +++ b/src/hmatrix.jl @@ -1,3 +1,5 @@ +using DataFlowTasks + """ mutable struct HMatrix{R,T} <: AbstractMatrix{T} @@ -467,6 +469,25 @@ function compress!(H::HMatrix, comp) return H end +function DataFlowTasks.memory_overlap(A::HMatrix, B::HMatrix) + # TODO: compare child and all ancestors before: use nodes(M) instead? + if A === B + return true + elseif A.parentnode === B || A === B.parentnode + return true + end + chdA = leaves(A) + chdB = leaves(B) + for i in eachindex(chdA) + for j in eachindex(chdB) + if chdA[i].data === chdB[j].data + return true + end + end + end + return false +end + ############################################################################################ # Recipes ############################################################################################ diff --git a/src/lu.jl b/src/lu.jl index 80802d0..04ed744 100644 --- a/src/lu.jl +++ b/src/lu.jl @@ -1,3 +1,5 @@ +using DataFlowTasks + const NOPIVOT = VERSION >= v"1.7" ? NoPivot : Val{false} const HLU = LU{<:Any,<:HMatrix} @@ -24,15 +26,22 @@ end Hierarhical LU facotrization of `M`, using `comp` to generate the compressed blocks during the multiplication routines. """ -function LinearAlgebra.lu!(M::HMatrix, compressor; threads = use_threads()) +function LinearAlgebra.lu!(M::HMatrix, compressor; threads = use_threads(), dataflowtasks = false) # perform the lu decomposition of M in place T = eltype(M) nt = Threads.nthreads() chn = Channel{ACABuffer{T}}(nt) foreach(i -> put!(chn, ACABuffer(T)), 1:nt) - _lu!(M, compressor, threads, chn) - # wrap the result in the LU structure - return LU(M, Int[], 0) + if (dataflowtasks) + _lu_dataflow_tasks(M, compressor, threads, chn) + # wrap the result in the LU structure + d = @dspawn LU(@R(M), Int[], 0) label="result" + return fetch(d) + else + _lu!(M, compressor, threads, chn) + # wrap the result in the LU structure + return LU(M, Int[], 0) + end end """ @@ -84,6 +93,34 @@ function _lu!(M::HMatrix, compressor, threads, bufs = nothing) return M end +function _lu_dataflow_tasks(M::HMatrix, compressor, threads, bufs = nothing, level = 0, parent = (0,0)) + if isleaf(M) + @dspawn begin + @RW(M) + d = data(M) + @assert d isa Matrix + lu!(d, NOPIVOT()) + end label="lu($(parent[1]),$(parent[2]))\nlevel=$(level)" + else + @assert !hasdata(M) + chdM = children(M) + m, n = size(chdM) + for i in 1:m + _lu_dataflow_tasks(chdM[i, i], compressor, threads, bufs, level + 1, (i, i)) + for j in (i+1):n + @dspawn ldiv!(UnitLowerTriangular(@R(chdM[i, i])), @RW(chdM[i, j]), compressor, bufs) label="ldiv($i,$j)\nlevel=$(level+1)" + @dspawn rdiv!(@RW(chdM[j, i]), UpperTriangular(@R(chdM[i, i])), compressor, bufs) label="rdiv($j,$i)\nlevel=$(level+1)" + end + for j in (i+1):m + for k in (i+1):n + @dspawn hmul!(@RW(chdM[j, k]), @R(chdM[j, i]), @R(chdM[i, k]), -1, 1, compressor, bufs) label="hmul($j,$k)\nlevel=$(level+1)" + end + end + end + end + return M +end + function LinearAlgebra.ldiv!(A::HLU, y::AbstractVector; global_index = true) p = A.factors # underlying data ctree = coltree(p) diff --git a/src/utils.jl b/src/utils.jl index 1173693..04e8d0e 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -64,7 +64,7 @@ function leaves(tree) end """ - leaves(tree) + nodes(tree) Return a vector containing all the nodes of `tree`. """ diff --git a/test/lu_test_3.jl b/test/lu_test_3.jl new file mode 100644 index 0000000..5c961da --- /dev/null +++ b/test/lu_test_3.jl @@ -0,0 +1,48 @@ +using Test +using HMatrices +using LinearAlgebra +using Random +using StaticArrays +using BenchmarkTools +using DataFlowTasks + +using HMatrices + +using GraphViz +using CairoMakie + +include(joinpath(HMatrices.PROJECT_ROOT, "test", "testutils.jl")) + +Random.seed!(1) + +m = 5000 +T = Float64 +X = points_on_sphere(m) +Y = X + +K = laplace_matrix(X, X) +K1 = laplace_matrix(X,X) + +splitter = CardinalitySplitter(; nmax = 50) +Xclt = ClusterTree(X, splitter) +Yclt = ClusterTree(Y, splitter) +adm = StrongAdmissibilityStd(3) +comp = PartialACA(; atol = 1e-10) + +function run(threads, dataflowtasks) + println("run (threads=$threads, dataflowtasks=$dataflowtasks)") + H = assemble_hmatrix(K, Xclt, Yclt; adm, comp, threads, distributed = false) + println(H) + hlu = lu(H; atol = 1e-10, threads, dataflowtasks) + y = rand(m) + M = Matrix(K) + exact = M \ y + approx = hlu \ y + @test norm(exact - approx, Inf) < 1e-10 + # test multiplication by checking if the solution is correct + @test hlu.L * (hlu.U * approx) ≈ y +end + +log_info = DataFlowTasks.@log run(true, true) +GraphViz.Graph(log_info) +plot(log_info; categories=["lu", "ldiv", "rdiv", "hmul"]) From 186e0debd81620102282abb14f9bef2a4bd72d55 Mon Sep 17 00:00:00 2001 From: Aleksei Zvontsov <72526361+AZEY4@users.noreply.github.com> Date: Sun, 23 Jun 2024 14:44:30 +0200 Subject: [PATCH 2/5] Fixed issues related to passing workflows --- benchmark/benchmarks.jl | 28 +++++++++++------------- src/hmatrix.jl | 36 ++++++++++++++++++++++++++++--- src/lu.jl | 44 ++++++++++++++++++++++++++----------- test/hmatrix_test.jl | 30 ++++++++++++++++++++++++++ test/lu_test_3.jl | 48 ----------------------------------------- 5 files changed, 107 insertions(+), 79 deletions(-) delete mode 100644 test/lu_test_3.jl diff --git a/benchmark/benchmarks.jl b/benchmark/benchmarks.jl index a313eda..cc5da1f 100644 --- a/benchmark/benchmarks.jl +++ b/benchmark/benchmarks.jl @@ -104,21 +104,19 @@ for (name, (K, p)) in kernels y = zeros($N) ) samples = 1 evals = 50 # LU factorization. The assemble is considered in a setup-phase. - for dataflowtasks in (true, false) - SUITE[name]["LU threads=$threads dataflowtasks=$(threads && dataflowtasks)"] = - @benchmarkable lu!(H, $comp; threads=$threads, dataflowtasks=$(threads && dataflowtasks)) setup = ( - H = assemble_hmatrix( - $K, - $Xclt, - $Xclt; - adm = $adm, - comp = $comp, - threads = true, - distributed = false, - global_index = $p, - ) - ) samples = 4 evals = 1 - end + SUITE[name]["LU threads=$threads"] = + @benchmarkable lu!(H, $comp; threads = $threads) setup = ( + H = assemble_hmatrix( + $K, + $Xclt, + $Xclt; + adm = $adm, + comp = $comp, + threads = true, + distributed = false, + global_index = $p, + ) + ) samples = 4 evals = 1 end end end diff --git a/src/hmatrix.jl b/src/hmatrix.jl index d068821..faf1309 100644 --- a/src/hmatrix.jl +++ b/src/hmatrix.jl @@ -158,6 +158,7 @@ function _show(io, hmat, allow_empty = false) @printf(io, "\n\t min number of elements per leaf: %i", minimum(points_per_leaf)) @printf(io, "\n\t max number of elements per leaf: %i", maximum(points_per_leaf)) depth_per_leaf = map(depth, leaves_) + # TODO: the depth of tree is always zero because all leafs are roots @printf(io, "\n\t depth of tree: %i", maximum(depth_per_leaf)) @printf(io, "\n\t compression ratio: %f\n", compression_ratio(hmat)) return io @@ -469,18 +470,47 @@ function compress!(H::HMatrix, comp) return H end +""" + ancestors(H::HMatrix) + +Return all ancestors of `H`. +""" +function ancestors(H::HMatrix) + ancestors = [] + W = H + while !isroot(W) + W = parentnode(W) + push!(ancestors, W) + end + return ancestors +end + +""" + issubmatrix(A::HMatrix, B::HMatrix) + +Return `true` if B is a submatrix of A. +""" +function issubmatrix(A::HMatrix, B::HMatrix) + for ancB in ancestors(B) + if A === ancB + return true + end + end + return false +end + function DataFlowTasks.memory_overlap(A::HMatrix, B::HMatrix) - # TODO: compare child and all ancestors before: use nodes(M) instead? + # TODO: compare leaves in more efficient way. if A === B return true - elseif A.parentnode === B || A === B.parentnode + elseif issubmatrix(A, B) || issubmatrix(B, A) return true end chdA = leaves(A) chdB = leaves(B) for i in eachindex(chdA) for j in eachindex(chdB) - if chdA[i].data === chdB[j].data + if data(chdA[i]) === data(chdB[j]) return true end end diff --git a/src/lu.jl b/src/lu.jl index 04ed744..57131ba 100644 --- a/src/lu.jl +++ b/src/lu.jl @@ -26,19 +26,19 @@ end Hierarhical LU facotrization of `M`, using `comp` to generate the compressed blocks during the multiplication routines. """ -function LinearAlgebra.lu!(M::HMatrix, compressor; threads = use_threads(), dataflowtasks = false) +function LinearAlgebra.lu!(M::HMatrix, compressor; threads = use_threads()) # perform the lu decomposition of M in place T = eltype(M) nt = Threads.nthreads() chn = Channel{ACABuffer{T}}(nt) foreach(i -> put!(chn, ACABuffer(T)), 1:nt) - if (dataflowtasks) - _lu_dataflow_tasks(M, compressor, threads, chn) + if (threads) + _lu_threads!(M, compressor, chn) # wrap the result in the LU structure - d = @dspawn LU(@R(M), Int[], 0) label="result" + d = @dspawn LU(@R(M), Int[], 0) label = "result" return fetch(d) else - _lu!(M, compressor, threads, chn) + _lu!(M, compressor, chn) # wrap the result in the LU structure return LU(M, Int[], 0) end @@ -68,7 +68,7 @@ Hierarchical LU factorization. See [`lu!`](@ref) for the available options. """ LinearAlgebra.lu(M::HMatrix, args...; kwargs...) = lu!(deepcopy(M), args...; kwargs...) -function _lu!(M::HMatrix, compressor, threads, bufs = nothing) +function _lu!(M::HMatrix, compressor, bufs = nothing) if isleaf(M) d = data(M) @assert d isa Matrix @@ -78,7 +78,7 @@ function _lu!(M::HMatrix, compressor, threads, bufs = nothing) chdM = children(M) m, n = size(chdM) for i in 1:m - _lu!(chdM[i, i], compressor, threads, bufs) + _lu!(chdM[i, i], compressor, bufs) for j in (i+1):n ldiv!(UnitLowerTriangular(chdM[i, i]), chdM[i, j], compressor, bufs) rdiv!(chdM[j, i], UpperTriangular(chdM[i, i]), compressor, bufs) @@ -93,27 +93,45 @@ function _lu!(M::HMatrix, compressor, threads, bufs = nothing) return M end -function _lu_dataflow_tasks(M::HMatrix, compressor, threads, bufs = nothing, level = 0, parent = (0,0)) +function _lu_threads!(M::HMatrix, compressor, bufs = nothing, level = 0, parent = (0, 0)) if isleaf(M) @dspawn begin @RW(M) d = data(M) @assert d isa Matrix lu!(d, NOPIVOT()) - end label="lu($(parent[1]),$(parent[2]))\nlevel=$(level)" + end label = "lu($(parent[1]),$(parent[2]))\nlevel=$(level)" else @assert !hasdata(M) chdM = children(M) m, n = size(chdM) for i in 1:m - _lu_dataflow_tasks(chdM[i, i], compressor, threads, bufs, level + 1, (i, i)) + _lu_threads!(chdM[i, i], compressor, bufs, level + 1, (i, i)) for j in (i+1):n - @dspawn ldiv!(UnitLowerTriangular(@R(chdM[i, i])), @RW(chdM[i, j]), compressor, bufs) label="ldiv($i,$j)\nlevel=$(level+1)" - @dspawn rdiv!(@RW(chdM[j, i]), UpperTriangular(@R(chdM[i, i])), compressor, bufs) label="rdiv($j,$i)\nlevel=$(level+1)" + @dspawn ldiv!( + UnitLowerTriangular(@R(chdM[i, i])), + @RW(chdM[i, j]), + compressor, + bufs, + ) label = "ldiv($i,$j)\nlevel=$(level+1)" + @dspawn rdiv!( + @RW(chdM[j, i]), + UpperTriangular(@R(chdM[i, i])), + compressor, + bufs, + ) label = "rdiv($j,$i)\nlevel=$(level+1)" end for j in (i+1):m for k in (i+1):n - @dspawn hmul!(@RW(chdM[j, k]), @R(chdM[j, i]), @R(chdM[i, k]), -1, 1, compressor, bufs) label="hmul($j,$k)\nlevel=$(level+1)" + @dspawn hmul!( + @RW(chdM[j, k]), + @R(chdM[j, i]), + @R(chdM[i, k]), + -1, + 1, + compressor, + bufs, + ) label = "hmul($j,$k)\nlevel=$(level+1)" end end end diff --git a/test/hmatrix_test.jl b/test/hmatrix_test.jl index a8e0cf3..fa3ba0b 100644 --- a/test/hmatrix_test.jl +++ b/test/hmatrix_test.jl @@ -3,6 +3,7 @@ using StaticArrays using HMatrices using LinearAlgebra using SparseArrays +using DataFlowTasks include(joinpath(HMatrices.PROJECT_ROOT, "test", "testutils.jl")) @@ -58,3 +59,32 @@ end Hnew = axpy!(true, S, deepcopy(H)) @test Matrix(Hnew) == (H_full + Matrix(S)) end + +@testset "Memory overlap" begin + m = 1000 + T = Float64 + + X = points_on_sphere(m) + Y = X + K = laplace_matrix(X, X) + + X1 = points_on_sphere(m) + Y1 = X1 + K1 = laplace_matrix(X1, X1) + + splitter = CardinalitySplitter(; nmax = 50) + Xclt = ClusterTree(X, splitter) + Yclt = ClusterTree(Y, splitter) + X1clt = ClusterTree(X1, splitter) + Y1clt = ClusterTree(Y1, splitter) + adm = StrongAdmissibilityStd(3) + comp = PartialACA(; atol = 1e-10) + + H = assemble_hmatrix(K, Xclt, Yclt; adm, comp, threads = false, distributed = false) + H1 = assemble_hmatrix(K1, X1clt, Y1clt; adm, comp, threads = false, distributed = false) + + # Test memory overlap function + @test DataFlowTasks.memory_overlap(H, H) == true + @test DataFlowTasks.memory_overlap(H, H1) == false + @test DataFlowTasks.memory_overlap(H, H.children[1]) == true +end diff --git a/test/lu_test_3.jl b/test/lu_test_3.jl deleted file mode 100644 index 5c961da..0000000 --- a/test/lu_test_3.jl +++ /dev/null @@ -1,48 +0,0 @@ -using Test -using HMatrices -using LinearAlgebra -using Random -using StaticArrays -using BenchmarkTools -using DataFlowTasks - -using HMatrices - -using GraphViz -using CairoMakie - -include(joinpath(HMatrices.PROJECT_ROOT, "test", "testutils.jl")) - -Random.seed!(1) - -m = 5000 -T = Float64 -X = points_on_sphere(m) -Y = X - -K = laplace_matrix(X, X) -K1 = laplace_matrix(X,X) - -splitter = CardinalitySplitter(; nmax = 50) -Xclt = ClusterTree(X, splitter) -Yclt = ClusterTree(Y, splitter) -adm = StrongAdmissibilityStd(3) -comp = PartialACA(; atol = 1e-10) - -function run(threads, dataflowtasks) - println("run (threads=$threads, dataflowtasks=$dataflowtasks)") - H = assemble_hmatrix(K, Xclt, Yclt; adm, comp, threads, distributed = false) - println(H) - hlu = lu(H; atol = 1e-10, threads, dataflowtasks) - y = rand(m) - M = Matrix(K) - exact = M \ y - approx = hlu \ y - @test norm(exact - approx, Inf) < 1e-10 - # test multiplication by checking if the solution is correct - @test hlu.L * (hlu.U * approx) ≈ y -end - -log_info = DataFlowTasks.@log run(true, true) -GraphViz.Graph(log_info) -plot(log_info; categories=["lu", "ldiv", "rdiv", "hmul"]) From 88f38e2eea3576fa6ee7de2abce4329438b380cd Mon Sep 17 00:00:00 2001 From: Aleksei Zvontsov <72526361+AZEY4@users.noreply.github.com> Date: Sun, 23 Jun 2024 15:03:58 +0200 Subject: [PATCH 3/5] cleaned up dependencies --- Project.toml | 3 --- test/Project.toml | 1 + 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 8c4831d..c88ea0e 100644 --- a/Project.toml +++ b/Project.toml @@ -3,11 +3,8 @@ uuid = "8646bddf-ab1c-4fa7-9c51-ba187d647618" version = "0.2.8" [deps] -BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" -CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" DataFlowTasks = "d1549cb6-e9f4-42f8-98cc-ffc8d067ff5b" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" -GraphViz = "f526b714-d49f-11e8-06ff-31ed36ee7ee0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" diff --git a/test/Project.toml b/test/Project.toml index a2203d0..3fd491f 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,6 +1,7 @@ [deps] ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +DataFlowTasks = "d1549cb6-e9f4-42f8-98cc-ffc8d067ff5b" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" From 78e26b9a4fb7fc0938d60bb398907c966355d154 Mon Sep 17 00:00:00 2001 From: Aleksei Zvontsov <72526361+AZEY4@users.noreply.github.com> Date: Sun, 23 Jun 2024 15:48:51 +0200 Subject: [PATCH 4/5] fixed bug in ancestors function --- src/hmatrix.jl | 2 +- test/lu_test.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hmatrix.jl b/src/hmatrix.jl index faf1309..52fd7af 100644 --- a/src/hmatrix.jl +++ b/src/hmatrix.jl @@ -478,7 +478,7 @@ Return all ancestors of `H`. function ancestors(H::HMatrix) ancestors = [] W = H - while !isroot(W) + while W !== parentnode(W) W = parentnode(W) push!(ancestors, W) end diff --git a/test/lu_test.jl b/test/lu_test.jl index 9ee8b2b..58a798f 100644 --- a/test/lu_test.jl +++ b/test/lu_test.jl @@ -24,7 +24,7 @@ adm = StrongAdmissibilityStd(3) comp = PartialACA(; atol = 1e-10) for threads in (false, true) H = assemble_hmatrix(K, Xclt, Yclt; adm, comp, threads, distributed = false) - hlu = lu(H; atol = 1e-10) + hlu = lu(H; atol = 1e-10, threads = threads) y = rand(m) M = Matrix(K) exact = M \ y From 4bfb14822bd5cce691228102f4581e38894afe85 Mon Sep 17 00:00:00 2001 From: Aleksei Zvontsov <72526361+AZEY4@users.noreply.github.com> Date: Tue, 2 Jul 2024 16:42:43 +0200 Subject: [PATCH 5/5] added script to create speedup plot --- test/Project.toml | 4 ++- test/lu_plots/create_lu_plots.bat | 37 ++++++++++++++++++++ test/lu_plots/create_lu_plots.sh | 37 ++++++++++++++++++++ test/lu_plots/create_plots.jl | 58 +++++++++++++++++++++++++++++++ test/lu_plots/run_lu.jl | 52 +++++++++++++++++++++++++++ 5 files changed, 187 insertions(+), 1 deletion(-) create mode 100644 test/lu_plots/create_lu_plots.bat create mode 100644 test/lu_plots/create_lu_plots.sh create mode 100644 test/lu_plots/create_plots.jl create mode 100644 test/lu_plots/run_lu.jl diff --git a/test/Project.toml b/test/Project.toml index 3fd491f..fa37e66 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,7 +1,9 @@ [deps] +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" ComputationalResources = "ed09eef8-17a6-5b46-8889-db040fac31e3" -Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" DataFlowTasks = "d1549cb6-e9f4-42f8-98cc-ffc8d067ff5b" +Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" +JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" diff --git a/test/lu_plots/create_lu_plots.bat b/test/lu_plots/create_lu_plots.bat new file mode 100644 index 0000000..6dff56a --- /dev/null +++ b/test/lu_plots/create_lu_plots.bat @@ -0,0 +1,37 @@ +@echo off + +:: First argument: path to HMatrices project +:: Second argument: path where to store plots + +if "%1"=="" ( + set JULIA_PROJECT_PATH=%~dp0\..\.. +) else ( + set JULIA_PROJECT_PATH=%1 +) + +if "%2"=="" ( + set PLOTS_PATH=%~dp0 +) else ( + set PLOTS_PATH=%2 +) + +:: Path to the Julia project environment +set JULIA_ENVIROMENT_PATH=%JULIA_PROJECT_PATH%\test + +:: Paths to the scripts +set RUN_LU_PATH=%JULIA_PROJECT_PATH%\test\lu_plots\run_lu.jl +set CREATE_PLOTS_PATH=%JULIA_PROJECT_PATH%\test\lu_plots\create_plots.jl + +:: Space-separated list of thread counts to use +set THREAD_COUNTS=1 2 4 6 8 + +:: Loop over each thread count +for %%t in (%THREAD_COUNTS%) do ( + echo Running LU with %%t threads... + set JULIA_NUM_THREADS=%%t + julia --project=%JULIA_ENVIROMENT_PATH% %RUN_LU_PATH% %PLOTS_PATH% +) + +:: Create the plots +echo Creating plots... +julia --project=%JULIA_ENVIROMENT_PATH% %CREATE_PLOTS_PATH% %PLOTS_PATH% diff --git a/test/lu_plots/create_lu_plots.sh b/test/lu_plots/create_lu_plots.sh new file mode 100644 index 0000000..eeed2ec --- /dev/null +++ b/test/lu_plots/create_lu_plots.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# First argument: path to HMatrices project +# Second argument: path where to store plots + +if [ -z "$1" ]; then + JULIA_PROJECT_PATH="$(dirname "$(realpath "$0")")/../.." +else + JULIA_PROJECT_PATH="$1" +fi + +if [ -z "$2" ]; then + PLOTS_PATH="$(dirname "$(realpath "$0")")" +else + PLOTS_PATH="$2" +fi + +# Path to the Julia project environment +JULIA_ENVIRONMENT_PATH="$JULIA_PROJECT_PATH/test" + +# Paths to the scripts +RUN_LU_PATH="$JULIA_PROJECT_PATH/test/lu_plots/run_lu.jl" +CREATE_PLOTS_PATH="$JULIA_PROJECT_PATH/test/lu_plots/create_plots.jl" + +# Space-separated list of thread counts to use +THREAD_COUNTS="1 2 4 6 8" + +# Loop over each thread count +for t in $THREAD_COUNTS; do + echo "Running LU with $t threads..." + export JULIA_NUM_THREADS=$t + julia --project="$JULIA_ENVIRONMENT_PATH" "$RUN_LU_PATH" "$PLOTS_PATH" +done + +# Create the plots +echo "Creating plots..." +julia --project="$JULIA_ENVIRONMENT_PATH" "$CREATE_PLOTS_PATH" "$PLOTS_PATH" diff --git a/test/lu_plots/create_plots.jl b/test/lu_plots/create_plots.jl new file mode 100644 index 0000000..0527dd0 --- /dev/null +++ b/test/lu_plots/create_plots.jl @@ -0,0 +1,58 @@ +using HMatrices +using LinearAlgebra +using StaticArrays +using JLD2 + +using CairoMakie + +const FILE_PATH = joinpath(ARGS[1], "lu_results.jld2") +const PLOT_PATH = joinpath(ARGS[1], "nthreads_speedup_plot.svg") + +results = load(FILE_PATH)["results"] +signequal_result = results["1"] +info = Dict{Int,Dict{Int,Float64}}() + +for (nthreads, result) in results + for (size, time) in result + if !haskey(info, size) + info[size] = Dict{Int,Float64}() + end + speedup = signequal_result[size] / time + info[size][parse(Int64, nthreads)] = speedup + end +end + +f = Figure() +ax = Axis( + f[1, 1]; + title = "Speedup of HLU depending on number of threads", + xlabel = "Number of threads", + ylabel = "Speedup", +) + +for (size, speedup_info) in info + sorted_info = sort(collect(pairs(speedup_info))) + nthreads = [x[1] for x in sorted_info] + speedups = [x[2] for x in sorted_info] + lines!(ax, nthreads, speedups; label = "$size") +end + +perfect_speedup = collect(keys(info[minimum(keys(info))])) # Assuming the smallest size has all thread counts +lines!( + ax, + perfect_speedup, + perfect_speedup; + linestyle = :dash, + linewidth = 2, + color = :red, + label = "Perfect speedup", +) + +ax.yticks = (minimum(perfect_speedup):1:maximum(perfect_speedup)) +ax.xticks = (minimum(perfect_speedup):1:maximum(perfect_speedup)) + +f[1, 2] = Legend(f, ax, "Size"; framevisible = false) + +save(PLOT_PATH, f) + +f diff --git a/test/lu_plots/run_lu.jl b/test/lu_plots/run_lu.jl new file mode 100644 index 0000000..5c0f230 --- /dev/null +++ b/test/lu_plots/run_lu.jl @@ -0,0 +1,52 @@ +using Test +using HMatrices +using LinearAlgebra +using Random +using StaticArrays +using BenchmarkTools +using JLD2 + +using HMatrices: RkMatrix + +include(joinpath(HMatrices.PROJECT_ROOT, "test", "testutils.jl")) + +const FILE_PATH = joinpath(ARGS[1], "lu_results.jld2") + +Random.seed!(1) + +# 3D Laplace on sphere +problem_size = (100, 500, 1000) +threads = Threads.nthreads() > 1 + +results = Dict{String,Dict{Int,Float64}}() +if isfile(FILE_PATH) + results = load(FILE_PATH)["results"] +end + +thread_key = string(Threads.nthreads()) +if !haskey(results, thread_key) + results[thread_key] = Dict{Int,Float64}() +end + +for size in problem_size + println("Problem size=$size") + m = size + T = Float64 + X = points_on_sphere(m) + Y = X + + K = laplace_matrix(X, X) + + splitter = CardinalitySplitter(; nmax = 50) + Xclt = ClusterTree(X, splitter) + Yclt = ClusterTree(Y, splitter) + adm = StrongAdmissibilityStd(3) + comp = PartialACA(; atol = 1e-10) + + H = assemble_hmatrix(K, Xclt, Yclt; adm, comp, threads, distributed = false) + time = @belapsed lu($H; atol = 1e-10, threads = $threads) + results[thread_key][size] = time + println("Elapsed time=$time\n") +end + +JLD2.@save FILE_PATH results