diff --git a/CMakeLists.txt b/CMakeLists.txt index 84e4f64..3b54cd3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,8 +96,15 @@ cmake_dependent_option( cmake_dependent_option( HIOP_USE_MAGMA "Use Magma linear algebra" ON "HIOP_USE_GPU" OFF ) + cmake_dependent_option( - HIOP_USE_RESOLVE "Build with cuSolver LU support" ON "HIOP_USE_CUDA" OFF + HIOP_USE_RESOLVE "Build with ReSolve sparse solver support" ON "HIOP_USE_CUDA" OFF +) + +option( + HIOP_USE_EVLOSER + "Enable the experimental EVLOSER sparse solver" + OFF ) add_library(hiop_tpl INTERFACE) @@ -357,12 +364,17 @@ if(HIOP_SPARSE) endif() endif(HIOP_USE_STRUMPACK) - if (HIOP_USE_RESOLVE) + if (HIOP_USE_RESOLVE OR HIOP_USE_EVLOSER) set(HIOP_KLU_DIR CACHE PATH "Path to KLU directory") include(FindKLU) if(NOT KLU_LIBRARY) message(STATUS "Cannot find KLU, disabling cuSOLVER LU module ...") - set(HIOP_USE_RESOLVE OFF CACHE BOOL "Build without cuSOLVER LU module." FORCE) + if(HIOP_USE_RESOLVE) + set(HIOP_USE_RESOLVE OFF CACHE BOOL "Build without cuSOLVER LU module." FORCE) + endif() + if(HIOP_USE_EVLOSER) + set(HIOP_USE_EVLOSER OFF CACHE BOOL "Build without EVLOSER." FORCE) + endif() else() target_link_libraries(hiop_tpl INTERFACE KLU) endif() @@ -394,14 +406,15 @@ if(HIOP_SPARSE) endif() endif(HIOP_USE_GINKGO) - if(NOT HIOP_USE_COINHSL AND NOT HIOP_USE_STRUMPACK AND NOT HIOP_USE_PARDISO AND NOT HIOP_USE_GINKGO) + if(NOT HIOP_USE_COINHSL AND NOT HIOP_USE_STRUMPACK AND NOT HIOP_USE_PARDISO AND NOT HIOP_USE_GINKGO AND NOT HIOP_USE_EVLOSER) set(HIOP_SPARSE OFF CACHE BOOL "Build without sparse linear algebra" FORCE) message(STATUS "Cannot find COINHSL, STRUMPACK, PARDISO nor GINKGO for sparse linear algebra, and the option HIOP_SPARSE will be disabled") -endif(NOT HIOP_USE_COINHSL AND NOT HIOP_USE_STRUMPACK AND NOT HIOP_USE_PARDISO AND NOT HIOP_USE_GINKGO) +endif() else(HIOP_SPARSE) set(HIOP_USE_COINHSL OFF CACHE BOOL "Build without COINHSL" FORCE) set(HIOP_USE_STRUMPACK OFF CACHE BOOL "Build without STRUMPACK" FORCE) set(HIOP_USE_RESOLVE OFF CACHE BOOL "Build without cuSOLVER LU module" FORCE) + set(HIOP_USE_EVLOSER OFF CACHE BOOL "Build without EVLOSER" FORCE) set(HIOP_USE_PARDISO OFF CACHE BOOL "Build without PARDISO" FORCE) set(HIOP_USE_GINKGO OFF CACHE BOOL "Build without GINKGO" FORCE) endif(HIOP_SPARSE) @@ -528,6 +541,13 @@ if (HIOP_WITH_MAKETEST) add_test(NAME SparseMatrixTest COMMAND ${RUNCMD} "$") add_test(NAME SymmetricSparseMatrixTest COMMAND ${RUNCMD} "$") + if(HIOP_USE_EVLOSER AND NOT HIOP_USE_GPU) + add_test( + NAME EVLOSERKLUCPU + COMMAND ${RUNCMD} "$" + ) + endif() + # Test drivers in the form of user applications add_subdirectory(src/Drivers) endif(HIOP_WITH_MAKETEST) diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt index a1506b2..d9f9b46 100644 --- a/src/Drivers/Sparse/CMakeLists.txt +++ b/src/Drivers/Sparse/CMakeLists.txt @@ -17,15 +17,18 @@ target_link_libraries(NlpSparseEx3.exe HiOp::HiOp) add_executable(NlpSparseEx4.exe NlpSparseEx4.cpp NlpSparseEx4Driver.cpp) target_link_libraries(NlpSparseEx4.exe HiOp::HiOp) -if(HIOP_USE_RAJA) - if(HIOP_USE_GPU AND HIOP_USE_CUDA AND HIOP_USE_RESOLVE) - set_source_files_properties( - NlpSparseRajaEx2.cpp - NlpSparseRajaEx2Driver.cpp - PROPERTIES LANGUAGE CUDA - ) - - add_executable(NlpSparseRajaEx2.exe NlpSparseRajaEx2Driver.cpp NlpSparseRajaEx2.cpp) +# This driver used to be CUDA/ReSolve-only; EVLOSER adds the HIP path here too. +if(HIOP_USE_RAJA AND HIOP_USE_GPU AND (HIOP_USE_RESOLVE OR HIOP_USE_EVLOSER)) + if(HIOP_USE_CUDA OR HIOP_USE_HIP) + if(HIOP_USE_CUDA) + set_source_files_properties( + NlpSparseRajaEx2.cpp + NlpSparseRajaEx2Driver.cpp + PROPERTIES LANGUAGE CUDA + ) + endif() + + add_executable(NlpSparseRajaEx2.exe NlpSparseRajaEx2Driver.cpp NlpSparseRajaEx2.cpp) target_link_libraries(NlpSparseRajaEx2.exe HiOp::HiOp) install(TARGETS NlpSparseRajaEx2.exe DESTINATION bin) list(APPEND hiopSparseEx_INTERFACE_HEADERS NlpSparseRajaEx2.hpp) @@ -56,6 +59,17 @@ add_test(NAME NlpSparse1_2 COMMAND ${RUNCMD} "$" " if(HIOP_USE_CUDA) add_test(NAME NlpSparse1_3 COMMAND ${RUNCMD} "$" "500" "-cusolver" "-selfcheck") endif(HIOP_USE_CUDA) +# CPU-only EVLOSER integration test. +if(HIOP_USE_EVLOSER AND NOT HIOP_USE_GPU) + add_test( + NAME NlpSparse1_EVLOSER_CPU + COMMAND ${RUNCMD} + "$" + "500" + "-evloser" + "-selfcheck" + ) +endif() if(HIOP_USE_PARDISO) add_test(NAME NlpSparse1_4 COMMAND ${RUNCMD} "$" "500" "-pardiso" "-selfcheck") endif(HIOP_USE_PARDISO) @@ -73,6 +87,18 @@ add_test(NAME NlpSparse2_2 COMMAND ${RUNCMD} "$" " if(HIOP_USE_CUDA) add_test(NAME NlpSparse2_3 COMMAND ${RUNCMD} "$" "500" "-cusolver" "-inertiafree" "-selfcheck") endif(HIOP_USE_CUDA) +# CPU-only EVLOSER integration test with the required inertia-free configuration. +if(HIOP_USE_EVLOSER AND NOT HIOP_USE_GPU) + add_test( + NAME NlpSparse2_EVLOSER_CPU + COMMAND ${RUNCMD} + "$" + "500" + "-evloser" + "-inertiafree" + "-selfcheck" + ) +endif() if(HIOP_USE_GINKGO) add_test(NAME NlpSparse2_4 COMMAND ${RUNCMD} "$" "500" "-ginkgo" "-inertiafree" "-selfcheck") if(HIOP_USE_CUDA) @@ -88,6 +114,16 @@ if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_CUDA AND HIOP_USE_RESOLVE) add_test(NAME NlpSparseRaja2_2 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-resolve_cuda_rf") endif() +# CUDA EVLOSER RF uses the same RAJA sparse driver as the ReSolve RF test. +if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_CUDA AND HIOP_USE_EVLOSER) + add_test(NAME NlpSparseRaja2_3 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-evloser_cuda_rf") +endif() + +# HIP EVLOSER RF needs a separate test guard because ReSolve remains CUDA-only. +if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_HIP AND HIOP_USE_EVLOSER) + add_test(NAME NlpSparseRaja2_4 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-evloser_hip_rf") +endif() + add_test(NAME NlpSparse3_1 COMMAND ${RUNCMD} "$" "500" "-selfcheck") if(HIOP_BUILD_SHARED AND NOT HIOP_USE_GPU ) add_test(NAME NlpSparseCinterface COMMAND ${RUNCMD} "$") diff --git a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp index 4136490..97d8811 100644 --- a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp @@ -16,6 +16,7 @@ static bool parse_arguments(int argc, bool& self_check, bool& use_pardiso, bool& use_cusolver, + bool& use_evloser, bool& use_ginkgo, bool& use_ginkgo_cuda, bool& use_ginkgo_hip, @@ -23,9 +24,10 @@ static bool parse_arguments(int argc, { self_check = false; use_pardiso = false; + use_evloser = false; use_ginkgo = false; use_ginkgo_cuda = false; - use_ginkgo_cuda = false; + use_ginkgo_hip = false; force_fr = false; n = 3; scal = 1.0; @@ -50,6 +52,8 @@ static bool parse_arguments(int argc, use_pardiso = true; } else if(std::string(argv[4]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[4]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[4]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[4]) == "-ginkgo_cuda") { @@ -70,6 +74,8 @@ static bool parse_arguments(int argc, use_pardiso = true; } else if(std::string(argv[3]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[3]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[3]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[3]) == "-ginkgo_cuda") { @@ -90,6 +96,8 @@ static bool parse_arguments(int argc, use_pardiso = true; } else if(std::string(argv[2]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[2]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[2]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[2]) == "-ginkgo_cuda") { @@ -116,7 +124,8 @@ static bool parse_arguments(int argc, scal = 1.0; } - if(use_cusolver && use_pardiso) { + // EVLOSER follows the same sparse-LU driver path as cuSOLVER here. + if((use_cusolver || use_evloser) && use_pardiso) { printf("Selected both, cuSOLVER and Pardiso. "); printf("You can select only one linear solver.\n\n"); return false; @@ -147,6 +156,15 @@ static bool parse_arguments(int argc, } #endif +// If EVLOSER is not available, de-select it. +#ifndef HIOP_USE_EVLOSER + if(use_evloser) { + printf("HiOp built without support for EVLOSER. "); + printf("Using default linear solver ...\n"); + use_evloser = false; + } +#endif + return true; }; @@ -164,6 +182,7 @@ static void usage(const char* exeName) " '-pardiso' or '-cusolver': use Pardiso or cuSOLVER " "as the linear solver [optional]\n"); printf(" '-cusolver': use cuSOLVER as the linear solver [optional]\n"); + printf(" '-evloser': use EVLOSER as the linear solver [optional]\n"); printf(" '-fr': force to reset feasibility in the 1st iteration [optional]\n"); printf( " '-selfcheck': compares the optimal objective with a previously saved value for the " @@ -189,6 +208,7 @@ int main(int argc, char** argv) bool selfCheck = false; bool use_pardiso = false; bool use_cusolver = false; + bool use_evloser = false; bool use_ginkgo = false; bool use_ginkgo_cuda = false; bool use_ginkgo_hip = false; @@ -203,6 +223,7 @@ int main(int argc, char** argv) selfCheck, use_pardiso, use_cusolver, + use_evloser, use_ginkgo, use_ginkgo_cuda, use_ginkgo_hip, @@ -230,10 +251,18 @@ int main(int argc, char** argv) if(use_pardiso) { nlp.options->SetStringValue("linear_solver_sparse", "pardiso"); } - if(use_cusolver) { + // EVLOSER keeps the ReSolve RF settings below but selects the EVLOSER solver name. + if(use_cusolver || use_evloser) { nlp.options->SetStringValue("duals_init", "zero"); nlp.options->SetStringValue("linsol_mode", "speculative"); - nlp.options->SetStringValue("linear_solver_sparse", "resolve"); + if(use_evloser) { + nlp.options->SetStringValue("linear_solver_sparse", "evloser"); + } else { + nlp.options->SetStringValue("linear_solver_sparse", "resolve"); + } + +#if defined(HIOP_USE_CUDA) || defined(HIOP_USE_HIP) + // Device ReSolve and EVLOSER configurations use RF and hybrid execution. nlp.options->SetStringValue("resolve_refactorization", "rf"); nlp.options->SetIntegerValue("ir_inner_maxit", 100); nlp.options->SetNumericValue("ir_inner_tol", 1e-8); @@ -241,6 +270,7 @@ int main(int argc, char** argv) nlp.options->SetIntegerValue("ir_inner_conv_cond", 2); nlp.options->SetStringValue("ir_inner_gs_scheme", "cgs2"); nlp.options->SetStringValue("compute_mode", "hybrid"); +#endif // LU solver needs to use inertia free approach nlp.options->SetStringValue("fact_acceptor", "inertia_free"); nlp.options->SetIntegerValue("ir_outer_maxit", 0); diff --git a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp index e664635..e6d8cb1 100644 --- a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp @@ -16,6 +16,7 @@ static bool parse_arguments(int argc, bool& inertia_free, bool& use_cusolver, bool& use_resolve, + bool& use_evloser, bool& use_ginkgo, bool& use_ginkgo_cuda, bool& use_ginkgo_hip) @@ -25,9 +26,10 @@ static bool parse_arguments(int argc, inertia_free = false; use_cusolver = false; use_resolve = false; + use_evloser = false; use_ginkgo = false; use_ginkgo_cuda = false; - use_ginkgo_cuda = false; + use_ginkgo_hip = false; switch(argc) { case 1: // no arguments @@ -41,6 +43,8 @@ static bool parse_arguments(int argc, inertia_free = true; } else if(std::string(argv[4]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[4]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[4]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[4]) == "-ginkgo_cuda") { @@ -64,6 +68,8 @@ static bool parse_arguments(int argc, inertia_free = true; } else if(std::string(argv[3]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[3]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[3]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[3]) == "-ginkgo_cuda") { @@ -87,6 +93,8 @@ static bool parse_arguments(int argc, inertia_free = true; } else if(std::string(argv[2]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[2]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[2]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[2]) == "-ginkgo_cuda") { @@ -110,6 +118,8 @@ static bool parse_arguments(int argc, inertia_free = true; } else if(std::string(argv[1]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[1]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[1]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[1]) == "-ginkgo_cuda") { @@ -133,12 +143,21 @@ static bool parse_arguments(int argc, #ifndef HIOP_USE_CUDA if(use_cusolver) { printf("HiOp built without CUDA support. "); - printf("Using default instead of cuSOLVER ...\n"); + printf("Using default instead of cuSOLVER/EVLOSER ...\n"); use_cusolver = false; } #endif -// Use cuSOLVER's LU factorization, if it was configured +// If EVLOSER is not available, de-select it. +#ifndef HIOP_USE_EVLOSER + if(use_evloser) { + printf("HiOp built without EVLOSER support. "); + printf("Using default linear solver ...\n"); + use_evloser = false; + } +#endif + +// Use Resolve's sparse LU path when cuSOLVER was selected. #ifdef HIOP_USE_RESOLVE if(use_cusolver) { use_resolve = true; @@ -146,9 +165,9 @@ static bool parse_arguments(int argc, #endif // If cuSOLVER was selected, but inertia free approach was not, add inertia-free - if(use_cusolver && !(inertia_free)) { + if((use_cusolver || use_evloser) && !(inertia_free)) { inertia_free = true; - printf("LU solver from cuSOLVER library requires inertia free approach. "); + printf("Selected LU sparse solver requires inertia free approach. "); printf("Enabling now ...\n"); } @@ -182,6 +201,7 @@ static void usage(const char* exeName) " '-selfcheck': compares the optimal objective with a previously saved value for the " "problem specified by 'problem_size'. [optional]\n"); printf(" '-cusolver': use cuSOLVER linear solver [optional]\n"); + printf(" '-evloser': use EVLOSER linear solver [optional]\n"); printf(" '-ginkgo': use GINKGO linear solver [optional]\n"); } @@ -206,6 +226,7 @@ int main(int argc, char** argv) bool inertia_free = false; bool use_cusolver = false; bool use_resolve = false; + bool use_evloser = false; bool use_ginkgo = false; bool use_ginkgo_cuda = false; bool use_ginkgo_hip = false; @@ -216,6 +237,7 @@ int main(int argc, char** argv) inertia_free, use_cusolver, use_resolve, + use_evloser, use_ginkgo, use_ginkgo_cuda, use_ginkgo_hip)) { @@ -243,16 +265,24 @@ int main(int argc, char** argv) if(inertia_free) { nlp.options->SetStringValue("fact_acceptor", "inertia_free"); } - if(use_resolve) { + if(use_resolve || use_evloser) { nlp.options->SetStringValue("duals_init", "zero"); nlp.options->SetStringValue("linsol_mode", "speculative"); - nlp.options->SetStringValue("linear_solver_sparse", "resolve"); + + if(use_evloser) { + nlp.options->SetStringValue("linear_solver_sparse", "evloser"); + } else { + nlp.options->SetStringValue("linear_solver_sparse", "resolve"); + } +#if defined(HIOP_USE_CUDA) || defined(HIOP_USE_HIP) + // Device ReSolve and EVLOSER configurations use RF and hybrid execution. nlp.options->SetStringValue("resolve_refactorization", "rf"); nlp.options->SetStringValue("compute_mode", "hybrid"); nlp.options->SetIntegerValue("ir_outer_maxit", 0); nlp.options->SetIntegerValue("ir_inner_conv_cond", 2); nlp.options->SetStringValue("ir_inner_gs_scheme", "cgs2"); nlp.options->SetNumericValue("ir_inner_tol", 1e-8); +#endif } if(use_ginkgo) { nlp.options->SetStringValue("linsol_mode", "speculative"); diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp index d82aa66..c6952f3 100644 --- a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp @@ -20,6 +20,8 @@ static bool parse_arguments(int argc, bool& inertia_free, bool& use_resolve_cuda_glu, bool& use_resolve_cuda_rf, + bool& use_evloser_cuda_rf, + bool& use_evloser_hip_rf, bool& use_ginkgo, bool& use_ginkgo_cuda, bool& use_ginkgo_hip) @@ -29,6 +31,8 @@ static bool parse_arguments(int argc, inertia_free = false; use_resolve_cuda_glu = false; use_resolve_cuda_rf = false; + use_evloser_cuda_rf = false; + use_evloser_hip_rf = false; use_ginkgo = false; use_ginkgo_cuda = false; use_ginkgo_hip = false; @@ -47,6 +51,10 @@ static bool parse_arguments(int argc, use_resolve_cuda_glu = true; } else if(std::string(argv[4]) == "-resolve_cuda_rf") { use_resolve_cuda_rf = true; + } else if(std::string(argv[4]) == "-evloser_cuda_rf") { + use_evloser_cuda_rf = true; + } else if(std::string(argv[4]) == "-evloser_hip_rf") { + use_evloser_hip_rf = true; } else if(std::string(argv[4]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[4]) == "-ginkgo_cuda") { @@ -72,6 +80,10 @@ static bool parse_arguments(int argc, use_resolve_cuda_glu = true; } else if(std::string(argv[3]) == "-resolve_cuda_rf") { use_resolve_cuda_rf = true; + } else if(std::string(argv[3]) == "-evloser_cuda_rf") { + use_evloser_cuda_rf = true; + } else if(std::string(argv[3]) == "-evloser_hip_rf") { + use_evloser_hip_rf = true; } else if(std::string(argv[3]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[3]) == "-ginkgo_cuda") { @@ -97,6 +109,10 @@ static bool parse_arguments(int argc, use_resolve_cuda_glu = true; } else if(std::string(argv[2]) == "-resolve_cuda_rf") { use_resolve_cuda_rf = true; + } else if(std::string(argv[2]) == "-evloser_cuda_rf") { + use_evloser_cuda_rf = true; + } else if(std::string(argv[2]) == "-evloser_hip_rf") { + use_evloser_hip_rf = true; } else if(std::string(argv[2]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[2]) == "-ginkgo_cuda") { @@ -122,6 +138,10 @@ static bool parse_arguments(int argc, use_resolve_cuda_glu = true; } else if(std::string(argv[1]) == "-resolve_cuda_rf") { use_resolve_cuda_rf = true; + } else if(std::string(argv[1]) == "-evloser_cuda_rf") { + use_evloser_cuda_rf = true; + } else if(std::string(argv[1]) == "-evloser_hip_rf") { + use_evloser_hip_rf = true; } else if(std::string(argv[1]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[1]) == "-ginkgo_cuda") { @@ -153,21 +173,54 @@ static bool parse_arguments(int argc, printf("Using default instead of ReSolve ...\n"); use_resolve_cuda_rf = false; } + if(use_evloser_cuda_rf) { + printf("HiOp built without CUDA support. "); + printf("Using default instead of EVLOSER ...\n"); + use_evloser_cuda_rf = false; + } +#endif + +// EVLOSER HIP RF requires HIP support. +#ifndef HIOP_USE_HIP + if(use_evloser_hip_rf) { + printf("HiOp built without HIP support. "); + printf("Using default instead of EVLOSER ...\n"); + use_evloser_hip_rf = false; + } #endif // If ReSolve was selected, but inertia free approach was not, add inertia-free - if((use_resolve_cuda_glu || use_resolve_cuda_rf) && !(inertia_free)) { + // EVLOSER RF has the same inertia-free requirement as the ReSolve sparse-LU path. + if((use_resolve_cuda_glu || use_resolve_cuda_rf || use_evloser_cuda_rf || use_evloser_hip_rf) && !(inertia_free)) { inertia_free = true; printf("LU solver from ReSolve library requires inertia free approach. "); printf("Enabling now ...\n"); } - if(use_resolve_cuda_glu && use_resolve_cuda_rf) { + // GLU and RF still share one refactorization option, so keep this conflict check explicit. + if(use_resolve_cuda_glu && (use_resolve_cuda_rf || use_evloser_cuda_rf || use_evloser_hip_rf)) { use_resolve_cuda_rf = false; - printf("You can select either GLU or Rf refactorization with ReSolve, not both. "); + use_evloser_cuda_rf = false; + use_evloser_hip_rf = false; + printf("You can select either GLU or Rf refactorization, not both. "); printf("Using default GLU refactorization ...\n"); } + // ReSolve RF and EVLOSER RF select different backend classes, so only one should be active. + if(use_resolve_cuda_rf && (use_evloser_cuda_rf || use_evloser_hip_rf)) { + use_evloser_cuda_rf = false; + use_evloser_hip_rf = false; + printf("You can select either ReSolve or EVLOSER, not both. "); + printf("Using ReSolve ...\n"); + } + + // EVLOSER has separate CUDA and HIP RF flags because the HIP path disables IR below. + if(use_evloser_cuda_rf && use_evloser_hip_rf) { + use_evloser_hip_rf = false; + printf("You can select either CUDA RF or HIP RF with EVLOSER, not both. "); + printf("Using CUDA RF ...\n"); + } + // If Ginkgo is not available, de-select it. #ifndef HIOP_USE_GINKGO if(use_ginkgo) { @@ -198,10 +251,16 @@ static void usage(const char* exeName) " '-selfcheck': compares the optimal objective with a previously saved value for the " "problem specified by 'problem_size'. [optional]\n"); printf( - " '-use_resolve_cuda_glu': use ReSolve linear solver with KLU factorization and cusolverGLU refactorization " + " '-resolve_cuda_glu': use ReSolve linear solver with KLU factorization and cusolverGLU refactorization " "[optional]\n"); printf( - " '-use_resolve_cuda_rf' : use ReSolve linear solver with KLU factorization and cusolverRf refactorization " + " '-resolve_cuda_rf' : use ReSolve linear solver with KLU factorization and cusolverRf refactorization " + "[optional]\n"); + printf( + " '-evloser_cuda_rf' : use EVLOSER linear solver with KLU factorization and cusolverRf refactorization " + "[optional]\n"); + printf( + " '-evloser_hip_rf' : use EVLOSER linear solver with KLU factorization and hipsolverRf refactorization " "[optional]\n"); printf(" '-ginkgo': use GINKGO linear solver [optional]\n"); } @@ -235,6 +294,8 @@ int main(int argc, char** argv) bool inertia_free = false; bool use_resolve_cuda_glu = false; bool use_resolve_cuda_rf = false; + bool use_evloser_cuda_rf = false; + bool use_evloser_hip_rf = false; bool use_ginkgo = false; bool use_ginkgo_cuda = false; bool use_ginkgo_hip = false; @@ -245,6 +306,8 @@ int main(int argc, char** argv) inertia_free, use_resolve_cuda_glu, use_resolve_cuda_rf, + use_evloser_cuda_rf, + use_evloser_hip_rf, use_ginkgo, use_ginkgo_cuda, use_ginkgo_hip)) { @@ -270,12 +333,24 @@ int main(int argc, char** argv) // only support cusolverLU right now, 2023.02.28 // lsq initialization of the duals fails for this example since the Jacobian is rank deficient // use zero initialization - nlp.options->SetStringValue("linear_solver_sparse", "resolve"); - if(use_resolve_cuda_rf) { + // EVLOSER uses the same refactorization option string; the solver name selects the backend. + if(use_evloser_cuda_rf || use_evloser_hip_rf) { + nlp.options->SetStringValue("linear_solver_sparse", "evloser"); + } else { + nlp.options->SetStringValue("linear_solver_sparse", "resolve"); + } + if(use_resolve_cuda_rf || use_evloser_cuda_rf) { nlp.options->SetStringValue("resolve_refactorization", "rf"); nlp.options->SetIntegerValue("ir_inner_maxit", 20); nlp.options->SetIntegerValue("ir_outer_maxit", 0); } + + // HIP EVLOSER RF currently runs without iterative refinement. + if(use_evloser_hip_rf) { + nlp.options->SetStringValue("resolve_refactorization", "rf"); + nlp.options->SetIntegerValue("ir_inner_maxit", 0); + nlp.options->SetIntegerValue("ir_outer_maxit", 0); + } nlp.options->SetStringValue("duals_init", "zero"); nlp.options->SetStringValue("mem_space", "device"); nlp.options->SetStringValue("fact_acceptor", "inertia_free"); diff --git a/src/Interface/hiop_defs.hpp.in b/src/Interface/hiop_defs.hpp.in index 68570c8..ec47a2c 100644 --- a/src/Interface/hiop_defs.hpp.in +++ b/src/Interface/hiop_defs.hpp.in @@ -11,6 +11,7 @@ #cmakedefine HIOP_USE_STRUMPACK #cmakedefine HIOP_USE_PARDISO #cmakedefine HIOP_USE_RESOLVE +#cmakedefine HIOP_USE_EVLOSER #cmakedefine HIOP_USE_GINKGO #cmakedefine HIOP_USE_AXOM #define HIOP_VERSION "@PROJECT_VERSION@" diff --git a/src/LinAlg/CMakeLists.txt b/src/LinAlg/CMakeLists.txt index 8710490..6d5b164 100644 --- a/src/LinAlg/CMakeLists.txt +++ b/src/LinAlg/CMakeLists.txt @@ -12,6 +12,7 @@ set(hiopLinAlg_INTERFACE_HEADERS hiopLinSolverSparseSTRUMPACK.hpp hiopLinSolverSparsePARDISO.hpp hiopLinSolverSparseReSolve.hpp + hiopLinSolverSparseEVLOSER.hpp hiopLinSolverUMFPACKZ.hpp hiopLinSolverCholCuSparse.hpp hiopMatrix.hpp @@ -101,6 +102,11 @@ set(hiopLinAlg_CUSOLVER_LU_SRC hiopLinSolverSparseReSolve.cpp ) +# EVLOSER wrapper source is separate from the ReSolve wrapper. If EVLOSER replaces ReSolve, merge this source path intentionally. +set(hiopLinAlg_EVLOSER_SRC + hiopLinSolverSparseEVLOSER.cpp +) + set(hiopLinAlg_CUSOLVER_CHOL_SRC hiopLinSolverCholCuSparse.cpp ) @@ -149,10 +155,21 @@ if(HIOP_SPARSE) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_PARDISO_SRC}) endif(HIOP_USE_PARDISO) if(HIOP_USE_RESOLVE) - add_subdirectory(ReSolve) - list(APPEND hiopLinAlg_SRC ${hiopLinAlg_CUSOLVER_LU_SRC}) - set_source_files_properties(${hiopLinAlg_CUSOLVER_LU_SRC} PROPERTIES LANGUAGE CUDA) - endif(HIOP_USE_RESOLVE) + if(HIOP_USE_CUDA) + add_subdirectory(ReSolve) + list(APPEND hiopLinAlg_SRC ${hiopLinAlg_CUSOLVER_LU_SRC}) + set_source_files_properties(${hiopLinAlg_CUSOLVER_LU_SRC} PROPERTIES LANGUAGE CUDA) + endif() + endif() + + if(HIOP_USE_EVLOSER) + add_subdirectory(EVLOSER) + list(APPEND hiopLinAlg_SRC ${hiopLinAlg_EVLOSER_SRC}) + if(HIOP_USE_CUDA) + set_source_files_properties(${hiopLinAlg_EVLOSER_SRC} PROPERTIES LANGUAGE CUDA) + endif() + endif() + if(HIOP_USE_CUDA) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_CUSOLVER_CHOL_SRC}) set_source_files_properties(${hiopLinAlg_CUSOLVER_CHOL_SRC} PROPERTIES LANGUAGE CUDA) @@ -219,8 +236,14 @@ install( ) add_library(hiopLinAlg OBJECT ${hiopLinAlg_SRC}) -if(HIOP_USE_RESOLVE) - target_link_libraries(hiop_tpl INTERFACE ReSolve) - install(TARGETS ReSolve EXPORT hiop-targets) +if(HIOP_USE_EVLOSER) + # Link EVLOSER separately so HIP builds do not require the CUDA-only ReSolve target. + target_link_libraries(hiop_tpl INTERFACE EVLOSER) + install(TARGETS EVLOSER EXPORT hiop-targets) +endif() + +if(HIOP_USE_RESOLVE AND HIOP_USE_CUDA) + target_link_libraries(hiop_tpl INTERFACE ReSolve) + install(TARGETS ReSolve EXPORT hiop-targets) endif() target_link_libraries(hiopLinAlg PRIVATE hiop_tpl) diff --git a/src/LinAlg/EVLOSER/CMakeLists.txt b/src/LinAlg/EVLOSER/CMakeLists.txt new file mode 100644 index 0000000..305dc05 --- /dev/null +++ b/src/LinAlg/EVLOSER/CMakeLists.txt @@ -0,0 +1,37 @@ +# Build EVLOSER embedded backend library + +set(EVLOSER_SRC + RefactorizationSolver.cpp + MatrixCsr.cpp +) + +if(HIOP_USE_CUDA) + list(APPEND EVLOSER_SRC + IterativeRefinement.cpp + KrylovSolverKernels.cu + ) +endif() + +add_library(EVLOSER STATIC ${EVLOSER_SRC}) + +if(HIOP_USE_CUDA) + target_compile_definitions(EVLOSER PRIVATE HIOP_USE_CUDA) +endif() + +if(HIOP_USE_HIP) + target_compile_definitions(EVLOSER PRIVATE HIOP_USE_HIP) +endif() + +target_include_directories(EVLOSER PRIVATE + $ + $ +) +target_link_libraries(EVLOSER PRIVATE KLU) + +if(HIOP_USE_CUDA) + target_link_libraries(EVLOSER PRIVATE hiop_cuda) +endif() + +if(HIOP_USE_HIP) + target_link_libraries(EVLOSER PRIVATE hipsparse hipsolver hipblas) +endif() diff --git a/src/LinAlg/EVLOSER/IterativeRefinement.cpp b/src/LinAlg/EVLOSER/IterativeRefinement.cpp new file mode 100644 index 0000000..109f0cd --- /dev/null +++ b/src/LinAlg/EVLOSER/IterativeRefinement.cpp @@ -0,0 +1,700 @@ +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file IterativeRefinement.cpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#include "IterativeRefinement.hpp" + +#if !defined(HIOP_USE_HIP) && !defined(HAVE_HIP) + +#include "hiop_blasdefs.hpp" +#include "KrylovSolverKernels.h" + +#include "klu.h" +#include "cusparse_v2.h" +#include +#include +#include +#include + +#define checkCudaErrors(val) evloserCheckCudaError((val), __FILE__, __LINE__) + +namespace EVLOSER +{ + +// Default constructor +IterativeRefinement::IterativeRefinement() {} + +// Parametrized constructor +IterativeRefinement::IterativeRefinement(int restart, double tol, int maxit) + : restart_{restart}, + maxit_{maxit}, + tol_{tol} +{} + +IterativeRefinement::~IterativeRefinement() +{ + cusparseDestroySpMat(mat_A_); + // free GPU variables that belong to this class and are not shared with CUSOLVER class + cudaFree(mv_buffer_); + cudaFree(d_V_); + cudaFree(d_Z_); + cudaFree(d_rvGPU_); + cudaFree(d_Hcolumn_); + + if(orth_option_ == "cgs2") { + cudaFree(d_H_col_); + } + // delete all CPU GMRES variables + delete[] h_H_; + + if(orth_option_ == "mgs_two_synch" || orth_option_ == "mgs_pm") { + delete[] h_L_; + delete[] h_rv_; + } + delete[] h_c_; + delete[] h_s_; + delete[] h_rs_; + + if(orth_option_ == "mgs_pm" || orth_option_ == "cgs2") { + delete[] h_aux_; + } +} + +int IterativeRefinement::setup_system_matrix(int n, int nnz, int* dia, int* dja, double* da) +{ + dia_ = dia; + dja_ = dja; + da_ = da; + n_ = n; + nnz_ = nnz; + checkCudaErrors(cusparseCreateCsr(&mat_A_, + n, + n, + nnz, + dia_, + dja_, + da_, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F)); + return 0; +} + +int IterativeRefinement::setup(cusparseHandle_t cusparse_handle, + cublasHandle_t cublas_handle, + cusolverRfHandle_t cusolverrf_handle, + int n, + double* d_T, + int* d_P, + int* d_Q, + double* devx, + double* devr) +{ + cusparse_handle_ = cusparse_handle; + cublas_handle_ = cublas_handle; + cusolverrf_handle_ = cusolverrf_handle; + assert(n_ == n && "Size of the linear system incorrectly set in the iterative refinement class!\n"); + + // only set pointers + d_T_ = d_T; + d_P_ = d_P; + d_Q_ = d_Q; + + // setup matvec + + cusparseCreateDnVec(&vec_x_, n_, devx, CUDA_R_64F); + cusparseCreateDnVec(&vec_Ax_, n_, devr, CUDA_R_64F); + size_t buffer_size; + checkCudaErrors(cusparseSpMV_bufferSize(cusparse_handle_, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &(minusone_), + mat_A_, + vec_x_, + &(one_), + vec_Ax_, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + &buffer_size)); + + cudaDeviceSynchronize(); + checkCudaErrors(cudaMalloc(&mv_buffer_, buffer_size)); + + // allocate space for the GPU + + checkCudaErrors(cudaMalloc(&(d_V_), n_ * (restart_ + 1) * sizeof(double))); + checkCudaErrors(cudaMalloc(&(d_Z_), n_ * (restart_ + 1) * sizeof(double))); + checkCudaErrors(cudaMalloc(&(d_rvGPU_), 2 * (restart_ + 1) * sizeof(double))); + checkCudaErrors(cudaMalloc(&(d_Hcolumn_), 2 * (restart_ + 1) * (restart_ + 1) * sizeof(double))); + + // and for the CPU + + h_H_ = new double[restart_ * (restart_ + 1)]; + h_c_ = new double[restart_]; // needed for givens + h_s_ = new double[restart_]; // same + h_rs_ = new double[restart_ + 1]; // for residual norm history + + // for specific orthogonalization options, need a little more memory + if(orth_option_ == "mgs_two_synch" || orth_option_ == "mgs_pm") { + h_L_ = new double[restart_ * (restart_ + 1)]; + h_rv_ = new double[restart_ + 1]; + } + + if(orth_option_ == "cgs2") { + h_aux_ = new double[restart_ + 1]; + checkCudaErrors(cudaMalloc(&(d_H_col_), (restart_ + 1) * sizeof(double))); + } + + if(orth_option_ == "mgs_pm") { + h_aux_ = new double[restart_ + 1]; + } + return 0; +} + +double IterativeRefinement::getFinalResidalNorm() { return final_residual_norm_; } + +double IterativeRefinement::getInitialResidalNorm() { return initial_residual_norm_; } + +double IterativeRefinement::getBNorm() { return bnorm_; } + +int IterativeRefinement::getFinalNumberOfIterations() { return fgmres_iters_; } + +double IterativeRefinement::matrixAInfNrm() +{ + double nrm; + evloser_matrix_row_sums(n_, nnz_, dia_, da_, d_Z_); + cusolverSpDnrminf(cusolver_handle_, n_, d_Z_, &nrm, mv_buffer_ /* at least 8192 bytes */); + return nrm; +} + +double IterativeRefinement::vectorInfNrm(int n, double* d_v) +{ + double nrm; + + cusolverSpDnrminf(cusolver_handle_, n, d_v, &nrm, mv_buffer_ /* at least 8192 bytes */); + return nrm; +} + +void IterativeRefinement::fgmres(double* d_x, double* d_b) +{ + int outer_flag = 1; + int notconv = 1; + int i = 0; + int it = 0; + int j; + int k; + int k1; + + double t; + double rnorm; + double bnorm; + // double rnorm_aux; + double tolrel; + // V[0] = b-A*x_0 + cudaMemcpy(&(d_V_[0]), d_b, sizeof(double) * n_, cudaMemcpyDeviceToDevice); + + cudaMatvec(d_x, d_V_, "residual"); + + rnorm = 0.0; + cublasDdot(cublas_handle_, n_, d_b, 1, d_b, 1, &bnorm); + cublasDdot(cublas_handle_, n_, d_V_, 1, d_V_, 1, &rnorm); + // rnorm = ||V_1|| + rnorm = sqrt(rnorm); + bnorm = sqrt(bnorm); + bnorm_ = bnorm; + while(outer_flag) { + // check if maybe residual is already small enough? + if(it == 0) { + tolrel = tol_ * rnorm; + if(fabs(tolrel) < 1e-16) { + tolrel = 1e-16; + } + } + int exit_cond = 0; + if(conv_cond() == 0) { + exit_cond = ((fabs(rnorm - ZERO) <= EPSILON)); + } else { + if(conv_cond() == 1) { + exit_cond = ((fabs(rnorm - ZERO) <= EPSILON) || (rnorm < tol_)); + } else { + if(conv_cond() == 2) { + exit_cond = ((fabs(rnorm - ZERO) <= EPSILON) || (rnorm < (tol_ * bnorm))); + } + } + } + if(exit_cond) { + outer_flag = 0; + final_residual_norm_ = rnorm; + initial_residual_norm_ = rnorm; + fgmres_iters_ = 0; + break; + } + + // normalize first vector + t = 1.0 / rnorm; + cublasDscal(cublas_handle_, n_, &t, d_V_, 1); + + // initialize norm history + + h_rs_[0] = rnorm; + initial_residual_norm_ = rnorm; + i = -1; + notconv = 1; + + while((notconv) && (it < maxit_)) { + i++; + it++; + // Z_i = (LU)^{-1}*V_i + cudaMemcpy(&d_Z_[i * n_], &d_V_[i * n_], sizeof(double) * n_, cudaMemcpyDeviceToDevice); + checkCudaErrors(cusolverRfSolve(cusolverrf_handle_, d_P_, d_Q_, 1, d_T_, n_, &d_Z_[i * n_], n_)); + cudaDeviceSynchronize(); + // V_{i+1}=A*Z_i + cudaMatvec(&d_Z_[i * n_], &d_V_[(i + 1) * n_], "matvec"); + // orthogonalize V[i+1], form a column of h_L + GramSchmidt(i); + + if(i != 0) { + for(int k = 1; k <= i; k++) { + k1 = k - 1; + t = h_H_[i * (restart_ + 1) + k1]; + h_H_[i * (restart_ + 1) + k1] = h_c_[k1] * t + h_s_[k1] * h_H_[i * (restart_ + 1) + k]; + h_H_[i * (restart_ + 1) + k] = -h_s_[k1] * t + h_c_[k1] * h_H_[i * (restart_ + 1) + k]; + } + } // if i!=0 + + double Hii = h_H_[i * (restart_ + 1) + i]; + double Hii1 = h_H_[(i) * (restart_ + 1) + i + 1]; + double gam = sqrt(Hii * Hii + Hii1 * Hii1); + + if(fabs(gam - ZERO) <= EPSILON) { + gam = EPSMAC; + } + + /* next Given's rotation */ + h_c_[i] = Hii / gam; + h_s_[i] = Hii1 / gam; + h_rs_[i + 1] = -h_s_[i] * h_rs_[i]; + h_rs_[i] = h_c_[i] * h_rs_[i]; + + h_H_[(i) * (restart_ + 1) + (i)] = h_c_[i] * Hii + h_s_[i] * Hii1; + h_H_[(i) * (restart_ + 1) + (i + 1)] = h_c_[i] * Hii1 - h_s_[i] * Hii; + + // residual norm estimate + rnorm = fabs(h_rs_[i + 1]); + // check convergence + if(i + 1 >= restart_ || rnorm <= tolrel || it >= maxit_) { + notconv = 0; + } + } // inner while + + // solve tri system + h_rs_[i] = h_rs_[i] / h_H_[i * (restart_ + 1) + i]; + for(int ii = 2; ii <= i + 1; ii++) { + k = i - ii + 1; + k1 = k + 1; + t = h_rs_[k]; + for(j = k1; j <= i; j++) { + t -= h_H_[j * (restart_ + 1) + k] * h_rs_[j]; + } + h_rs_[k] = t / h_H_[k * (restart_ + 1) + k]; + } + + // get solution + for(j = 0; j <= i; j++) { + cublasDaxpy(cublas_handle_, n_, &h_rs_[j], &d_Z_[j * n_], 1, d_x, 1); + } + + /* test solution */ + + if(rnorm <= tolrel || it >= maxit_) { + // rnorm_aux = rnorm; + outer_flag = 0; + } + + cudaMemcpy(&d_V_[0], d_b, sizeof(double) * n_, cudaMemcpyDeviceToDevice); + cudaMatvec(d_x, d_V_, "residual"); + + rnorm = 0.0; + cublasDdot(cublas_handle_, n_, d_V_, 1, d_V_, 1, &rnorm); + // rnorm = ||V_1|| + rnorm = sqrt(rnorm); + + if(!outer_flag) { + final_residual_norm_ = rnorm; + fgmres_iters_ = it; + } + } // outer while +} + +// b-Ax +void IterativeRefinement::cudaMatvec(double* d_x, double* d_b, std::string option) +{ + cusparseCreateDnVec(&vec_x_, n_, d_x, CUDA_R_64F); + cusparseCreateDnVec(&vec_Ax_, n_, d_b, CUDA_R_64F); + if(option == "residual") { + // b = b-Ax + cusparseSpMV(cusparse_handle_, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &minusone_, + mat_A_, + vec_x_, + &one_, + vec_Ax_, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + mv_buffer_); + } else { + // just b = A*x + cusparseSpMV(cusparse_handle_, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &one_, + mat_A_, + vec_x_, + &zero_, + vec_Ax_, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + mv_buffer_); + } + cusparseDestroyDnVec(vec_x_); + cusparseDestroyDnVec(vec_Ax_); +} + +void IterativeRefinement::GramSchmidt(int i) +{ + double t; + const double one = 1.0; + const double minusone = -1.0; + const double zero = 0.0; + double s; + int sw = 0; + if(orth_option_ == "mgs") { + sw = 0; + } else { + if(orth_option_ == "cgs2") { + sw = 1; + } else { + if(orth_option_ == "mgs_two_synch") { + sw = 2; + } else { + if(orth_option_ == "mgs_pm") { + sw = 3; + } else { + // display error message and set sw = 0; + /* + nlp_->log->printf(hovWarning, + "Wrong Gram-Schmidt option. Setting default (modified Gram-Schmidt, mgs) ...\n"); + */ + sw = 0; + } + } + } + } + + switch(sw) { + case 0: // mgs + + for(int j = 0; j <= i; ++j) { + t = 0.0; + cublasDdot(cublas_handle_, n_, &d_V_[j * n_], 1, &d_V_[(i + 1) * n_], 1, &t); + + h_H_[i * (restart_ + 1) + j] = t; + t *= -1.0; + + cublasDaxpy(cublas_handle_, n_, &t, &d_V_[j * n_], 1, &d_V_[(i + 1) * n_], 1); + } + t = 0.0; + cublasDdot(cublas_handle_, n_, &d_V_[(i + 1) * n_], 1, &d_V_[(i + 1) * n_], 1, &t); + + // set the last entry in Hessenberg matrix + t = sqrt(t); + h_H_[(i) * (restart_ + 1) + i + 1] = t; + if(t != 0.0) { + t = 1.0 / t; + cublasDscal(cublas_handle_, n_, &t, &d_V_[(i + 1) * n_], 1); + } else { + assert(0 && "Iterative refinement failed, Krylov vector with zero norm\n"); + } + break; + + case 1: // cgs2 + // Hcol = V(:,1:i)^T *V(:,i+1); + cublasDgemv(cublas_handle_, CUBLAS_OP_T, n_, i + 1, &one_, d_V_, n_, &d_V_[(i + 1) * n_], 1, &zero_, d_H_col_, 1); + // V(:,i+1) = V(:, i+1) - V(:,1:i)*Hcol + cublasDgemv(cublas_handle_, CUBLAS_OP_N, n_, i + 1, &minusone_, d_V_, n_, d_H_col_, 1, &one_, &d_V_[n_ * (i + 1)], 1); + // copy H_col to aux, we will need it later + + cudaMemcpy(h_aux_, d_H_col_, sizeof(double) * (i + 1), cudaMemcpyDeviceToHost); + + // Hcol = V(:,1:i)*V(:,i+1); + cublasDgemv(cublas_handle_, CUBLAS_OP_T, n_, i + 1, &one_, d_V_, n_, &d_V_[(i + 1) * n_], 1, &zero_, d_H_col_, 1); + // V(:,i+1) = V(:, i+1) - V(:,1:i)*Hcol + + cublasDgemv(cublas_handle_, CUBLAS_OP_N, n_, i + 1, &minusone_, d_V_, n_, d_H_col_, 1, &one_, &d_V_[n_ * (i + 1)], 1); + // copy H_col to H + + cudaMemcpy(&h_H_[i * (restart_ + 1)], d_H_col_, sizeof(double) * (i + 1), cudaMemcpyDeviceToHost); + // add both pieces together (unstable otherwise, careful here!!) + for(int j = 0; j <= i; ++j) { + h_H_[i * (restart_ + 1) + j] += h_aux_[j]; + } + t = 0.0; + cublasDdot(cublas_handle_, n_, &d_V_[(i + 1) * n_], 1, &d_V_[(i + 1) * n_], 1, &t); + + // set the last entry in Hessenberg matrix + t = sqrt(t); + h_H_[(i) * (restart_ + 1) + i + 1] = t; + if(t != 0.0) { + t = 1.0 / t; + cublasDscal(cublas_handle_, n_, &t, &d_V_[(i + 1) * n_], 1); + } else { + assert(0 && "Iterative refinement failed, Krylov vector with zero norm\n"); + } + break; + // the two low synch schemes + case 2: + // KS: the kernels are limited by the size of the shared memory on the GPU. If too many vectors in Krylov space, use + // standard cublas routines. V[1:i]^T[V[i] w] + if(i < 200) { + evloser_mass_inner_product_two_vectors(n_, i, &d_V_[i * n_], &d_V_[(i + 1) * n_], d_V_, d_rvGPU_); + } else { + cublasDgemm(cublas_handle_, + CUBLAS_OP_T, + CUBLAS_OP_N, + i + 1, // m + 2, // n + n_, // k + &one, // alpha + d_V_, // A + n_, // lda + &d_V_[i * n_], // B + n_, // ldb + &zero, + d_rvGPU_, // c + i + 1); // ldc + } + // copy rvGPU to L + cudaMemcpy(&h_L_[(i) * (restart_ + 1)], d_rvGPU_, (i + 1) * sizeof(double), cudaMemcpyDeviceToHost); + + cudaMemcpy(h_rv_, &d_rvGPU_[i + 1], (i + 1) * sizeof(double), cudaMemcpyDeviceToHost); + + for(int j = 0; j <= i; ++j) { + h_H_[(i) * (restart_ + 1) + j] = 0.0; + } + // triangular solve + for(int j = 0; j <= i; ++j) { + h_H_[(i) * (restart_ + 1) + j] = h_rv_[j]; + s = 0.0; + for(int k = 0; k < j; ++k) { + s += h_L_[j * (restart_ + 1) + k] * h_H_[(i) * (restart_ + 1) + k]; + } // for k + h_H_[(i) * (restart_ + 1) + j] -= s; + } // for j + + cudaMemcpy(d_Hcolumn_, &h_H_[(i) * (restart_ + 1)], (i + 1) * sizeof(double), cudaMemcpyHostToDevice); + // again, use std cublas functions if Krylov space is too large + if(i < 200) { + evloser_mass_axpy(n_, i, d_V_, &d_V_[(i + 1) * n_], d_Hcolumn_); + } else { + cublasDgemm(cublas_handle_, + CUBLAS_OP_N, + CUBLAS_OP_N, + n_, // m + 1, // n + i + 1, // k + &minusone, // alpha + d_V_, // A + n_, // lda + d_Hcolumn_, // B + i + 1, // ldb + &one, + &d_V_[(i + 1) * n_], // c + n_); // ldc + } + // normalize (second synch) + t = 0.0; + cublasDdot(cublas_handle_, n_, &d_V_[(i + 1) * n_], 1, &d_V_[(i + 1) * n_], 1, &t); + + // set the last entry in Hessenberg matrix + t = sqrt(t); + h_H_[(i) * (restart_ + 1) + i + 1] = t; + if(t != 0.0) { + t = 1.0 / t; + cublasDscal(cublas_handle_, n_, &t, &d_V_[(i + 1) * n_], 1); + } else { + assert(0 && "Iterative refinement failed, Krylov vector with zero norm\n"); + } + break; + + case 3: // two synch Gauss-Seidel mgs, SUPER STABLE + // according to unpublisjed work by ST + // L is where we keep the triangular matrix(L is ON THE CPU) + // if Krylov space is too large, use std cublas (because out of shared mmory) + if(i < 200) { + evloser_mass_inner_product_two_vectors(n_, i, &d_V_[i * n_], &d_V_[(i + 1) * n_], d_V_, d_rvGPU_); + } else { + cublasDgemm(cublas_handle_, + CUBLAS_OP_T, + CUBLAS_OP_N, + i + 1, // m + 2, // n + n_, // k + &one, // alpha + d_V_, // A + n_, // lda + &d_V_[i * n_], // B + n_, // ldb + &zero, + d_rvGPU_, // c + i + 1); // ldc + } + // copy rvGPU to L + cudaMemcpy(&h_L_[(i) * (restart_ + 1)], d_rvGPU_, (i + 1) * sizeof(double), cudaMemcpyDeviceToHost); + + cudaMemcpy(h_rv_, &d_rvGPU_[i + 1], (i + 1) * sizeof(double), cudaMemcpyDeviceToHost); + + for(int j = 0; j <= i; ++j) { + h_H_[(i) * (restart_ + 1) + j] = 0.0; + } + // triangular solve + for(int j = 0; j <= i; ++j) { + h_H_[(i) * (restart_ + 1) + j] = h_rv_[j]; + s = 0.0; + for(int k = 0; k < j; ++k) { + s += h_L_[j * (restart_ + 1) + k] * h_H_[(i) * (restart_ + 1) + k]; + } // for k + h_H_[(i) * (restart_ + 1) + j] -= s; + } // for j + + // now compute h_rv = L^T h_H + double h; + for(int j = 0; j <= i; ++j) { + // go through COLUMN OF L + h_rv_[j] = 0.0; + for(int k = j + 1; k <= i; ++k) { + h = h_L_[k * (restart_ + 1) + j]; + h_rv_[j] += h_H_[(i) * (restart_ + 1) + k] * h; + } + } + + // and do one more tri solve with L^T: h_aux = (I-L)^{-1}h_rv + for(int j = 0; j <= i; ++j) { + h_aux_[j] = h_rv_[j]; + s = 0.0; + for(int k = 0; k < j; ++k) { + s += h_L_[j * (restart_ + 1) + k] * h_aux_[k]; + } // for k + h_aux_[j] -= s; + } // for j + + // and now subtract that from h_H + for(int j = 0; j <= i; ++j) { + h_H_[(i) * (restart_ + 1) + j] -= h_aux_[j]; + } + cudaMemcpy(d_Hcolumn_, &h_H_[(i) * (restart_ + 1)], (i + 1) * sizeof(double), cudaMemcpyHostToDevice); + // if Krylov space too large, use std cublas routines + if(i < 200) { + evloser_mass_axpy(n_, i, d_V_, &d_V_[(i + 1) * n_], d_Hcolumn_); + } else { + cublasDgemm(cublas_handle_, + CUBLAS_OP_N, + CUBLAS_OP_N, + n_, // m + 1, // n + i + 1, // k + &minusone, // alpha + d_V_, // A + n_, // lda + d_Hcolumn_, // B + i + 1, // ldb + &one, + &d_V_[(i + 1) * n_], // c + n_); // ldc + } + // normalize (second synch) + t = 0.0; + cublasDdot(cublas_handle_, n_, &d_V_[(i + 1) * n_], 1, &d_V_[(i + 1) * n_], 1, &t); + + // set the last entry in Hessenberg matrix + t = sqrt(t); + h_H_[(i) * (restart_ + 1) + i + 1] = t; + if(t != 0.0) { + t = 1.0 / t; + cublasDscal(cublas_handle_, n_, &t, &d_V_[(i + 1) * n_], 1); + } else { + assert(0 && "Iterative refinement failed, Krylov vector with zero norm\n"); + } + break; + + default: + assert(0 && "Iterative refinement failed, wrong orthogonalization.\n"); + break; + } // switch +} // GramSchmidt + +// Error checking utility for CUDA +// KS: might later become part of src/Utils, putting it here for now +template +void IterativeRefinement::evloserCheckCudaError(T result, const char* const file, int const line) +{ +#ifdef DEBUG + if(result) { + fprintf(stdout, "CUDA error at %s:%d, error# %d\n", file, line, result); + assert(false); + } +#endif +} + +} // namespace EVLOSER +#endif // !defined(HIOP_USE_HIP) && !defined(HAVE_HIP) diff --git a/src/LinAlg/EVLOSER/IterativeRefinement.hpp b/src/LinAlg/EVLOSER/IterativeRefinement.hpp new file mode 100644 index 0000000..2bc9f7b --- /dev/null +++ b/src/LinAlg/EVLOSER/IterativeRefinement.hpp @@ -0,0 +1,242 @@ +/** + * @file IterativeRefinement.hpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#pragma once + +#include "klu.h" +#include "evloser_gpu_defs.hpp" +#include + +namespace EVLOSER +{ + +constexpr double ZERO = 0.0; +constexpr double EPSILON = 1.0e-18; +constexpr double EPSMAC = 1.0e-16; + +#if defined(HIOP_USE_HIP) || defined(HAVE_HIP) + +/** + * @brief No-op iterative refinement interface for HIP EVLOSER builds. + * + * EVLOSER iterative refinement currently depends on CUDA-only GLU/Krylov + * kernels. HIP builds keep this interface available so shared solver code can + * compile, but the HIP EVLOSER path disables iterative refinement. + */ +class IterativeRefinement +{ +public: + IterativeRefinement() = default; + IterativeRefinement(int restart, double tol, int maxit) + : restart_{restart}, + maxit_{maxit}, + tol_{tol} + {} + ~IterativeRefinement() = default; + + int setup(cusparseHandle_t, + cublasHandle_t, + evloserRfHandle_t, + int, + double*, + int*, + int*, + double*, + double*) + { + return -1; + } + + int getFinalNumberOfIterations() { return 0; } + double getFinalResidalNorm() { return 0.0; } + double getInitialResidalNorm() { return 0.0; } + double getBNorm() { return 0.0; } + + void fgmres(double*, double*) {} + + void set_tol(double tol) { tol_ = tol; } + + int setup_system_matrix(int, int, int*, int*, double*) { return -1; } + + int& maxit() { return maxit_; } + + double& tol() { return tol_; } + + std::string& orth_option() { return orth_option_; } + + int& restart() { return restart_; } + + int& conv_cond() { return conv_cond_; } + +private: + int restart_{0}; + int maxit_{0}; + double tol_{0.0}; + int conv_cond_{0}; + std::string orth_option_{"mgs"}; +}; + +#else + +/** + * @brief Iterative refinement class + * + */ +class IterativeRefinement +{ +public: + IterativeRefinement(); + IterativeRefinement(int restart, double tol, int maxit); + ~IterativeRefinement(); + int setup(cusparseHandle_t cusparse_handle, + cublasHandle_t cublas_handle, + evloserRfHandle_t cusolverrf_handle, + int n, + double* d_T, + int* d_P, + int* d_Q, + double* devx, + double* devr); + + int getFinalNumberOfIterations(); + double getFinalResidalNorm(); + double getInitialResidalNorm(); + double getBNorm(); + // this is public on purpose, can be used internally or outside, to compute the residual. + void fgmres(double* d_x, double* d_b); + void set_tol(double tol) { tol_ = tol; } ///< Set tolerance for the Krylov solver + + /** + * @brief Set the up system matrix object mat_A_ of type cusparseSpMatDescr_t + * + * @param n - size of the matrix + * @param nnz - number of nonzeros in the matrix + * @param irow - array of row pointers + * @param jcol - array of column indices + * @param val - array of sparse matrix values + * + * @return int + * + * @pre Arrays `irow`, `jcol` and `val` are on the device. + */ + int setup_system_matrix(int n, int nnz, int* irow, int* jcol, double* val); + + // Simple accessors + int& maxit() { return maxit_; } + + double& tol() { return tol_; } + + std::string& orth_option() { return orth_option_; } + + int& restart() { return restart_; } + + int& conv_cond() { return conv_cond_; } + +private: + // Krylov vectors + double* d_V_{nullptr}; + double* d_Z_{nullptr}; + + double final_residual_norm_; + double initial_residual_norm_; + double bnorm_; + int fgmres_iters_; + + // Solver parameters + int restart_; + int maxit_; + double tol_; + int conv_cond_; ///< convergence condition, can be 0, 1, 2 for IR + std::string orth_option_; + + // System matrix data + int n_; + int nnz_; + int* dia_{nullptr}; + int* dja_{nullptr}; + double* da_{nullptr}; + cusparseSpMatDescr_t mat_A_{nullptr}; + + // Matrix-vector product data + cusparseDnVecDescr_t vec_x_{nullptr}; + cusparseDnVecDescr_t vec_Ax_{nullptr}; + + // CUDA libraries handles - MUST BE SET AT INIT + cusparseHandle_t cusparse_handle_{nullptr}; + cublasHandle_t cublas_handle_{nullptr}; + evloserRfHandle_t cusolverrf_handle_{nullptr}; + cusolverSpHandle_t cusolver_handle_{nullptr}; + + // GPU data (?) + double* d_T_{nullptr}; + int* d_P_{nullptr}; + int* d_Q_{nullptr}; + + double* d_rvGPU_{nullptr}; + double* d_Hcolumn_{nullptr}; + double* d_H_col_{nullptr}; + void* mv_buffer_{nullptr}; ///< SpMV buffer + + // CPU: + double* h_L_{nullptr}; + double* h_H_{nullptr}; + double* h_rv_{nullptr}; + // for givens rotations + double* h_c_{nullptr}; + double* h_s_{nullptr}; + // for Hessenberg system + double* h_rs_{nullptr}; + // neded in some of the orthogonalization methods + double* h_aux_{nullptr}; + + // TODO: Something needs to be done with this :) + const double minusone_ = -1.0; + const double one_ = 1.0; + const double zero_ = 0.0; + + /** + * @brief orthogonalize i+1 vector against i vectors already orthogonal + * + * Private function needed for FGMRES. + * + * @param[in] i - number of orthogonal vectors + */ + void GramSchmidt(int i); + + /** + * @brief matvec black-box: b = b - A*d_x if option is "residual" and b=A*x + * if option is "matvec" + * + * @param d_x + * @param d_b + * @param option + * + * @todo Document d_x and d_b; are both of them modified in this function? + */ + void cudaMatvec(double* d_x, double* d_b, std::string option); + + // KS: needed for testing -- condider delating later + double matrixAInfNrm(); + double vectorInfNrm(int n, double* d_v); + // end of testing + + /** + * @brief Check for CUDA errors. + * + * @tparam T - type of the result + * @param result - result value + * @param file - file name where the error occured + * @param line - line at which the error occured + */ + template + void evloserCheckCudaError(T result, const char* const file, int const line); +}; + +#endif // defined(HIOP_USE_HIP) || defined(HAVE_HIP) + +} // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/KrylovSolverKernels.cu b/src/LinAlg/EVLOSER/KrylovSolverKernels.cu new file mode 100644 index 0000000..f6185c8 --- /dev/null +++ b/src/LinAlg/EVLOSER/KrylovSolverKernels.cu @@ -0,0 +1,215 @@ +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file KrylovSolverKernels.cu + * + * @author Kasia Swirydowicz , PNNL + */ +#include "KrylovSolverKernels.h" +#define maxk 1024 +#define Tv5 1024 +//computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1 +__global__ void evloser_MassIPTwoVec_kernel(const double* __restrict__ u1, + const double* __restrict__ u2, + const double* __restrict__ v, + double* result, + const int k, + const int N) +{ + int t = threadIdx.x; + int bsize = blockDim.x; + + // assume T threads per thread block (and k reductions to be performed) + volatile __shared__ double s_tmp1[Tv5]; + + volatile __shared__ double s_tmp2[Tv5]; + // map between thread index space and the problem index space + int j = blockIdx.x; + s_tmp1[t] = 0.0f; + s_tmp2[t] = 0.0f; + int nn = t; + double can1, can2, cbn; + + while(nn < N) { + can1 = u1[nn]; + can2 = u2[nn]; + + cbn = v[N * j + nn]; + s_tmp1[t] += can1 * cbn; + s_tmp2[t] += can2 * cbn; + + nn += bsize; + } + + __syncthreads(); + + if(Tv5 >= 1024) { + if(t < 512) { + s_tmp1[t] += s_tmp1[t + 512]; + s_tmp2[t] += s_tmp2[t + 512]; + } + __syncthreads(); + } + if(Tv5 >= 512) { + if(t < 256) { + s_tmp1[t] += s_tmp1[t + 256]; + s_tmp2[t] += s_tmp2[t + 256]; + } + __syncthreads(); + } + { + if(t < 128) { + s_tmp1[t] += s_tmp1[t + 128]; + s_tmp2[t] += s_tmp2[t + 128]; + } + __syncthreads(); + } + { + if(t < 64) { + s_tmp1[t] += s_tmp1[t + 64]; + s_tmp2[t] += s_tmp2[t + 64]; + } + __syncthreads(); + } + + if(t < 32) { + s_tmp1[t] += s_tmp1[t + 32]; + s_tmp2[t] += s_tmp2[t + 32]; + + s_tmp1[t] += s_tmp1[t + 16]; + s_tmp2[t] += s_tmp2[t + 16]; + + s_tmp1[t] += s_tmp1[t + 8]; + s_tmp2[t] += s_tmp2[t + 8]; + + s_tmp1[t] += s_tmp1[t + 4]; + s_tmp2[t] += s_tmp2[t + 4]; + + s_tmp1[t] += s_tmp1[t + 2]; + s_tmp2[t] += s_tmp2[t + 2]; + + s_tmp1[t] += s_tmp1[t + 1]; + s_tmp2[t] += s_tmp2[t + 1]; + } + if(t == 0) { + result[blockIdx.x] = s_tmp1[0]; + result[blockIdx.x + k] = s_tmp2[0]; + } +} + + +//mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES + +__global__ void evloser_massAxpy3_kernel(int N, + int k, + const double* x_data, + double* y_data, + const double* alpha) { + + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int t = threadIdx.x; + __shared__ double s_alpha[maxk]; + if(t < k) { + s_alpha[t] = alpha[t]; + } + __syncthreads(); + + if(i < N) { + double temp = 0.0f; + for(int j = 0; j < k; ++j) { + temp += x_data[j * N + i] * s_alpha[j]; + } + y_data[i] -= temp; + } +} + +__global__ void evloser_matrixInfNormPart1(const int n, + const int nnz, + const int* a_ia, + const double* a_val, + double* result) { + + // one thread per row, pass through rows + // and sum + // can be done through atomics + //\sum_{j=1}^m abs(a_{ij}) + + int idx = blockIdx.x*blockDim.x + threadIdx.x; + while (idx < n){ + double sum = 0.0f; + for (int i = a_ia[idx]; i < a_ia[idx+1]; ++i) { + sum = sum + fabs(a_val[i]); + } + result[idx] = sum; + idx += (blockDim.x*gridDim.x); + } +} + + +void evloser_mass_inner_product_two_vectors(int n, + int i, + double* vec1, + double* vec2, + double* mvec, + double* result) +{ + evloser_MassIPTwoVec_kernel<<>>(vec1, vec2, mvec, result, i + 1, n); +} +void evloser_mass_axpy(int n, int i, double* x, double* y, double* alpha) +{ + evloser_massAxpy3_kernel<<<(n + 384 - 1) / 384, 384>>>(n, i + 1, x, y, alpha); +} + +void evloser_matrix_row_sums(int n, + int nnz, + int* a_ia, + double* a_val, + double* result) +{ + evloser_matrixInfNormPart1<<<1000,1024>>>(n, nnz, a_ia, a_val, result); +} diff --git a/src/LinAlg/EVLOSER/KrylovSolverKernels.h b/src/LinAlg/EVLOSER/KrylovSolverKernels.h new file mode 100644 index 0000000..e5eff1b --- /dev/null +++ b/src/LinAlg/EVLOSER/KrylovSolverKernels.h @@ -0,0 +1,69 @@ +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file src/LinAlg/KrylovSolverKernels.h + * + * @author Kasia Swirydowicz , PNNL + * + */ + + +void evloser_mass_inner_product_two_vectors(int n, + int i, + double* vec1, + double* vec2, + double* mvec, + double* result); +void evloser_mass_axpy(int n, int i, double* x, double* y, double* alpha); + +//needed for matrix inf nrm +void evloser_matrix_row_sums(int n, + int nnz, + int* a_ia, + double* a_val, + double* result); diff --git a/src/LinAlg/EVLOSER/MatrixCsr.cpp b/src/LinAlg/EVLOSER/MatrixCsr.cpp new file mode 100644 index 0000000..3b8f505 --- /dev/null +++ b/src/LinAlg/EVLOSER/MatrixCsr.cpp @@ -0,0 +1,349 @@ +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file MatrixCsr.cpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#include "MatrixCsr.hpp" + +#include "evloser_gpu_defs.hpp" +#include +#include +#include +#include +#include +#include + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +#define checkGpuErrors(val) evloserCheckGpuError((val), __FILE__, __LINE__) +#endif + +namespace EVLOSER +{ + +MatrixCsr::MatrixCsr(ExecutionMode execution_mode) : execution_mode_(execution_mode) {} + +MatrixCsr::~MatrixCsr() +{ + clear_data(); +} + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +bool MatrixCsr::has_device_storage() const +{ + if(execution_mode_ == ExecutionMode::CPU) { + return false; + } + + const bool size_allocated = (n_ == 0) || (irows_ != nullptr); + const bool nnz_allocated = (nnz_ == 0) || (jcols_ != nullptr && vals_ != nullptr); + return size_allocated && nnz_allocated; +} +#endif + +bool MatrixCsr::has_host_mirror() const +{ + const bool size_allocated = (n_ == 0) || (irows_host_ != nullptr); + const bool nnz_allocated = (nnz_ == 0) || (jcols_host_ != nullptr && vals_host_ != nullptr); + return size_allocated && nnz_allocated; +} + +void MatrixCsr::allocate_size(int n) +{ + bool storage_allocated = irows_host_ != nullptr; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + storage_allocated = storage_allocated || irows_ != nullptr; + } +#endif + + if(storage_allocated) { + clear_data(); + } + + n_ = n; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&irows_), (n_ + 1) * sizeof(int))); + } +#endif + + irows_host_ = new int[n_ + 1]{0}; +} + +void MatrixCsr::allocate_nnz(int nnz) +{ + bool storage_allocated = jcols_host_ != nullptr || vals_host_ != nullptr; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + storage_allocated = storage_allocated || jcols_ != nullptr || vals_ != nullptr; + } +#endif + + if(storage_allocated) { +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + checkGpuErrors(evloserGpuFree(jcols_)); + checkGpuErrors(evloserGpuFree(vals_)); + + jcols_ = nullptr; + vals_ = nullptr; + } +#endif + + delete[] jcols_host_; + delete[] vals_host_; + + jcols_host_ = nullptr; + vals_host_ = nullptr; + nnz_ = 0; + } + + nnz_ = nnz; + + if(nnz_ == 0) { + return; + } + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&jcols_), nnz_ * sizeof(int))); + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&vals_), nnz_ * sizeof(double))); + } +#endif + + jcols_host_ = new int[nnz_]{0}; + vals_host_ = new double[nnz_]{0}; +} + +void MatrixCsr::clear_data() +{ +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + checkGpuErrors(evloserGpuFree(irows_)); + checkGpuErrors(evloserGpuFree(jcols_)); + checkGpuErrors(evloserGpuFree(vals_)); + + irows_ = nullptr; + jcols_ = nullptr; + vals_ = nullptr; + } +#endif + + delete[] irows_host_; + delete[] jcols_host_; + delete[] vals_host_; + + irows_host_ = nullptr; + jcols_host_ = nullptr; + vals_host_ = nullptr; + + n_ = 0; + nnz_ = 0; +} + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +void MatrixCsr::update_from_host_mirror() +{ + if(execution_mode_ == ExecutionMode::CPU) { + assert(false && "Cannot update device storage in CPU execution mode."); + return; + } + + assert(has_device_storage()); + assert(has_host_mirror()); + + checkGpuErrors(evloserGpuMemcpy(irows_, irows_host_, sizeof(int) * (n_ + 1), evloserMemcpyHostToDevice)); + + if(nnz_ > 0) { + checkGpuErrors(evloserGpuMemcpy(jcols_, jcols_host_, sizeof(int) * nnz_, evloserMemcpyHostToDevice)); + checkGpuErrors(evloserGpuMemcpy(vals_, vals_host_, sizeof(double) * nnz_, evloserMemcpyHostToDevice)); + } +} + +void MatrixCsr::copy_to_host_mirror() +{ + if(execution_mode_ == ExecutionMode::CPU) { + assert(false && "Cannot copy from device storage in CPU execution mode."); + return; + } + + assert(has_device_storage()); + assert(has_host_mirror()); + + checkGpuErrors(evloserGpuMemcpy(irows_host_, irows_, sizeof(int) * (n_ + 1), evloserMemcpyDeviceToHost)); + + if(nnz_ > 0) { + checkGpuErrors(evloserGpuMemcpy(jcols_host_, jcols_, sizeof(int) * nnz_, evloserMemcpyDeviceToHost)); + checkGpuErrors(evloserGpuMemcpy(vals_host_, vals_, sizeof(double) * nnz_, evloserMemcpyDeviceToHost)); + } +} +#endif + +bool MatrixCsr::validate_host_structure(const char* caller, bool silent_output) const +{ + const char* caller_name = caller == nullptr ? "unknown caller" : caller; + + auto report = [&](const std::string& message) { + if(!silent_output) { + std::cout << "[EVLOSER] Invalid CSR matrix in " << caller_name << ": " << message << "\n"; + } + return false; + }; + + if(n_ <= 0) { + return report("matrix dimension must be positive"); + } + + if(nnz_ < 0) { + return report("number of nonzeros is negative"); + } + + if(irows_host_ == nullptr) { + return report("host row pointer is null"); + } + + if(irows_host_[0] != 0) { + return report("row pointer must start at zero"); + } + + for(int row = 0; row < n_; ++row) { + if(irows_host_[row] > irows_host_[row + 1]) { + return report("row pointer is not monotone"); + } + } + + if(irows_host_[n_] != nnz_) { + return report("final row pointer does not match nnz"); + } + + if(nnz_ == 0) { + return true; + } + + if(jcols_host_ == nullptr) { + return report("host column index array is null"); + } + + if(vals_host_ == nullptr) { + return report("host value array is null"); + } + + for(int k = 0; k < nnz_; ++k) { + if(jcols_host_[k] < 0 || jcols_host_[k] >= n_) { + return report("column index out of range"); + } + } + + return true; +} + +bool MatrixCsr::validate_host_values(const char* caller, bool silent_output) const +{ + const char* caller_name = caller == nullptr ? "unknown caller" : caller; + + auto report = [&](const std::string& message) { + if(!silent_output) { + std::cout << "[EVLOSER] Invalid CSR matrix values in " << caller_name << ": " << message << "\n"; + } + return false; + }; + + if(nnz_ < 0) { + return report("number of nonzeros is negative"); + } + + if(nnz_ == 0) { + return true; + } + + if(vals_host_ == nullptr) { + return report("host value array is null"); + } + + for(int k = 0; k < nnz_; ++k) { + if(!std::isfinite(vals_host_[k])) { + std::ostringstream message; + message << "matrix value at entry " << k << " is not finite"; + return report(message.str()); + } + } + + return true; +} + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +// Error checking utility for GPU backend +// KS: might later become part of src/Utils, putting it here for now +template +void MatrixCsr::evloserCheckGpuError(T result, const char* const file, int const line) +{ + if(result) { + std::cout << "GPU error at " << file << ":" << line << " error# " << result << "\n"; + assert(false); + } +} +#endif +} // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/MatrixCsr.hpp b/src/LinAlg/EVLOSER/MatrixCsr.hpp new file mode 100644 index 0000000..12e3c6d --- /dev/null +++ b/src/LinAlg/EVLOSER/MatrixCsr.hpp @@ -0,0 +1,137 @@ +#pragma once + +#include "evloser_execution_mode.hpp" + +namespace EVLOSER +{ + +class MatrixCsr +{ +public: + explicit MatrixCsr(ExecutionMode execution_mode); + ~MatrixCsr(); + + /// Allocate device and host row-pointer storage for an n-by-n CSR matrix. + void allocate_size(int n); + + /// Allocate device and host column-index/value storage for the current CSR matrix. + void allocate_nnz(int nnz); + + /// Release all owned device and host CSR storage. + void clear_data(); + + /// Return the matrix dimension. + int n() const { return n_; } + + /// Return the number of stored nonzeros. + int nnz() const { return nnz_; } + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + /// Return true when the required device CSR arrays have been allocated. + bool has_device_storage() const; +#endif + + /// Return true when the required host CSR mirror arrays have been allocated. + bool has_host_mirror() const; + + /** + * @brief Validate the host-side CSR structure before factorization/refactorization. + * + * Checks row-pointer monotonicity, final nnz consistency, and column-index bounds. + * + * @param caller Name of the caller used in diagnostic messages. + * @param silent_output Suppress diagnostic output when true. + * @return true if the host CSR structure is valid. + */ + bool validate_host_structure(const char* caller, bool silent_output) const; + + /** + * @brief Validate that all host-side CSR values are finite. + * + * @param caller Name of the caller used in diagnostic messages. + * @param silent_output Suppress diagnostic output when true. + * @return true if every stored host value is finite. + */ + bool validate_host_values(const char* caller, bool silent_output) const; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + /// Return device row-pointer storage. + int* device_irows() { return irows_; } + + /// Return const device row-pointer storage. + const int* device_irows() const { return irows_; } + + /// Return device column-index storage. + int* device_jcols() { return jcols_; } + + /// Return const device column-index storage. + const int* device_jcols() const { return jcols_; } + + /// Return device value storage. + double* device_vals() { return vals_; } + + /// Return const device value storage. + const double* device_vals() const { return vals_; } +#endif + + /// Return host row-pointer mirror storage. + int* host_irows() { return irows_host_; } + + /// Return const host row-pointer mirror storage. + const int* host_irows() const { return irows_host_; } + + /// Return host column-index mirror storage. + int* host_jcols() { return jcols_host_; } + + /// Return const host column-index mirror storage. + const int* host_jcols() const { return jcols_host_; } + + /// Return host value mirror storage. + double* host_vals() { return vals_host_; } + + /// Return const host value mirror storage. + const double* host_vals() const { return vals_host_; } + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + /// Copy host-side CSR arrays into device storage. + void update_from_host_mirror(); + + /// Copy device CSR arrays into the host mirror. + void copy_to_host_mirror(); +#endif + +private: + const ExecutionMode execution_mode_; ///< Selected CPU, CUDA, or HIP execution path + int n_{0}; + int nnz_{0}; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + int* irows_{nullptr}; + int* jcols_{nullptr}; + double* vals_{nullptr}; +#endif + + int* irows_host_{nullptr}; + int* jcols_host_{nullptr}; + double* vals_host_{nullptr}; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + /** + * @brief Check for GPU backend errors. + * + * @tparam T - type of the result + * @param result - result value + * @param file - file name where the error occured + * @param line - line at which the error occured + */ + template + void evloserCheckGpuError(T result, const char* const file, int const line); +#endif +}; + +} // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp new file mode 100644 index 0000000..756fd2d --- /dev/null +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -0,0 +1,1594 @@ +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file RefactorizationSolver.cpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#include "MatrixCsr.hpp" +#include "RefactorizationSolver.hpp" + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +#include "IterativeRefinement.hpp" +#endif + +#include "klu.h" +#include +#include +#include +#include +#include +#include +#include +#include + + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +#define checkGpuErrors(val) evloserCheckGpuError((val), __FILE__, __LINE__) +#endif + +namespace EVLOSER +{ +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +namespace +{ + +struct KluFactorData +{ + int nnzL{0}; + int nnzU{0}; + std::vector Lp; + std::vector Li; + std::vector Lx; + std::vector Up; + std::vector Ui; + std::vector Ux; +}; + +struct HostCsrFactor +{ + std::vector rowptr; + std::vector colind; + std::vector values; +}; + +bool validate_csc_factor(const char* name, int n, int nnz, const std::vector& colptr, const std::vector& rowind, bool silent_output) +{ + auto report = [&](const std::string& message) { + if(!silent_output) { + std::cout << "[EVLOSER] Invalid KLU " << name << " factor: " << message << "\n"; + } + return false; + }; + + if(n <= 0) { + return report("factor dimension must be positive"); + } + + if(nnz < 0) { + return report("number of nonzeros is negative"); + } + + if(static_cast(colptr.size()) != n + 1) { + return report("column pointer size does not match dimension"); + } + + if(static_cast(rowind.size()) != nnz) { + return report("row index size does not match nnz"); + } + + if(colptr[0] != 0) { + return report("column pointer must start at zero"); + } + + for(int col = 0; col < n; ++col) { + if(colptr[col] > colptr[col + 1]) { + return report("column pointer is not monotone"); + } + } + + if(colptr[n] != nnz) { + return report("final column pointer does not match nnz"); + } + + for(int k = 0; k < nnz; ++k) { + if(rowind[k] < 0 || rowind[k] >= n) { + return report("row index out of range"); + } + } + + return true; +} + +bool extract_klu_factors(klu_numeric* numeric, + klu_symbolic* symbolic, + klu_common& common, + int n, + KluFactorData& factors, + bool silent_output) +{ + factors.nnzL = numeric->lnz; + factors.nnzU = numeric->unz; + + factors.Lp.assign(n + 1, 0); + factors.Li.assign(factors.nnzL, 0); + factors.Lx.assign(factors.nnzL, 0.0); + factors.Up.assign(n + 1, 0); + factors.Ui.assign(factors.nnzU, 0); + factors.Ux.assign(factors.nnzU, 0.0); + + const int ok = klu_extract(numeric, + symbolic, + factors.Lp.data(), + factors.Li.data(), + factors.Lx.data(), + factors.Up.data(), + factors.Ui.data(), + factors.Ux.data(), + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + &common); + + if(ok == 0) { + if(!silent_output) { + std::cout << "[EVLOSER] klu_extract failed while preparing GPU RF setup\n"; + } + return false; + } + + return validate_csc_factor("L", n, factors.nnzL, factors.Lp, factors.Li, silent_output) && + validate_csc_factor("U", n, factors.nnzU, factors.Up, factors.Ui, silent_output); +} + +HostCsrFactor convert_csc_to_csr(int n, + int nnz, + const std::vector& colptr, + const std::vector& rowind, + const std::vector& values) +{ + HostCsrFactor csr; + csr.rowptr.assign(n + 1, 0); + csr.colind.assign(nnz, 0); + csr.values.assign(nnz, 0.0); + + for(int col = 0; col < n; ++col) { + for(int k = colptr[col]; k < colptr[col + 1]; ++k) { + csr.rowptr[rowind[k] + 1]++; + } + } + + for(int row = 0; row < n; ++row) { + csr.rowptr[row + 1] += csr.rowptr[row]; + } + + std::vector offsets = csr.rowptr; + for(int col = 0; col < n; ++col) { + for(int k = colptr[col]; k < colptr[col + 1]; ++k) { + const int row = rowind[k]; + const int dest = offsets[row]++; + csr.colind[dest] = col; + csr.values[dest] = values[k]; + } + } + + return csr; +} + +bool validate_host_csr_factor(const char* name, int n, int nnz, const HostCsrFactor& csr, bool silent_output) +{ + auto report = [&](const std::string& message) { + if(!silent_output) { + std::cout << "[EVLOSER] Invalid host CSR " << name << " factor: " << message << "\n"; + } + return false; + }; + + if(static_cast(csr.rowptr.size()) != n + 1) { + return report("row pointer size does not match dimension"); + } + + if(static_cast(csr.colind.size()) != nnz || static_cast(csr.values.size()) != nnz) { + return report("column/value array size does not match nnz"); + } + + if(csr.rowptr[0] != 0) { + return report("row pointer must start at zero"); + } + + for(int row = 0; row < n; ++row) { + if(csr.rowptr[row] > csr.rowptr[row + 1]) { + return report("row pointer is not monotone"); + } + } + + if(csr.rowptr[n] != nnz) { + return report("final row pointer does not match nnz"); + } + + for(int k = 0; k < nnz; ++k) { + if(csr.colind[k] < 0 || csr.colind[k] >= n) { + return report("column index out of range"); + } + } + + return true; +} + +} // namespace +#endif +RefactorizationSolver::RefactorizationSolver(int n, ExecutionMode execution_mode) + : n_(n), + execution_mode_(execution_mode) +{ + mat_A_csr_ = new MatrixCsr(execution_mode_); +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + // handles + cusparseCreate(&handle_); + cusolverSpCreate(&handle_cusolver_); + cublasCreate(&handle_cublas_); + + // descriptors + cusparseCreateMatDescr(&descr_A_); + cusparseSetMatType(descr_A_, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descr_A_, CUSPARSE_INDEX_BASE_ZERO); + } +#endif + + // Allocate host mirror for the solution vector + hostx_ = new double[n_]; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + // Allocate solution and rhs vectors + checkGpuErrors(evloserGpuMalloc((void**)&devx_, n_ * sizeof(double))); + checkGpuErrors(evloserGpuMalloc((void**)&devr_, n_ * sizeof(double))); + } +#endif +} + +RefactorizationSolver::~RefactorizationSolver() +{ +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + delete ir_; +#endif + delete mat_A_csr_; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + // Delete workspaces and handles + if(d_work_ != nullptr) { + (void)evloserGpuFree(d_work_); + } + cusparseDestroy(handle_); + cusolverSpDestroy(handle_cusolver_); + cublasDestroy(handle_cublas_); + cusparseDestroyMatDescr(descr_A_); + } +#endif + + // Delete host mirror for the solution vector + delete[] hostx_; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + // Delete residual and solution vectors + if(devr_ != nullptr) { + (void)evloserGpuFree(devr_); + } + if(devx_ != nullptr) { + (void)evloserGpuFree(devx_); + } + + // Delete matrix descriptor used in cuSolverGLU setup + if(cusolver_glu_enabled_) { + cusparseDestroyMatDescr(descr_M_); + cusolverSpDestroyGluInfo(info_M_); + } + + if(cusolver_rf_enabled_) { + if(d_P_ != nullptr) { + (void)evloserGpuFree(d_P_); + } + if(d_Q_ != nullptr) { + (void)evloserGpuFree(d_Q_); + } + if(d_T_ != nullptr) { + (void)evloserGpuFree(d_T_); + } + } + } +#endif + if(Numeric_ != nullptr) { + klu_free_numeric(&Numeric_, &Common_); + } + + if(Symbolic_ != nullptr) { + klu_free_symbolic(&Symbolic_, &Common_); + } + delete[] mia_; + delete[] mja_; +} + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +void RefactorizationSolver::enable_iterative_refinement() +{ + if(execution_mode_ == ExecutionMode::CPU) { + if(!silent_output_) { + std::cout << "[EVLOSER] Iterative refinement is unavailable in CPU execution mode.\n"; + } + return; + } + + if(ir_ == nullptr) { + ir_ = new IterativeRefinement(); + } + + iterative_refinement_enabled_ = (ir_ != nullptr); +} + +void RefactorizationSolver::disable_iterative_refinement() +{ + delete ir_; + ir_ = nullptr; + iterative_refinement_enabled_ = false; + use_ir_ = "no"; +} + +bool RefactorizationSolver::iterative_refinement_active() const +{ + return execution_mode_ != ExecutionMode::CPU && iterative_refinement_enabled_ && ir_ != nullptr && use_ir_ == "yes"; +} + +// TODO: Refactor to only pass mat_A_csr_ to setup_system_matrix; n and nnz can be read from mat_A_csr_ +void RefactorizationSolver::setup_iterative_refinement_matrix(int n, int nnz) +{ + if(!iterative_refinement_active()) { + return; + } + + ir_->setup_system_matrix(n, nnz, mat_A_csr_->device_irows(), mat_A_csr_->device_jcols(), mat_A_csr_->device_vals()); +} + +// TODO: Can this function be merged with setup_iterative_refinement_matrix ? +void RefactorizationSolver::configure_iterative_refinement(cusparseHandle_t cusparse_handle, + cublasHandle_t cublas_handle, + evloserRfHandle_t cusolverrf_handle, + int n, + double* d_T, + int* d_P, + int* d_Q, + double* devx, + double* devr) +{ + if(!iterative_refinement_active()) { + return; + } + + ir_->setup(cusparse_handle, cublas_handle, cusolverrf_handle, n, d_T, d_P, d_Q, devx, devr); +} +#endif + +bool RefactorizationSolver::validate_system_matrix(const char* caller) const +{ + if(mat_A_csr_ == nullptr) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid matrix in " << caller << ": matrix object is null\n"; + } + return false; + } + + return mat_A_csr_->validate_host_structure(caller, silent_output_) && + mat_A_csr_->validate_host_values(caller, silent_output_); +} + +bool RefactorizationSolver::validate_klu_factorization(const char* caller) const +{ + if(Symbolic_ == nullptr) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid KLU factorization in " << caller << ": symbolic factor is null\n"; + } + return false; + } + + if(Numeric_ == nullptr) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid KLU factorization in " << caller << ": numeric factor is null\n"; + } + return false; + } + + if(Symbolic_->n != n_) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid KLU factorization in " << caller << ": symbolic dimension " + << Symbolic_->n << " does not match solver dimension " << n_ << "\n"; + } + return false; + } + + if(Numeric_->n != n_) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid KLU factorization in " << caller << ": numeric dimension " + << Numeric_->n << " does not match solver dimension " << n_ << "\n"; + } + return false; + } + + if(Symbolic_->Q == nullptr || Numeric_->Pnum == nullptr) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid KLU factorization in " << caller + << ": missing permutation data\n"; + } + return false; + } + + return true; +} + +bool RefactorizationSolver::validate_solution(const double* solution, const char* caller) const +{ + const char* caller_name = caller == nullptr ? "unknown caller" : caller; + + if(solution == nullptr) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid vector in " << caller_name << ": pointer is null\n"; + } + return false; + } + + for(int i = 0; i < n_; ++i) { + if(!std::isfinite(solution[i])) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid vector in " << caller_name + << ": entry " << i << " is not finite\n"; + } + return false; + } + } + + return true; +} + +double RefactorizationSolver::compute_klu_residual(const double* rhs, + const double* solution) const +{ + if(rhs == nullptr || solution == nullptr || mat_A_csr_ == nullptr) { + return std::numeric_limits::infinity(); + } + + const int* rowptr = mat_A_csr_->host_irows(); + const int* colind = mat_A_csr_->host_jcols(); + const double* values = mat_A_csr_->host_vals(); + + long double residual_inf = 0.0L; + long double matrix_inf = 0.0L; + long double solution_inf = 0.0L; + long double rhs_inf = 0.0L; + + for(int i = 0; i < n_; ++i) { + solution_inf = + std::max(solution_inf, std::abs(static_cast(solution[i]))); + rhs_inf = + std::max(rhs_inf, std::abs(static_cast(rhs[i]))); + } + + for(int row = 0; row < n_; ++row) { + long double row_sum = 0.0L; + long double matrix_vector_product = 0.0L; + + for(int k = rowptr[row]; k < rowptr[row + 1]; ++k) { + const long double value = static_cast(values[k]); + + row_sum += std::abs(value); + matrix_vector_product += + value * static_cast(solution[colind[k]]); + } + + matrix_inf = std::max(matrix_inf, row_sum); + residual_inf = + std::max(residual_inf, + std::abs(matrix_vector_product - + static_cast(rhs[row]))); + } + + const long double scale = + std::max(1.0L, matrix_inf * solution_inf + rhs_inf); + + const long double normalized_residual = residual_inf / scale; + + if(!std::isfinite(normalized_residual)) { + return std::numeric_limits::infinity(); + } + + return static_cast(normalized_residual); +} + +klu_numeric* RefactorizationSolver::factor_klu_numeric(const char* caller) +{ + if(!validate_system_matrix(caller)) { + return nullptr; + } + + if(Symbolic_ == nullptr || Symbolic_->n != n_) { + if(!silent_output_) { + std::cout << "[EVLOSER] " << caller + << " requires valid KLU symbolic analysis.\n"; + } + return nullptr; + } + + klu_common trial_common = Common_; + trial_common.status = KLU_OK; + + klu_numeric* numeric = + klu_factor(mat_A_csr_->host_irows(), + mat_A_csr_->host_jcols(), + mat_A_csr_->host_vals(), + Symbolic_, + &trial_common); + + const int status = trial_common.status; + + if(numeric == nullptr || status != KLU_OK) { + if(!silent_output_) { + std::cout << "[EVLOSER] " << caller + << " failed with KLU status " << status << "\n"; + } + + if(numeric != nullptr) { + klu_free_numeric(&numeric, &trial_common); + } + + return nullptr; + } + return numeric; +} + +bool RefactorizationSolver::solve_klu_candidate( + klu_numeric* numeric, + const double* rhs, + std::vector& solution, + double& residual, + const char* caller) +{ + residual = std::numeric_limits::infinity(); + + if(numeric == nullptr || rhs == nullptr) { + return false; + } + + if(!validate_solution(rhs, caller)) { + return false; + } + + solution.assign(rhs, rhs + n_); + + klu_common trial_common = Common_; + trial_common.status = KLU_OK; + + const int ok = + klu_solve(Symbolic_, + numeric, + n_, + 1, + solution.data(), + &Common_); + + const int status = Common_.status; + + if(ok == 0 || status != KLU_OK) { + if(!silent_output_) { + std::cout << "[EVLOSER] " << caller + << " failed with KLU status " << status << "\n"; + } + return false; + } + + if(!validate_solution(solution.data(), caller)) { + return false; + } + + residual = compute_klu_residual(rhs, solution.data()); + + if(!std::isfinite(residual)) { + if(!silent_output_) { + std::cout << "[EVLOSER] " << caller + << " produced a non-finite residual.\n"; + } + return false; + } + + return true; +} + +bool RefactorizationSolver::solve_cpu_with_recovery(double* dx) +{ + last_klu_recovery_action_ = KluRecoveryAction::None; + + if(dx == nullptr) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU solve received a null right-hand side.\n"; + } + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + if(!validate_system_matrix("KLU solve") || + !validate_solution(dx, "KLU right-hand side")) { + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + if(Symbolic_ == nullptr || Symbolic_->n != n_) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU solve requires valid symbolic analysis.\n"; + } + + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + /* + * A failed klu_refactor() may leave Common_.status nonzero. When recovery + * is pending, allow the fresh-factorization path to run instead of + * rejecting the solve based on that previous status. + */ + if(!klu_refactor_pending_validation_ && + !validate_klu_factorization("KLU solve")) { + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + const std::vector rhs(dx, dx + n_); + + std::vector refactor_solution; + double residual_refactor = + std::numeric_limits::infinity(); + + bool refactor_solution_usable = false; + + /* + * With no pending value-only refactorization, Numeric_ already represents + * a fresh factorization. Solve normally and apply only the loose safety + * limit. + */ + if(!klu_refactor_pending_validation_) { + if(!solve_klu_candidate(Numeric_, + rhs.data(), + refactor_solution, + residual_refactor, + "KLU solve")) { + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + if(residual_refactor > klu_residual_safety_limit_) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU residual " + << residual_refactor + << " exceeds safety limit " + << klu_residual_safety_limit_ << "\n"; + } + + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + std::copy(refactor_solution.begin(), + refactor_solution.end(), + dx); + + return true; + } + + if(klu_refactor_succeeded_) { + refactor_solution_usable = + solve_klu_candidate(Numeric_, + rhs.data(), + refactor_solution, + residual_refactor, + "KLU refactorized solve"); + } + + const bool fresh_factorization_required = + !refactor_solution_usable || + residual_refactor > klu_suspicious_residual_threshold_; + + if(!fresh_factorization_required) { + if(residual_refactor > klu_residual_safety_limit_) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU refactorized residual " + << residual_refactor + << " exceeds safety limit " + << klu_residual_safety_limit_ << "\n"; + } + + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + std::copy(refactor_solution.begin(), + refactor_solution.end(), + dx); + + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = + KluRecoveryAction::RefactorAccepted; + + return true; + } + + if(!silent_output_ && refactor_solution_usable) { + std::cout << "[EVLOSER] KLU refactorized residual " + << residual_refactor + << " exceeds suspicious-result threshold " + << klu_suspicious_residual_threshold_ + << "; trying fresh factorization.\n"; + } + + klu_numeric* full_numeric = + factor_klu_numeric("KLU recovery factorization"); + + std::vector full_solution; + double residual_full = + std::numeric_limits::infinity(); + + const bool full_solution_usable = + full_numeric != nullptr && + solve_klu_candidate(full_numeric, + rhs.data(), + full_solution, + residual_full, + "KLU recovery solve"); + + const bool refactor_candidate_safe = + refactor_solution_usable && + residual_refactor <= klu_residual_safety_limit_; + + const bool full_candidate_safe = + full_solution_usable && + residual_full <= klu_residual_safety_limit_; + + if(!refactor_candidate_safe && !full_candidate_safe) { + if(full_numeric != nullptr) { + klu_free_numeric(&full_numeric, &Common_); + } + + if(!silent_output_) { + std::cout << "[EVLOSER] KLU recovery produced no candidate " + "within the residual safety limit.\n"; + } + + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::Failed; + + return false; + } + + const bool full_factor_materially_better = + full_candidate_safe && + refactor_candidate_safe && + residual_full < + klu_improvement_ratio_ * residual_refactor && + residual_refactor - residual_full > + klu_minimum_improvement_; + + const bool keep_full_factors = + full_candidate_safe && + (!refactor_candidate_safe || + full_factor_materially_better); + + if(keep_full_factors) { + if(Numeric_ != nullptr) { + klu_free_numeric(&Numeric_, &Common_); + } + + Numeric_ = full_numeric; + full_numeric = nullptr; + + std::copy(full_solution.begin(), + full_solution.end(), + dx); + + last_klu_recovery_action_ = + KluRecoveryAction::FullFactorAccepted; + } else { + /* + * Retain the refactored factors unless the fresh factorization is + * materially better. For the current solve, return whichever finite + * candidate has the smaller residual. + */ + if(full_candidate_safe && + residual_full < residual_refactor) { + std::copy(full_solution.begin(), + full_solution.end(), + dx); + } else { + std::copy(refactor_solution.begin(), + refactor_solution.end(), + dx); + } + + if(full_numeric != nullptr) { + klu_free_numeric(&full_numeric, &Common_); + } + + last_klu_recovery_action_ = + KluRecoveryAction::RefactorRetained; + } + + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + + return true; +} + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +bool RefactorizationSolver::checkEvloserRfStatus(evloserRfStatus_t status, const char* caller) const +{ + if(status == evloserRfSuccess) { + return true; + } + + if(!silent_output_) { + std::cout << "[EVLOSER] " << caller << " failed with GPU RF status " << status << "\n"; + } + + return false; +} + +int RefactorizationSolver::resetEvloserRfValues(const char* caller) +{ + sp_status_ = evloserRfResetValues(n_, + nnz_, + mat_A_csr_->device_irows(), + mat_A_csr_->device_jcols(), + mat_A_csr_->device_vals(), + d_P_, + d_Q_, + handle_rf_); + + if(!checkEvloserRfStatus(sp_status_, caller)) { + return -1; + } + + checkGpuErrors(evloserGpuDeviceSynchronize()); + return 0; +} + +int RefactorizationSolver::analyzeEvloserRf(const char* caller) +{ + sp_status_ = evloserRfAnalyze(handle_rf_); + return checkEvloserRfStatus(sp_status_, caller) ? 0 : -1; +} + +int RefactorizationSolver::refactorizeEvloserRf(const char* caller) +{ + sp_status_ = evloserRfRefactor(handle_rf_); + return checkEvloserRfStatus(sp_status_, caller) ? 0 : -1; +} +#endif + +int RefactorizationSolver::setup_factorization() +{ + if(fact_ != "klu") { + assert(false && "Only KLU is available for the first factorization."); + return -1; + } + + if(!validate_system_matrix("KLU analysis")) { + return -1; + } + + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::None; + + // A new matrix structure invalidates both existing KLU states. + if(Numeric_ != nullptr) { + klu_free_numeric(&Numeric_, &Common_); + } + + if(Symbolic_ != nullptr) { + klu_free_symbolic(&Symbolic_, &Common_); + } + + if(initializeKLU() != 0) { + return -1; + } + + Symbolic_ = klu_analyze(n_, + mat_A_csr_->host_irows(), + mat_A_csr_->host_jcols(), + &Common_); + + if(Symbolic_ == nullptr || Common_.status != KLU_OK) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU symbolic analysis failed with status " + << Common_.status << "\n"; + } + return -1; + } + + return 0; +} + +int RefactorizationSolver::factorize() +{ + klu_numeric* fresh_numeric = + factor_klu_numeric("KLU factorization"); + + if(fresh_numeric == nullptr) { + return -1; + } + + if(Numeric_ != nullptr) { + klu_free_numeric(&Numeric_, &Common_); + } + + Numeric_ = fresh_numeric; + + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::None; + is_first_solve_ = true; + + return validate_klu_factorization("KLU factorization") + ? 0 + : -1; +} + +void RefactorizationSolver::setup_refactorization() +{ + if(!validate_system_matrix("refactorization setup")) { + return; + } + + if(execution_mode_ == ExecutionMode::CPU) { + // KLU numeric state is already available from factorize(). + return; + } + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(refact_ == "glu") { + if(execution_mode_ != ExecutionMode::CUDA) { + if(!silent_output_) { + std::cout << "[EVLOSER] GLU refactorization requires CUDA execution mode.\n"; + } + return; + } + + if(initializeCusolverGLU() != 0) { + return; + } + + if(refactorizationSetupCusolverGLU() != 0) { + return; + } + } else if(refact_ == "rf") { + if(initializeCusolverRf() != 0 || refactorizationSetupCusolverRf() != 0) { + assert(false && "EVLOSER RF setup failed."); + return; + } + if(iterative_refinement_active()) { + configure_iterative_refinement(handle_, handle_cublas_, handle_rf_, n_, d_T_, d_P_, d_Q_, devx_, devr_); + } + } else { // for future - + assert(0 && "Only glu and rf refactorizations available.\n"); + } + +#endif +} + +int RefactorizationSolver::refactorize() +{ + if(!validate_system_matrix("refactorization")) { + if(execution_mode_ == ExecutionMode::CPU) { + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::Failed; + } + return -1; + } + + if(execution_mode_ == ExecutionMode::CPU) { + if(!validate_klu_factorization("KLU refactorization")) { + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return -1; + } + + klu_refactor_pending_validation_ = true; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::None; + + const int ok = + klu_refactor(mat_A_csr_->host_irows(), + mat_A_csr_->host_jcols(), + mat_A_csr_->host_vals(), + Symbolic_, + Numeric_, + &Common_); + + klu_refactor_succeeded_ = + ok != 0 && Common_.status == KLU_OK; + + if(!klu_refactor_succeeded_ && !silent_output_) { + std::cout + << "[EVLOSER] KLU refactorization failed with status " + << Common_.status + << "; fresh factorization will be attempted during solve.\n"; + } + + /* + * A failed KLU refactorization is recoverable. Continue to the solve, + * where a fresh numeric factorization will be attempted. + */ + return 0; + } + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(refact_ == "glu") { + if(execution_mode_ != ExecutionMode::CUDA) { + if(!silent_output_) { + std::cout + << "[EVLOSER] GLU refactorization requires CUDA execution mode.\n"; + } + return -1; + } + + sp_status_ = cusolverSpDgluReset(handle_cusolver_, + n_, + /* A is original matrix */ + nnz_, + descr_A_, + mat_A_csr_->device_vals(), + mat_A_csr_->device_irows(), + mat_A_csr_->device_jcols(), + info_M_); + + sp_status_ = + cusolverSpDgluFactor(handle_cusolver_, info_M_, d_work_); + } else { + if(refact_ == "rf") { + if(resetEvloserRfValues("GPU RF reset values") != 0) { + return -1; + } + + if(refactorizeEvloserRf("GPU RF refactorization") != 0) { + return -1; + } + } + } + + return 0; +#endif + + if(!silent_output_) { + std::cout + << "[EVLOSER] Selected refactorization backend is unavailable " + "in this build.\n"; + } + + return -1; +} + +bool RefactorizationSolver::triangular_solve(double* dx, double tol) +{ + if(execution_mode_ == ExecutionMode::CPU) { + (void)tol; + return solve_cpu_with_recovery(dx); + } + + if(dx == nullptr) { + if(!silent_output_) { + std::cout << "[EVLOSER] Solve received a null right-hand side.\n"; + } + return false; + } + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(refact_ == "glu") { + if(execution_mode_ != ExecutionMode::CUDA) { + if(!silent_output_) { + std::cout << "[EVLOSER] GLU solve requires CUDA execution mode.\n"; + } + return false; + } + + checkGpuErrors(evloserGpuMemcpy(devr_, dx, sizeof(double) * n_, evloserMemcpyDeviceToDevice)); + sp_status_ = cusolverSpDgluSolve(handle_cusolver_, + n_, + /* A is original matrix */ + nnz_, + descr_A_, + mat_A_csr_->device_vals(), + mat_A_csr_->device_irows(), + mat_A_csr_->device_jcols(), + devr_, /* right hand side */ + dx, /* left hand side */ + &ite_refine_succ_, + &r_nrminf_, + info_M_, + d_work_); + if(sp_status_ != 0) { + if(!silent_output_) { + std::cout << "GLU solve failed with status: " << sp_status_ << "\n"; + } + return false; + } + return true; + } + + if(refact_ == "rf") { + // First solve is performed on CPU + if(is_first_solve_) { + checkGpuErrors(evloserGpuMemcpy(hostx_, + dx, + sizeof(double) * n_, + evloserMemcpyDeviceToHost)); + + const int ok = klu_solve(Symbolic_, + Numeric_, + n_, + 1, + hostx_, + &Common_); + + if(ok == 0 || Common_.status != KLU_OK) { + if(!silent_output_) { + std::cout << "[EVLOSER] Initial KLU solve failed with status " + << Common_.status << "\n"; + } + return false; + } + + if(!validate_solution(hostx_, "initial KLU solve")) { + return false; + } + + checkGpuErrors(evloserGpuMemcpy(dx, + hostx_, + sizeof(double) * n_, + evloserMemcpyHostToDevice)); + + is_first_solve_ = false; + return true; + } + + checkGpuErrors(evloserGpuMemcpy(devr_, dx, sizeof(double) * n_, evloserMemcpyDeviceToDevice)); + + // Each next solve is performed on GPU + sp_status_ = evloserRfSolve(handle_rf_, + d_P_, + d_Q_, + 1, + d_T_, + n_, + dx, + n_); + if(sp_status_ != 0) { + if(!silent_output_) std::cout << "Rf solve failed with status: " << sp_status_ << "\n"; + return false; + } + + if(iterative_refinement_active()) { + // Set tolerance based on barrier parameter mu + ir_->set_tol(tol); + + ir_->fgmres(dx, devr_); + if(!silent_output_ && (ir_->getFinalResidalNorm() > tol * ir_->getBNorm())) { + std::cout << "[Warning] Iterative refinement did not converge!\n"; + std::cout << "\t Iterative refinement tolerance " << tol << "\n"; + std::cout << "\t Relative solution error " << ir_->getFinalResidalNorm() / ir_->getBNorm() << "\n"; + std::cout << "\t fgmres: init residual norm: " << ir_->getInitialResidalNorm() << "\n" + << "\t final residual norm: " << ir_->getFinalResidalNorm() << "\n" + << "\t number of iterations: " << ir_->getFinalNumberOfIterations() << "\n"; + } + } + return true; + } + + if(!silent_output_) { + std::cout << "Unknown refactorization " << refact_ << ", exiting\n"; + } + return false; +#else + + (void)dx; + (void)tol; + + if(!silent_output_) { + std::cout << "[EVLOSER] GPU triangular solve is unavailable in this build.\n"; + } + + return false; + +#endif +} + + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +// helper private function needed for format conversion +int RefactorizationSolver::createM(const int n, + const int /* nnzL */, + const int* Lp, + const int* Li, + const int /* nnzU */, + const int* Up, + const int* Ui) +{ + int row; + for(int i = 0; i < n; ++i) { + // go through EACH COLUMN OF L first + for(int j = Lp[i]; j < Lp[i + 1]; ++j) { + row = Li[j]; + // BUT dont count diagonal twice, important + if(row != i) { + mia_[row + 1]++; + } + } + // then each column of U + for(int j = Up[i]; j < Up[i + 1]; ++j) { + row = Ui[j]; + mia_[row + 1]++; + } + } + // then organize mia_; + mia_[0] = 0; + for(int i = 1; i < n + 1; i++) { + mia_[i] += mia_[i - 1]; + } + + std::vector Mshifts(n, 0); + for(int i = 0; i < n; ++i) { + // go through EACH COLUMN OF L first + for(int j = Lp[i]; j < Lp[i + 1]; ++j) { + row = Li[j]; + if(row != i) { + // place (row, i) where it belongs! + mja_[mia_[row] + Mshifts[row]] = i; + Mshifts[row]++; + } + } + // each column of U next + for(int j = Up[i]; j < Up[i + 1]; ++j) { + row = Ui[j]; + mja_[mia_[row] + Mshifts[row]] = i; + Mshifts[row]++; + } + } + return 0; +} +#endif + +int RefactorizationSolver::initializeKLU() +{ + if(klu_defaults(&Common_) == 0) { + if(!silent_output_) { + std::cout << "[EVLOSER] klu_defaults failed.\n"; + } + return -1; + } + + // TODO: consider making these user-configurable. + Common_.btf = 0; + Common_.ordering = ordering_; // COLAMD=1; AMD=0 + Common_.tol = 0.1; + Common_.scale = -1; + Common_.halt_if_singular = 1; + + if(Common_.status != KLU_OK) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid KLU initialization status " + << Common_.status << "\n"; + } + return -1; + } + + return 0; +} + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +int RefactorizationSolver::initializeCusolverGLU() +{ +#if defined(HIOP_USE_HIP) || defined(HAVE_HIP) + std::cerr << "EVLOSER GLU refactorization is not supported on HIP. Use RF instead.\n"; + return -1; +#endif + + cusparseCreateMatDescr(&descr_M_); + cusparseSetMatType(descr_M_, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descr_M_, CUSPARSE_INDEX_BASE_ZERO); + + // info (data structure where factorization is stored) + // this is done in the constructor - however, this function might be called more than once + cusolverSpDestroyGluInfo(info_M_); + cusolverSpCreateGluInfo(&info_M_); + + cusolver_glu_enabled_ = true; + return 0; +} + +int RefactorizationSolver::initializeCusolverRf() +{ + if(!checkEvloserRfStatus(evloserRfCreate(&handle_rf_), "evloserRfCreate")) { + return -1; + } + +#if defined(HIOP_USE_HIP) || defined(HAVE_HIP) + /* + * hipSOLVER RF uses the default RF settings. Some CUDA RF tuning calls are + * not portable to HIP. + */ +#else + sp_status_ = evloserRfSetAlgs(handle_rf_, evloserRfFactorizationAlg2, evloserRfTriangularSolveAlg2); + if(!checkEvloserRfStatus(sp_status_, "evloserRfSetAlgs")) { + return -1; + } + + sp_status_ = evloserRfSetMatrixFormat(handle_rf_, evloserRfMatrixFormatCsr, evloserRfUnitDiagonalStoredL); + if(!checkEvloserRfStatus(sp_status_, "evloserRfSetMatrixFormat")) { + return -1; + } + + sp_status_ = evloserRfSetResetValuesFastMode(handle_rf_, evloserRfResetValuesFastModeOn); + if(!checkEvloserRfStatus(sp_status_, "evloserRfSetResetValuesFastMode")) { + return -1; + } + + const double boost = 1e-12; + const double zero = 1e-14; + + sp_status_ = evloserRfSetNumericProperties(handle_rf_, zero, boost); + if(!checkEvloserRfStatus(sp_status_, "evloserRfSetNumericProperties")) { + return -1; + } +#endif + + cusolver_rf_enabled_ = true; + return 0; +} + +// call if both the matrix and the nnz structure changed or if convergence is +// poor while using refactorization. +int RefactorizationSolver::refactorizationSetupCusolverGLU() +{ +#if defined(HIOP_USE_HIP) || defined(HAVE_HIP) + std::cerr << "EVLOSER GLU refactorization is not supported on HIP. Use RF instead.\n"; + return -1; +#endif + + // for now this ONLY WORKS if proceeded by KLU. Might be worth decoupling + // later + + // get sizes + const int nnzL = Numeric_->lnz; + const int nnzU = Numeric_->unz; + + const int nnzM = (nnzL + nnzU - n_); + + /* parse the factorization */ + + mia_ = new int[n_ + 1]{0}; + mja_ = new int[nnzM]{0}; + int* Lp = new int[n_ + 1]; + int* Li = new int[nnzL]; + // we can't use nullptr instead od Lx and Ux because it causes SEG FAULT. It + // seems like a waste of memory though. + + double* Lx = new double[nnzL]; + int* Up = new int[n_ + 1]; + int* Ui = new int[nnzU]; + + double* Ux = new double[nnzU]; + + (void)klu_extract(Numeric_, + Symbolic_, + Lp, + Li, + Lx, + Up, + Ui, + Ux, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + &Common_); + createM(n_, nnzL, Lp, Li, nnzU, Up, Ui); + + delete[] Lp; + delete[] Li; + delete[] Lx; + delete[] Up; + delete[] Ui; + delete[] Ux; + + /* setup GLU */ + sp_status_ = cusolverSpDgluSetup(handle_cusolver_, + n_, + nnz_, + descr_A_, + mat_A_csr_->host_irows(), // kRowPtr_, + mat_A_csr_->host_jcols(), // jCol_, + Numeric_->Pnum, /* base-0 */ + Symbolic_->Q, /* base-0 */ + nnzM, /* nnzM */ + descr_M_, + mia_, + mja_, + info_M_); + + sp_status_ = cusolverSpDgluBufferSize(handle_cusolver_, info_M_, &size_M_); + assert(CUSOLVER_STATUS_SUCCESS == sp_status_); + + buffer_size_ = size_M_; + checkGpuErrors(evloserGpuMalloc((void**)&d_work_, buffer_size_)); + + sp_status_ = cusolverSpDgluAnalysis(handle_cusolver_, info_M_, d_work_); + assert(CUSOLVER_STATUS_SUCCESS == sp_status_); + + // reset and refactor so factors are ON THE GPU + + sp_status_ = cusolverSpDgluReset(handle_cusolver_, + n_, + /* A is original matrix */ + nnz_, + descr_A_, + mat_A_csr_->device_vals(), + mat_A_csr_->device_irows(), + mat_A_csr_->device_jcols(), + info_M_); + + assert(CUSOLVER_STATUS_SUCCESS == sp_status_); + sp_status_ = cusolverSpDgluFactor(handle_cusolver_, info_M_, d_work_); + return 0; +} + +int RefactorizationSolver::refactorizationSetupCusolverRf() +{ + // For now this path requires a prior KLU factorization. + if(!validate_klu_factorization("cuSOLVER RF setup")) { + return -1; + } + + KluFactorData factors; + if(!extract_klu_factors(Numeric_, Symbolic_, Common_, n_, factors, silent_output_)) { + return -1; + } + + HostCsrFactor L_csr = convert_csc_to_csr(n_, factors.nnzL, factors.Lp, factors.Li, factors.Lx); + HostCsrFactor U_csr = convert_csc_to_csr(n_, factors.nnzU, factors.Up, factors.Ui, factors.Ux); + + if(!validate_host_csr_factor("L", n_, factors.nnzL, L_csr, silent_output_)) { + return -1; + } + + if(!validate_host_csr_factor("U", n_, factors.nnzU, U_csr, silent_output_)) { + return -1; + } + + checkGpuErrors(evloserGpuMalloc(&d_P_, n_ * sizeof(int))); + checkGpuErrors(evloserGpuMalloc(&d_Q_, n_ * sizeof(int))); + checkGpuErrors(evloserGpuMalloc(&d_T_, n_ * sizeof(double))); + + checkGpuErrors(evloserGpuMemcpy(d_P_, Numeric_->Pnum, n_ * sizeof(int), evloserMemcpyHostToDevice)); + checkGpuErrors(evloserGpuMemcpy(d_Q_, Symbolic_->Q, n_ * sizeof(int), evloserMemcpyHostToDevice)); + + sp_status_ = evloserRfSetupHost(n_, + nnz_, + mat_A_csr_->host_irows(), + mat_A_csr_->host_jcols(), + mat_A_csr_->host_vals(), + factors.nnzL, + L_csr.rowptr.data(), + L_csr.colind.data(), + L_csr.values.data(), + factors.nnzU, + U_csr.rowptr.data(), + U_csr.colind.data(), + U_csr.values.data(), + Numeric_->Pnum, + Symbolic_->Q, + handle_rf_); + if(!checkEvloserRfStatus(sp_status_, "evloserRfSetupHost")) { + return -1; + } + + if(analyzeEvloserRf("GPU RF analysis") != 0) { + return -1; + } + + return refactorizeEvloserRf("GPU RF initial refactorization"); +} + +// Error checking utility for GPU backend calls +// KS: might later become part of src/Utils, putting it here for now +template +void RefactorizationSolver::evloserCheckGpuError(T result, const char* const file, int const line) +{ + if(result != evloserGpuSuccess) { + std::cout << "GPU backend error at " << file << ":" << line + << ", error# " << static_cast(result) + << ": " << evloserGpuGetErrorString(result) << "\n"; + assert(false); + } +} +#endif +} // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp new file mode 100644 index 0000000..0e3f029 --- /dev/null +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp @@ -0,0 +1,365 @@ +// +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file RefactorizationSolver.hpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#pragma once + +#include "klu.h" +#include "evloser_execution_mode.hpp" +#include "evloser_gpu_defs.hpp" +#include +#include + +namespace EVLOSER +{ + +class MatrixCsr; +class IterativeRefinement; + +/** + * @brief Implements refactorization solvers using KLU and optional GPU sparse solver libraries + * + */ +class RefactorizationSolver +{ +public: + enum class KluRecoveryAction + { + None, + RefactorAccepted, + FullFactorAccepted, + RefactorRetained, + Failed + }; + // constructor + // RefactorizationSolver(); + RefactorizationSolver(int n, ExecutionMode execution_mode); + ~RefactorizationSolver(); + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + + /// Enable allocation and use of iterative refinement. + void enable_iterative_refinement(); + + /// Disable iterative refinement and release its owned state. + void disable_iterative_refinement(); + + /// Return true when iterative refinement is enabled, allocated, and requested. + bool iterative_refinement_active() const; + + /// Attach the current CSR matrix to the iterative refinement object. + void setup_iterative_refinement_matrix(int n, int nnz); + void configure_iterative_refinement(cusparseHandle_t cusparse_handle, + cublasHandle_t cublas_handle, + evloserRfHandle_t cusolverrf_handle, + int n, + double* d_T, + int* d_P, + int* d_Q, + double* devx, + double* devr); +#endif + + /** + * @brief Set the number of nonzeros in system matrix. + * + * @param nnz + */ + void set_nnz(int nnz) { nnz_ = nnz; } + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + IterativeRefinement* ir() { return ir_; } + double* devr() { return devr_; } +#endif + + MatrixCsr* mat_A_csr() { return mat_A_csr_; } + + int& ordering() { return ordering_; } + + std::string& fact() { return fact_; } + + std::string& refact() { return refact_; } + + std::string& use_ir() { return use_ir_; } + + void set_silent_output(bool silent_output) { silent_output_ = silent_output; } + + double& klu_suspicious_residual_threshold() + { + return klu_suspicious_residual_threshold_; + } + + double& klu_residual_safety_limit() + { + return klu_residual_safety_limit_; + } + + double& klu_improvement_ratio() + { + return klu_improvement_ratio_; + } + + double& klu_minimum_improvement() + { + return klu_minimum_improvement_; + } + + KluRecoveryAction last_klu_recovery_action() const + { + return last_klu_recovery_action_; + } + + /** + * @brief Set up factorization of the first linear system. + * + * @return int + */ + int setup_factorization(); + + /** + * @brief Factorize system matrix + * + * @return int - factorization status: success=0, failure=-1 + */ + int factorize(); + + /** + * @brief Set the up the refactorization + * + */ + void setup_refactorization(); + + /** + * @brief Refactorize system matrix + * + * @return int + */ + int refactorize(); + + /** + * @brief Solve the factored linear system. + * + * In CPU execution mode, dx must point to host memory. In CUDA or HIP + * execution mode, dx must point to device memory. + * + * @param dx rhs on entry and solution on return. + * @param tol ir tolerance for GPU execution. + * @return bool true when the solve succeeds and the solution is finite. + */ + bool triangular_solve(double* dx, double tol); + +private: + int n_{0}; ///< Size of the linear system + int nnz_{0}; ///< Number of nonzeros in the system's matrix + + const ExecutionMode execution_mode_; ///< Selected CPU, CUDA, or HIP execution path + + MatrixCsr* mat_A_csr_{nullptr}; ///< System matrix in nonsymmetric CSR format + IterativeRefinement* ir_{nullptr}; ///< Iterative refinement class + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + bool cusolver_glu_enabled_{false}; ///< GLU refactorization enabled flag + bool cusolver_rf_enabled_{false}; ///< Rf refactorization enabled flag + bool iterative_refinement_enabled_{false}; ///< Iterative refinement on/off flag +#endif + bool is_first_solve_{true}; ///< If it is first call to triangular solver + + // Options + int ordering_{-1}; + std::string fact_; + std::string refact_; + std::string use_ir_; + bool silent_output_{true}; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + /** needed for GPU sparse solver **/ + + cusolverStatus_t sp_status_; + cusparseHandle_t handle_ = 0; + cusolverSpHandle_t handle_cusolver_ = nullptr; + cublasHandle_t handle_cublas_; + + cusparseMatDescr_t descr_A_; + cusparseMatDescr_t descr_M_; + csrluInfoHost_t info_lu_ = nullptr; + csrgluInfo_t info_M_ = nullptr; + + evloserRfHandle_t handle_rf_ = nullptr; + size_t buffer_size_{0}; + size_t size_M_{0}; + double* d_work_{nullptr}; + int ite_refine_succ_ = 0; + double r_nrminf_{0.0}; +#endif + + // KLU stuff + int klu_status_; + klu_common Common_{}; + klu_symbolic* Symbolic_ = nullptr; + klu_numeric* Numeric_ = nullptr; + + bool klu_refactor_pending_validation_{false}; + bool klu_refactor_succeeded_{false}; + + double klu_suspicious_residual_threshold_{1e-4}; + double klu_residual_safety_limit_{1e-1}; + double klu_improvement_ratio_{0.1}; + double klu_minimum_improvement_{1e-10}; + + KluRecoveryAction last_klu_recovery_action_{ + KluRecoveryAction::None}; + /*pieces of M */ + int* mia_ = nullptr; + int* mja_ = nullptr; + + /* CPU data */ + double* hostx_ = nullptr; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + /* for GPU data */ + double* devx_ = nullptr; + double* devr_ = nullptr; + + /* needed for GPU Rf */ + int* d_P_ = nullptr; + int* d_Q_ = nullptr; // permutation matrices + double* d_T_ = nullptr; +#endif + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + /** + * @brief Function that computes M = (L-I) + U + * + * @param n + * @param nnzL + * @param Lp + * @param Li + * @param nnzU + * @param Up + * @param Ui + * @return int + */ + int createM(const int n, const int nnzL, const int* Lp, const int* Li, const int nnzU, const int* Up, const int* Ui); +#endif + + /// Validate the current CSR system matrix before solver setup or refactorization. + bool validate_system_matrix(const char* caller) const; + + /// Validate that KLU symbolic and numeric factors are available and dimensionally consistent. + bool validate_klu_factorization(const char* caller) const; + + /// Validate that the solution pointer is non-null and all solution values are finite. + bool validate_solution(const double* solution, const char* caller) const; + + /// Compute the normalized infinity-norm residual for the current CSR matrix. + double compute_klu_residual(const double* rhs, + const double* solution) const; + + /// Create fresh KLU numeric factors without replacing the currently retained factors. + klu_numeric* factor_klu_numeric(const char* caller); + + /// Solve one candidate system and compute its normalized residual. + bool solve_klu_candidate(klu_numeric* numeric, + const double* rhs, + std::vector& solution, + double& residual, + const char* caller); + + /// Complete a CPU solve, including refactorization recovery when required. + bool solve_cpu_with_recovery(double* dx); + + int initializeKLU(); + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + + int initializeCusolverGLU(); + int initializeCusolverRf(); + + int refactorizationSetupCusolverGLU(); + int refactorizationSetupCusolverRf(); + + /// Check and report a GPU RF status value. + bool checkEvloserRfStatus(evloserRfStatus_t status, const char* caller) const; + + /// Reset GPU RF values using the current device CSR matrix. + int resetEvloserRfValues(const char* caller); + + /// Run GPU RF analysis on the configured RF handle. + int analyzeEvloserRf(const char* caller); + + /// Run GPU RF numeric refactorization on the configured RF handle. + int refactorizeEvloserRf(const char* caller); + + /** + * @brief Check for GPU backend errors. + * + * @tparam T - type of the result + * @param result - result value + * @param file - file name where the error occurred + * @param line - line at which the error occurred + */ + template + void evloserCheckGpuError(T result, const char* const file, int const line); +#endif +}; + +} // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/evloser_cpu_defs.hpp b/src/LinAlg/EVLOSER/evloser_cpu_defs.hpp new file mode 100644 index 0000000..a6f8be8 --- /dev/null +++ b/src/LinAlg/EVLOSER/evloser_cpu_defs.hpp @@ -0,0 +1,26 @@ +#ifndef EVLOSER_CPU_DEFS_HPP +#define EVLOSER_CPU_DEFS_HPP + +#if !defined(HIOP_USE_CUDA) && !defined(HAVE_CUDA) && \ + !defined(HIOP_USE_HIP) && !defined(HAVE_HIP) + +using evloserGpuError_t = int; +using evloserGpuMemcpyKind_t = int; + +using cusolverStatus_t = int; +using cusparseHandle_t = void*; +using cusolverSpHandle_t = void*; +using cublasHandle_t = void*; +using cusparseMatDescr_t = void*; +using csrluInfoHost_t = void*; +using csrgluInfo_t = void*; + +using evloserRfStatus_t = int; +using evloserRfHandle_t = void*; + +static constexpr evloserGpuError_t evloserGpuSuccess = 0; +static constexpr evloserRfStatus_t evloserRfSuccess = 0; + +#endif + +#endif // EVLOSER_CPU_DEFS_HPP \ No newline at end of file diff --git a/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp b/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp new file mode 100644 index 0000000..dae7039 --- /dev/null +++ b/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp @@ -0,0 +1,291 @@ +// +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. HiOp +// is released under the BSD 3-clause license (https://opensource.org/licenses/BSD-3-Clause). +// Please also read “Additional BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, this list +// of conditions and the disclaimer below. +// ii. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may be used to +// endorse or promote products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +// SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. Department +// of Energy (DOE). This work was produced at Lawrence Livermore National Laboratory under +// Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National Security, LLC +// nor any of their employees, makes any warranty, express or implied, or assumes any +// liability or responsibility for the accuracy, completeness, or usefulness of any +// information, apparatus, product, or process disclosed, or represents that its use would +// not infringe privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or services by +// trade name, trademark, manufacturer or otherwise does not necessarily constitute or +// imply its endorsement, recommendation, or favoring by the United States Government or +// Lawrence Livermore National Security, LLC. The views and opinions of authors expressed +// herein do not necessarily state or reflect those of the United States Government or +// Lawrence Livermore National Security, LLC, and shall not be used for advertising or +// product endorsement purposes. + +/** + * @file evloser_cusolver_defs.hpp + * + * @author Kasia Swirydowicz , PNNL + * + * Contains prototypes of cuSOLVER functions not in public API. + * + */ + +#ifndef CUSOLVERDEFS_H +#define CUSOLVERDEFS_H + +#include "cusparse.h" +#include "cusolverSp.h" +#include +#include +#include +#include "cusolverSp_LOWLEVEL_PREVIEW.h" +#include + +using evloserGpuError_t = cudaError_t; +using evloserGpuMemcpyKind_t = cudaMemcpyKind; + +static const evloserGpuError_t evloserGpuSuccess = cudaSuccess; +static const evloserGpuMemcpyKind_t evloserMemcpyHostToDevice = cudaMemcpyHostToDevice; +static const evloserGpuMemcpyKind_t evloserMemcpyDeviceToHost = cudaMemcpyDeviceToHost; +static const evloserGpuMemcpyKind_t evloserMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice; + +inline evloserGpuError_t evloserGpuMalloc(void** ptr, size_t size) +{ + return cudaMalloc(ptr, size); +} + +template +inline evloserGpuError_t evloserGpuMalloc(T** ptr, size_t size) +{ + return cudaMalloc(reinterpret_cast(ptr), size); +} + +inline evloserGpuError_t evloserGpuFree(void* ptr) +{ + return cudaFree(ptr); +} + +inline evloserGpuError_t evloserGpuMemcpy(void* dst, const void* src, size_t count, evloserGpuMemcpyKind_t kind) +{ + return cudaMemcpy(dst, src, count, kind); +} + +inline evloserGpuError_t evloserGpuDeviceSynchronize() +{ + return cudaDeviceSynchronize(); +} + +inline const char* evloserGpuGetErrorString(evloserGpuError_t status) +{ + return cudaGetErrorString(status); +} + +#include "cusolverRf.h" + +using evloserRfStatus_t = cusolverStatus_t; +using evloserRfHandle_t = cusolverRfHandle_t; +using evloserRfFactorization_t = cusolverRfFactorization_t; +using evloserRfTriangularSolve_t = cusolverRfTriangularSolve_t; +using evloserRfMatrixFormat_t = cusolverRfMatrixFormat_t; +using evloserRfUnitDiagonal_t = cusolverRfUnitDiagonal_t; +using evloserRfResetValuesFastMode_t = cusolverRfResetValuesFastMode_t; + +static const evloserRfStatus_t evloserRfSuccess = CUSOLVER_STATUS_SUCCESS; +static const evloserRfFactorization_t evloserRfFactorizationAlg2 = CUSOLVERRF_FACTORIZATION_ALG2; +static const evloserRfTriangularSolve_t evloserRfTriangularSolveAlg2 = CUSOLVERRF_TRIANGULAR_SOLVE_ALG2; +static const evloserRfMatrixFormat_t evloserRfMatrixFormatCsr = CUSOLVERRF_MATRIX_FORMAT_CSR; +static const evloserRfUnitDiagonal_t evloserRfUnitDiagonalStoredL = CUSOLVERRF_UNIT_DIAGONAL_STORED_L; +static const evloserRfResetValuesFastMode_t evloserRfResetValuesFastModeOn = CUSOLVERRF_RESET_VALUES_FAST_MODE_ON; + +inline evloserRfStatus_t evloserRfCreate(evloserRfHandle_t* handle) +{ + return cusolverRfCreate(handle); +} + +inline evloserRfStatus_t evloserRfDestroy(evloserRfHandle_t handle) +{ + return cusolverRfDestroy(handle); +} + +inline evloserRfStatus_t evloserRfSetAlgs(evloserRfHandle_t handle, + evloserRfFactorization_t fact_alg, + evloserRfTriangularSolve_t solve_alg) +{ + return cusolverRfSetAlgs(handle, fact_alg, solve_alg); +} + +inline evloserRfStatus_t evloserRfSetMatrixFormat(evloserRfHandle_t handle, + evloserRfMatrixFormat_t format, + evloserRfUnitDiagonal_t diag) +{ + return cusolverRfSetMatrixFormat(handle, format, diag); +} + +inline evloserRfStatus_t evloserRfSetResetValuesFastMode(evloserRfHandle_t handle, + evloserRfResetValuesFastMode_t fast_mode) +{ + return cusolverRfSetResetValuesFastMode(handle, fast_mode); +} + +inline evloserRfStatus_t evloserRfSetNumericProperties(evloserRfHandle_t handle, double zero, double boost) +{ + return cusolverRfSetNumericProperties(handle, zero, boost); +} + +inline evloserRfStatus_t evloserRfSetupHost(int n, + int nnzA, + int* csrRowPtrA, + int* csrColIndA, + double* csrValA, + int nnzL, + int* csrRowPtrL, + int* csrColIndL, + double* csrValL, + int nnzU, + int* csrRowPtrU, + int* csrColIndU, + double* csrValU, + int* P, + int* Q, + evloserRfHandle_t handle) +{ + return cusolverRfSetupHost(n, + nnzA, + csrRowPtrA, + csrColIndA, + csrValA, + nnzL, + csrRowPtrL, + csrColIndL, + csrValL, + nnzU, + csrRowPtrU, + csrColIndU, + csrValU, + P, + Q, + handle); +} + +inline evloserRfStatus_t evloserRfResetValues(int n, + int nnzA, + int* csrRowPtrA, + int* csrColIndA, + double* csrValA, + int* P, + int* Q, + evloserRfHandle_t handle) +{ + return cusolverRfResetValues(n, nnzA, csrRowPtrA, csrColIndA, csrValA, P, Q, handle); +} + +inline evloserRfStatus_t evloserRfAnalyze(evloserRfHandle_t handle) +{ + return cusolverRfAnalyze(handle); +} + +inline evloserRfStatus_t evloserRfRefactor(evloserRfHandle_t handle) +{ + return cusolverRfRefactor(handle); +} + +inline evloserRfStatus_t evloserRfSolve(evloserRfHandle_t handle, + int* P, + int* Q, + int nrhs, + double* Temp, + int ldt, + double* XF, + int ldxf) +{ + return cusolverRfSolve(handle, P, Q, nrhs, Temp, ldt, XF, ldxf); +} + +extern "C" { +/* + * prototype not in public header file + */ +struct csrgluInfo; +typedef struct csrgluInfo* csrgluInfo_t; + +cusolverStatus_t CUSOLVERAPI cusolverSpCreateGluInfo(csrgluInfo_t* info); + +cusolverStatus_t CUSOLVERAPI cusolverSpDestroyGluInfo(csrgluInfo_t info); + +cusolverStatus_t CUSOLVERAPI cusolverSpDgluSetup(cusolverSpHandle_t handle, + int m, + /* A can be base-0 or base-1 */ + int nnzA, + const cusparseMatDescr_t descrA, + const int* h_csrRowPtrA, + const int* h_csrColIndA, + const int* h_P, /* base-0 */ + const int* h_Q, /* base-0 */ + /* M can be base-0 or base-1 */ + int nnzM, + const cusparseMatDescr_t descrM, + const int* h_csrRowPtrM, + const int* h_csrColIndM, + csrgluInfo_t info); + +cusolverStatus_t CUSOLVERAPI cusolverSpDgluBufferSize(cusolverSpHandle_t handle, csrgluInfo_t info, size_t* pBufferSize); + +cusolverStatus_t CUSOLVERAPI cusolverSpDgluAnalysis(cusolverSpHandle_t handle, csrgluInfo_t info, void* workspace); + +cusolverStatus_t CUSOLVERAPI cusolverSpDgluReset(cusolverSpHandle_t handle, + int m, + /* A is original matrix */ + int nnzA, + const cusparseMatDescr_t descr_A, + const double* d_csrValA, + const int* d_csrRowPtrA, + const int* d_csrColIndA, + csrgluInfo_t info); + +cusolverStatus_t CUSOLVERAPI cusolverSpDgluFactor(cusolverSpHandle_t handle, csrgluInfo_t info, void* workspace); + +cusolverStatus_t CUSOLVERAPI cusolverSpDgluSolve(cusolverSpHandle_t handle, + int m, + /* A is original matrix */ + int nnzA, + const cusparseMatDescr_t descr_A, + const double* d_csrValA, + const int* d_csrRowPtrA, + const int* d_csrColIndA, + const double* d_b0, /* right hand side */ + double* d_x, /* left hand side */ + int* ite_refine_succ, + double* r_nrminf_ptr, + csrgluInfo_t info, + void* workspace); + +cusolverStatus_t CUSOLVERAPI cusolverSpDnrminf(cusolverSpHandle_t handle, + int n, + const double* x, + double* result, /* |x|_inf, host */ + void* d_work); /* at least 8192 bytes */ + +} // extern "C" + +#endif // CUSOLVERDEFS_H diff --git a/src/LinAlg/EVLOSER/evloser_execution_mode.hpp b/src/LinAlg/EVLOSER/evloser_execution_mode.hpp new file mode 100644 index 0000000..fddf86c --- /dev/null +++ b/src/LinAlg/EVLOSER/evloser_execution_mode.hpp @@ -0,0 +1,16 @@ +#ifndef EVLOSER_EXECUTION_MODE_HPP +#define EVLOSER_EXECUTION_MODE_HPP + +namespace EVLOSER +{ + +enum class ExecutionMode +{ + CPU, + CUDA, + HIP +}; + +} // namespace EVLOSER + +#endif diff --git a/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp b/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp new file mode 100644 index 0000000..15eaa64 --- /dev/null +++ b/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp @@ -0,0 +1,25 @@ +/** + * @file evloser_gpu_defs.hpp + * + * Selects CUDA, HIP or CPU backend definitions for EVLOSER. + * + */ + +#ifndef EVLOSER_GPU_DEFS_H +#define EVLOSER_GPU_DEFS_H + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) + +#include "evloser_cusolver_defs.hpp" + +#elif defined(HIOP_USE_HIP) || defined(HAVE_HIP) + +#include "evloser_hipsolver_defs.hpp" + +#else + +#include "evloser_cpu_defs.hpp" + +#endif + +#endif // EVLOSER_GPU_DEFS_H diff --git a/src/LinAlg/EVLOSER/evloser_hipsolver_defs.hpp b/src/LinAlg/EVLOSER/evloser_hipsolver_defs.hpp new file mode 100644 index 0000000..adb095f --- /dev/null +++ b/src/LinAlg/EVLOSER/evloser_hipsolver_defs.hpp @@ -0,0 +1,267 @@ +/** + * @file evloser_hipsolver_defs.hpp + * + * Defines HIP GPU backend wrappers used by EVLOSER. + * + */ + +#ifndef EVLOSER_HIPSOLVER_DEFS_H +#define EVLOSER_HIPSOLVER_DEFS_H + +#include + +#include +#include +#include +#include + +using evloserGpuError_t = hipError_t; +using evloserGpuMemcpyKind_t = hipMemcpyKind; + +static const evloserGpuError_t evloserGpuSuccess = hipSuccess; +static const evloserGpuMemcpyKind_t evloserMemcpyHostToDevice = hipMemcpyHostToDevice; +static const evloserGpuMemcpyKind_t evloserMemcpyDeviceToHost = hipMemcpyDeviceToHost; +static const evloserGpuMemcpyKind_t evloserMemcpyDeviceToDevice = hipMemcpyDeviceToDevice; + +inline evloserGpuError_t evloserGpuMalloc(void** ptr, size_t size) +{ + return hipMalloc(ptr, size); +} + +template +inline evloserGpuError_t evloserGpuMalloc(T** ptr, size_t size) +{ + return hipMalloc(reinterpret_cast(ptr), size); +} + +inline evloserGpuError_t evloserGpuFree(void* ptr) +{ + return hipFree(ptr); +} + +inline evloserGpuError_t evloserGpuMemcpy(void* dst, const void* src, size_t count, evloserGpuMemcpyKind_t kind) +{ + return hipMemcpy(dst, src, count, kind); +} + +inline evloserGpuError_t evloserGpuDeviceSynchronize() +{ + return hipDeviceSynchronize(); +} + +inline const char* evloserGpuGetErrorString(evloserGpuError_t status) +{ + return hipGetErrorString(status); +} + +/* + * Compatibility aliases for EVLOSER code that still uses CUDA-style sparse + * solver handle names. The EVLOSER source keeps those names so the HIP path + * stays close to the original ReSolve implementation. + */ +using cusolverStatus_t = hipsolverStatus_t; +using cusparseHandle_t = hipsparseHandle_t; +using cusolverSpHandle_t = hipsolverSpHandle_t; +using cublasHandle_t = hipblasHandle_t; +using cusparseMatDescr_t = hipsparseMatDescr_t; + +#define CUSOLVER_STATUS_SUCCESS HIPSOLVER_STATUS_SUCCESS + +#define cusparseCreate hipsparseCreate +#define cusparseDestroy hipsparseDestroy +#define cusparseCreateMatDescr hipsparseCreateMatDescr +#define cusparseDestroyMatDescr hipsparseDestroyMatDescr +#define cusparseSetMatType hipsparseSetMatType +#define cusparseSetMatIndexBase hipsparseSetMatIndexBase + +#define CUSPARSE_MATRIX_TYPE_GENERAL HIPSPARSE_MATRIX_TYPE_GENERAL +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO + +#define cusolverSpCreate hipsolverSpCreate +#define cusolverSpDestroy hipsolverSpDestroy + +#define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy + +using evloserRfStatus_t = hipsolverStatus_t; +using evloserRfHandle_t = hipsolverRfHandle_t; +using evloserRfFactorization_t = hipsolverRfFactorization_t; +using evloserRfTriangularSolve_t = hipsolverRfTriangularSolve_t; +using evloserRfMatrixFormat_t = hipsolverRfMatrixFormat_t; +using evloserRfUnitDiagonal_t = hipsolverRfUnitDiagonal_t; +using evloserRfResetValuesFastMode_t = hipsolverRfResetValuesFastMode_t; + +static const evloserRfStatus_t evloserRfSuccess = HIPSOLVER_STATUS_SUCCESS; +static const evloserRfFactorization_t evloserRfFactorizationAlg2 = HIPSOLVERRF_FACTORIZATION_ALG2; +static const evloserRfTriangularSolve_t evloserRfTriangularSolveAlg2 = HIPSOLVERRF_TRIANGULAR_SOLVE_ALG2; +static const evloserRfMatrixFormat_t evloserRfMatrixFormatCsr = HIPSOLVERRF_MATRIX_FORMAT_CSR; +static const evloserRfUnitDiagonal_t evloserRfUnitDiagonalStoredL = HIPSOLVERRF_UNIT_DIAGONAL_STORED_L; +static const evloserRfResetValuesFastMode_t evloserRfResetValuesFastModeOn = HIPSOLVERRF_RESET_VALUES_FAST_MODE_ON; + +inline evloserRfStatus_t evloserRfCreate(evloserRfHandle_t* handle) +{ + return hipsolverRfCreate(handle); +} + +inline evloserRfStatus_t evloserRfDestroy(evloserRfHandle_t handle) +{ + return hipsolverRfDestroy(handle); +} + +inline evloserRfStatus_t evloserRfSetAlgs(evloserRfHandle_t handle, + evloserRfFactorization_t fact_alg, + evloserRfTriangularSolve_t solve_alg) +{ + return hipsolverRfSetAlgs(handle, fact_alg, solve_alg); +} + +inline evloserRfStatus_t evloserRfSetMatrixFormat(evloserRfHandle_t handle, + evloserRfMatrixFormat_t format, + evloserRfUnitDiagonal_t diag) +{ + return hipsolverRfSetMatrixFormat(handle, format, diag); +} + +inline evloserRfStatus_t evloserRfSetResetValuesFastMode(evloserRfHandle_t handle, + evloserRfResetValuesFastMode_t fast_mode) +{ + return hipsolverRfSetResetValuesFastMode(handle, fast_mode); +} + +inline evloserRfStatus_t evloserRfSetNumericProperties(evloserRfHandle_t handle, double zero, double boost) +{ + return hipsolverRfSetNumericProperties(handle, zero, boost); +} + +inline evloserRfStatus_t evloserRfSetupHost(int n, + int nnzA, + int* csrRowPtrA, + int* csrColIndA, + double* csrValA, + int nnzL, + int* csrRowPtrL, + int* csrColIndL, + double* csrValL, + int nnzU, + int* csrRowPtrU, + int* csrColIndU, + double* csrValU, + int* P, + int* Q, + evloserRfHandle_t handle) +{ + return hipsolverRfSetupHost(n, + nnzA, + csrRowPtrA, + csrColIndA, + csrValA, + nnzL, + csrRowPtrL, + csrColIndL, + csrValL, + nnzU, + csrRowPtrU, + csrColIndU, + csrValU, + P, + Q, + handle); +} + +inline evloserRfStatus_t evloserRfResetValues(int n, + int nnzA, + int* csrRowPtrA, + int* csrColIndA, + double* csrValA, + int* P, + int* Q, + evloserRfHandle_t handle) +{ + return hipsolverRfResetValues(n, nnzA, csrRowPtrA, csrColIndA, csrValA, P, Q, handle); +} + +inline evloserRfStatus_t evloserRfAnalyze(evloserRfHandle_t handle) +{ + return hipsolverRfAnalyze(handle); +} + +inline evloserRfStatus_t evloserRfRefactor(evloserRfHandle_t handle) +{ + return hipsolverRfRefactor(handle); +} + +inline evloserRfStatus_t evloserRfSolve(evloserRfHandle_t handle, + int* P, + int* Q, + int nrhs, + double* Temp, + int ldt, + double* XF, + int ldxf) +{ + return hipsolverRfSolve(handle, P, Q, nrhs, Temp, ldt, XF, ldxf); +} + +/* + * cuSOLVER GLU is CUDA-only. These stubs allow HIP EVLOSER builds to compile + * code paths that are present in the shared implementation but not used by the + * HIP RF validation path. + */ +using csrluInfoHost_t = void*; +using csrgluInfo_t = void*; + +template +inline cusolverStatus_t cusolverSpCreateGluInfo(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDestroyGluInfo(Args...) +{ + return CUSOLVER_STATUS_SUCCESS; +} + +template +inline cusolverStatus_t cusolverSpDgluSetup(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDgluBufferSize(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDgluAnalysis(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDgluReset(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDgluFactor(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDgluSolve(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDnrminf(Args...) +{ + return static_cast(1); +} + +#endif // EVLOSER_HIPSOLVER_DEFS_H diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp new file mode 100644 index 0000000..8994133 --- /dev/null +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp @@ -0,0 +1,656 @@ +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file hiopLinSolverSparseEVLOSER.cpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#include "hiopLinSolverSparseEVLOSER.hpp" +#include "EVLOSER/RefactorizationSolver.hpp" +#include "EVLOSER/MatrixCsr.hpp" +#include "EVLOSER/evloser_gpu_defs.hpp" +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +#include "EVLOSER/IterativeRefinement.hpp" +#endif + +#include "hiop_blasdefs.hpp" + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) +#include "cusparse_v2.h" +#endif + +#include +#include +#include +#include +#include +#include + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +#define checkGpuErrors(val) hiopCheckGpuError((val), __FILE__, __LINE__) +#endif + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) +/** + * @brief Map elements of one array to the other + * + * for(int k = 0; k < nnz_; k++) { + * vals[k] = M_->M()[index_convert_CSR2Triplet_host_[k]]; + * } + * + */ +template +__global__ void evloser_map_arrays_kernel(T* dst, const T* src, const I* mapidx, I n) +{ + I tid = blockDim.x * blockIdx.x + threadIdx.x; + + if(tid < n) { + dst[tid] = src[mapidx[tid]]; + } +} +#endif + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) +/** + * @brief Map elements of one array to the other + * + * for(int i = 0; i < n_; i++) { + * if(index_convert_extra_Diag2CSR_host_[i] != -1) + * vals[index_convert_extra_Diag2CSR_host_[i]] += M_->M()[M_->numberOfNonzeros() - n_ + i]; + * } + * + */ +template +__global__ void evloser_add_to_array_kernel(T* dst, const T* src, const I* mapidx, I n, I nnz) +{ + I tid = blockDim.x * blockIdx.x + threadIdx.x; + + if(tid < n) { + if(mapidx[tid] != -1) dst[mapidx[tid]] += src[nnz - n + tid]; + } +} +#endif + +namespace hiop +{ +namespace +{ + +EVLOSER::ExecutionMode select_evloser_execution_mode(hiopNlpFormulation* nlp) +{ + const std::string mem_space = nlp->options->GetString("mem_space"); + + if(mem_space == "host" || mem_space == "default") { + return EVLOSER::ExecutionMode::CPU; + } + + if(mem_space == "device") { +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) + return EVLOSER::ExecutionMode::CUDA; +#elif defined(HIOP_USE_HIP) || defined(HAVE_HIP) + return EVLOSER::ExecutionMode::HIP; +#else + nlp->log->printf(hovError, + "EVLOSER device execution was requested, but HiOp was not built with CUDA or HIP support.\n"); + std::abort(); +#endif + } + + nlp->log->printf(hovError, "Memory space %s is incompatible with EVLOSER.\n", mem_space.c_str()); + std::abort(); +} + +} // namespace + +hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const int& nnz, hiopNlpFormulation* nlp) + : hiopLinSolverSymSparse(n, nnz, nlp), + execution_mode_{select_evloser_execution_mode(nlp)}, + solver_{nullptr}, + m_{n}, + n_{n}, + nnz_{0}, + index_convert_CSR2Triplet_host_{nullptr}, + index_convert_extra_Diag2CSR_host_{nullptr}, +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + index_convert_CSR2Triplet_device_{nullptr}, + index_convert_extra_Diag2CSR_device_{nullptr}, +#endif + factorizationSetupSucc_{0}, + is_first_call_{true} +{ + // Create embedded EVLOSER refactorization solver + solver_ = new EVLOSER::RefactorizationSolver(n, execution_mode_); + + // Device execution requires a host mirror for HiOp's KKT matrix. + if(execution_mode_ != EVLOSER::ExecutionMode::CPU) { + M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz); + } + + // Set embedded solver verbosity based on HiOp verbosity + if(nlp_->options->GetInteger("verbosity_level") >= 3) { + solver_->set_silent_output(false); + } + + // Select matrix ordering + int ordering = 1; + std::string ord = nlp_->options->GetString("linear_solver_sparse_ordering"); + if(ord == "amd_ssparse") { + ordering = 0; + } else if(ord == "colamd_ssparse") { + ordering = 1; + } else { + nlp_->log->printf(hovWarning, "Ordering %s not compatible with EVLOSER sparse solver, using default ...\n", ord.c_str()); + ordering = 1; + } + solver_->ordering() = ordering; + nlp_->log->printf(hovSummary, "Ordering: %d\n", solver_->ordering()); + + // Select factorization + std::string fact; + fact = nlp_->options->GetString("resolve_factorization"); + if(fact != "klu") { + nlp_->log->printf(hovWarning, "Factorization %s not compatible with EVLOSER sparse solver, using default ...\n", fact.c_str()); + fact = "klu"; + } + solver_->fact() = fact; + nlp_->log->printf(hovSummary, "Factorization: %s\n", solver_->fact().c_str()); + + // Select refactorization + std::string refact; + refact = nlp_->options->GetString("resolve_refactorization"); + if(refact != "glu" && refact != "rf") { + nlp_->log->printf(hovWarning, "Refactorization %s not compatible with EVLOSER sparse solver, using default ...\n", refact.c_str()); + refact = "glu"; + } + solver_->refact() = refact; + nlp_->log->printf(hovSummary, "Refactorization: %s\n", solver_->refact().c_str()); + + // by default, dont use iterative refinement + std::string use_ir{"no"}; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ != EVLOSER::ExecutionMode::CPU) { + int maxit_test = nlp_->options->GetInteger("ir_inner_maxit"); + + + if((maxit_test < 0) || (maxit_test > 1000)) { + nlp_->log->printf(hovWarning, + "Wrong maxit value: %d. Use int maxit value between 0 and 1000. Setting default (50) ...\n", + maxit_test); + maxit_test = 50; + } +#if defined(HIOP_USE_HIP) || defined(HAVE_HIP) + // EVLOSER iterative refinement currently depends on CUDA-only kernels. + // Keep the HIP path on RF only until the IR path is ported. + solver_->disable_iterative_refinement(); + #else + if(maxit_test > 0) { + use_ir = "yes"; + solver_->enable_iterative_refinement(); + solver_->ir()->maxit() = maxit_test; + } else { + solver_->disable_iterative_refinement(); + } + #endif + if(use_ir == "yes") { + if((refact == "rf")) { + solver_->ir()->restart() = nlp_->options->GetInteger("ir_inner_restart"); + + if((solver_->ir()->restart() < 0) || (solver_->ir()->restart() > 100)) { + nlp_->log->printf(hovWarning, + "Wrong restart value: %d. Use int restart value between 1 and 100. Setting default (20) ...\n", + solver_->ir()->restart()); + solver_->ir()->restart() = 20; + } + + solver_->ir()->tol() = nlp_->options->GetNumeric("ir_inner_tol"); + if((solver_->ir()->tol() < 0) || (solver_->ir()->tol() > 1)) { + nlp_->log->printf(hovWarning, + "Wrong tol value: %e. Use double tol value between 0 and 1. Setting default (1e-12) ...\n", + solver_->ir()->tol()); + solver_->ir()->tol() = 1e-12; + } + solver_->ir()->orth_option() = nlp_->options->GetString("ir_inner_gs_scheme"); + + /* 0) "Standard" GMRES and FGMRES (Saad and Schultz, 1986, Saad, 1992) use Modified Gram-Schmidt ("mgs") to keep the + * Krylov vectors orthogonal. Modified Gram-Schmidt requires k synchronization (due to inner products) in iteration k + * and this becomes a scaling bottleneck for GPU-accelerated implementation and it becomes even more pronouced for + * MPI+GPU-acceleration. Modified Gram-Schidt can be replaced by a different scheme. + * + * 1) One can use Classical Gram-Schmidt ("cgs") which is numerically unstable or reorthogonalized Classical + * Gram-Schmidt ("cgs2"), which is numerically stable and requires 3 synchrnozations and each iteration. + * Reorthogonalized Classical Gram-Schmidt makes two passes of Classical Gram-Schmidt. And two passes are enough to get + * vectors orthogonal to machine precision (Bjorck 1967). + * + * 2) An alternative is a low-sych version (Swirydowicz and Thomas, 2020), which reformulates Modified Gram-Schmidt to + * be a (very small) triangular solve. It requires extra storage for the matrix used in triangular solve (kxk at + * iteration k), but only two sycnhronizations are needed per iteration. The inner producats are performed in bulk, + * which quarantees better GPU utilization. The second synchronization comes from normalizing the vector and can be + * eliminated if the norm is postponed to the next iteration, but also makes code more complicated. This is why we use + * two-synch method ("mgs_two_synch") + * + * 3) A recently submitted paper by Stephen Thomas (Thomas 202*) takes the triangular solve idea further and uses a + * different approximation for the inverse of a triangular matrix. It requires two (very small) triangular solves and + * two sychroniztions (if the norm is NOT delayed). It also guarantees that the vectors are orthogonal to the machine + * epsilon, as in cgs2. Since Stephen's paper is named "post modern GMRES", we call this Gram-Schmidt scheme "mgs_pm". + */ + if(solver_->ir()->orth_option() != "mgs" && solver_->ir()->orth_option() != "cgs2" && + solver_->ir()->orth_option() != "mgs_two_synch" && solver_->ir()->orth_option() != "mgs_pm") { + nlp_->log->printf( + hovWarning, + "mgs option : %s is wrong. Use 'mgs', 'cgs2', 'mgs_two_synch' or 'mgs_pm'. Switching to default (mgs) ...\n", + use_ir.c_str()); + solver_->ir()->orth_option() = "mgs"; + } + + solver_->ir()->conv_cond() = nlp_->options->GetInteger("ir_inner_conv_cond"); + + if((solver_->ir()->conv_cond() < 0) || (solver_->ir()->conv_cond() > 2)) { + nlp_->log->printf(hovWarning, + "Wrong IR convergence condition: %d. Use int value: 0, 1 or 2. Setting default (0) ...\n", + solver_->ir()->conv_cond()); + solver_->ir()->conv_cond() = 0; + } + + } else { + nlp_->log->printf(hovWarning, "Currently, inner iterative refinement works ONLY with cuSolverRf ... \n"); + use_ir = "no"; + solver_->disable_iterative_refinement(); + } + } + } +#endif + + solver_->use_ir() = use_ir; + nlp_->log->printf(hovSummary, "Use IR: %s\n", solver_->use_ir().c_str()); +} // constructor + +hiopLinSolverSymSparseEVLOSER::~hiopLinSolverSymSparseEVLOSER() +{ + delete solver_; + delete M_host_; + M_host_ = nullptr; + + // Delete CSR <--> triplet mappings + delete[] index_convert_CSR2Triplet_host_; + delete[] index_convert_extra_Diag2CSR_host_; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(index_convert_CSR2Triplet_device_ != nullptr) { + checkGpuErrors(evloserGpuFree(index_convert_CSR2Triplet_device_)); + } + if(index_convert_extra_Diag2CSR_device_ != nullptr) { + checkGpuErrors(evloserGpuFree(index_convert_extra_Diag2CSR_device_)); + } +#endif +} + +int hiopLinSolverSymSparseEVLOSER::matrixChanged() +{ + assert(n_ == M_->n() && M_->n() == M_->m()); + assert(n_ > 0); + + nlp_->runStats.linsolv.tmFactTime.start(); + + if(is_first_call_) { + firstCall(); + } else { + update_matrix_values(); + } + + if(factorizationSetupSucc_ == 0) { + int retval = solver_->factorize(); + if(retval == -1) { + nlp_->log->printf(hovWarning, "Numeric klu factorization failed. Regularizing ...\n"); + // This is not a catastrophic failure + // The matrix is singular so return -1 to regularaize! + return -1; + } else { // Numeric was succesfull so now can set up + solver_->setup_refactorization(); + factorizationSetupSucc_ = 1; + nlp_->log->printf(hovScalars, "Numeric klu factorization succesful! \n"); + } + } else { // factorizationSetupSucc_ == 1 + solver_->refactorize(); + } + + nlp_->runStats.linsolv.tmFactTime.stop(); + return 0; +} + +bool hiopLinSolverSymSparseEVLOSER::solve(hiopVector& x) +{ + assert(n_ == M_->n() && M_->n() == M_->m()); + assert(n_ > 0); + assert(x.get_size() == M_->n()); + + nlp_->runStats.linsolv.tmTriuSolves.start(); + + // Set IR tolerance + double ir_tol = nlp_->options->GetNumeric("ir_inner_tol"); + double* dx = x.local_data(); + bool retval = solver_->triangular_solve(dx, ir_tol); + if(!retval) { + nlp_->log->printf(hovError, // catastrophic failure + "EVLOSER triangular solve failed\n"); + } + + nlp_->runStats.linsolv.tmTriuSolves.stop(); + return retval; +} + +void hiopLinSolverSymSparseEVLOSER::firstCall() +{ + assert(n_ == M_->n() && M_->n() == M_->m()); + assert(n_ > 0); + + // Device execution requires a host copy for the initial KLU factorization. +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ != EVLOSER::ExecutionMode::CPU) { + checkGpuErrors( + evloserGpuMemcpy(M_host_->M(), M_->M(), sizeof(double) * M_->numberOfNonzeros(), evloserMemcpyDeviceToHost)); + checkGpuErrors(evloserGpuMemcpy(M_host_->i_row(), + M_->i_row(), + sizeof(index_type) * M_->numberOfNonzeros(), + evloserMemcpyDeviceToHost)); + checkGpuErrors(evloserGpuMemcpy(M_host_->j_col(), + M_->j_col(), + sizeof(index_type) * M_->numberOfNonzeros(), + evloserMemcpyDeviceToHost)); + } +#endif + // Transfer triplet to CSR form + + // Allocate row pointers and compute number of nonzeros. + solver_->mat_A_csr()->allocate_size(n_); + compute_nnz(); + solver_->set_nnz(nnz_); + + // Allocate column indices and matrix values + solver_->mat_A_csr()->allocate_nnz(nnz_); + + // Set column indices and matrix values. + set_csr_indices_values(); + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ != EVLOSER::ExecutionMode::CPU) { + // Copy matrix to device + solver_->mat_A_csr()->update_from_host_mirror(); + + if(solver_->use_ir() == "yes") { + solver_->setup_iterative_refinement_matrix(n_, nnz_); + } + } +#endif + /* + * initialize matrix factorization + */ + if(solver_->setup_factorization() < 0) { + nlp_->log->printf(hovError, // catastrophic failure + "Symbolic factorization failed!\n"); + return; + }; + is_first_call_ = false; +} + +/// nnz_ is number of nonzeros in CSR matrix +/// M_->numberOfNonzeros() is number of zeros in symmetric triplet matrix +void hiopLinSolverSymSparseEVLOSER::update_matrix_values() +{ +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) + if(execution_mode_ == EVLOSER::ExecutionMode::CUDA) { + double* csr_vals = solver_->mat_A_csr()->device_vals(); + double* coo_vals = M_->M(); + int coo_nnz = M_->numberOfNonzeros(); + + const int blocksize = 512; + int gridsize = (nnz_ + blocksize - 1) / blocksize; + evloser_map_arrays_kernel + <<>>(csr_vals, coo_vals, index_convert_CSR2Triplet_device_, nnz_); + + gridsize = (n_ + blocksize - 1) / blocksize; + evloser_add_to_array_kernel + <<>>(csr_vals, coo_vals, index_convert_extra_Diag2CSR_device_, n_, coo_nnz); + + // If factorization was not successful, we need a copy of values on the host + if(factorizationSetupSucc_ == 0) { + checkGpuErrors(evloserGpuMemcpy(solver_->mat_A_csr()->host_vals(), + solver_->mat_A_csr()->device_vals(), + sizeof(double) * nnz_, + evloserMemcpyDeviceToHost)); + + } + + return; + } +#endif + + hiopMatrixSparse* matrix = M_; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ != EVLOSER::ExecutionMode::CPU) { + checkGpuErrors(evloserGpuMemcpy(M_host_->M(), + M_->M(), + sizeof(double) * M_->numberOfNonzeros(), + evloserMemcpyDeviceToHost)); + matrix = M_host_; + } +#endif + double* vals = solver_->mat_A_csr()->host_vals(); + + for(int k = 0; k < nnz_; ++k) { + vals[k] = matrix->M()[index_convert_CSR2Triplet_host_[k]]; + } + + for(int i = 0; i < n_; ++i) { + if(index_convert_extra_Diag2CSR_host_[i] != -1) { + vals[index_convert_extra_Diag2CSR_host_[i]] += + matrix->M()[matrix->numberOfNonzeros() - n_ + i]; + } + } + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ != EVLOSER::ExecutionMode::CPU) { + solver_->mat_A_csr()->update_from_host_mirror(); + } +#endif +} + +/// @pre Data is either on the host or the host mirror is synced with the device +void hiopLinSolverSymSparseEVLOSER::compute_nnz() +{ + // + // compute nnz in each row + // + int* row_ptr = solver_->mat_A_csr()->host_irows(); + + hiopMatrixSparse* M_host = + execution_mode_ == EVLOSER::ExecutionMode::CPU ? M_ : M_host_; + + assert(M_host != nullptr); + + // off-diagonal part + row_ptr[0] = 0; + for(int k = 0; k < M_host->numberOfNonzeros() - n_; k++) { + if(M_host->i_row()[k] != M_host->j_col()[k]) { + row_ptr[M_host->i_row()[k] + 1]++; + row_ptr[M_host->j_col()[k] + 1]++; + nnz_ += 2; + } + } + // diagonal part + for(int i = 0; i < n_; i++) { + row_ptr[i + 1]++; + nnz_ += 1; + } + // get correct row ptr index + for(int i = 1; i < n_ + 1; i++) { + row_ptr[i] += row_ptr[i - 1]; + } + assert(nnz_ == row_ptr[n_]); +} + +/// @pre Data is either on the host or the host mirror is synced with the device +void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() +{ + hiopMatrixSparse* M_host = + execution_mode_ == EVLOSER::ExecutionMode::CPU ? M_ : M_host_; + + assert(M_host != nullptr); + + // + // set correct col index and value + // + const int* row_ptr = solver_->mat_A_csr()->host_irows(); + int* col_idx = solver_->mat_A_csr()->host_jcols(); + double* vals = solver_->mat_A_csr()->host_vals(); + + index_convert_CSR2Triplet_host_ = new int[nnz_]; + index_convert_extra_Diag2CSR_host_ = new int[n_]; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) + if(execution_mode_ == EVLOSER::ExecutionMode::CUDA) { + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_CSR2Triplet_device_), nnz_ * sizeof(int))); + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_extra_Diag2CSR_device_), n_ * sizeof(int))); + } +#endif + int* nnz_each_row_tmp = new int[n_]{0}; + int total_nnz_tmp{0}, nnz_tmp{0}, rowID_tmp, colID_tmp; + + for(int k = 0; k < n_; k++) { + index_convert_extra_Diag2CSR_host_[k] = -1; + } + + for(int k = 0; k < M_host->numberOfNonzeros() - n_; k++) { + rowID_tmp = M_host->i_row()[k]; + colID_tmp = M_host->j_col()[k]; + if(rowID_tmp == colID_tmp) { + nnz_tmp = nnz_each_row_tmp[rowID_tmp] + row_ptr[rowID_tmp]; + col_idx[nnz_tmp] = colID_tmp; + vals[nnz_tmp] = M_host->M()[k]; + index_convert_CSR2Triplet_host_[nnz_tmp] = k; + + vals[nnz_tmp] += M_host->M()[M_host->numberOfNonzeros() - n_ + rowID_tmp]; + index_convert_extra_Diag2CSR_host_[rowID_tmp] = nnz_tmp; + + nnz_each_row_tmp[rowID_tmp]++; + total_nnz_tmp++; + } else { + nnz_tmp = nnz_each_row_tmp[rowID_tmp] + row_ptr[rowID_tmp]; + col_idx[nnz_tmp] = colID_tmp; + vals[nnz_tmp] = M_host->M()[k]; + index_convert_CSR2Triplet_host_[nnz_tmp] = k; + + nnz_tmp = nnz_each_row_tmp[colID_tmp] + row_ptr[colID_tmp]; + col_idx[nnz_tmp] = rowID_tmp; + vals[nnz_tmp] = M_host->M()[k]; + index_convert_CSR2Triplet_host_[nnz_tmp] = k; + + nnz_each_row_tmp[rowID_tmp]++; + nnz_each_row_tmp[colID_tmp]++; + total_nnz_tmp += 2; + } + } + // correct the missing dia_gonal term + for(int i = 0; i < n_; i++) { + if(nnz_each_row_tmp[i] != row_ptr[i + 1] - row_ptr[i]) { + assert(nnz_each_row_tmp[i] == row_ptr[i + 1] - row_ptr[i] - 1); + nnz_tmp = nnz_each_row_tmp[i] + row_ptr[i]; + col_idx[nnz_tmp] = i; + vals[nnz_tmp] = M_host->M()[M_host->numberOfNonzeros() - n_ + i]; + index_convert_CSR2Triplet_host_[nnz_tmp] = M_host->numberOfNonzeros() - n_ + i; + total_nnz_tmp += 1; + + std::vector ind_temp(row_ptr[i + 1] - row_ptr[i]); + std::iota(ind_temp.begin(), ind_temp.end(), 0); + std::sort(ind_temp.begin(), ind_temp.end(), [&](int a, int b) { + return col_idx[a + row_ptr[i]] < col_idx[b + row_ptr[i]]; + }); + + reorder(vals + row_ptr[i], ind_temp, row_ptr[i + 1] - row_ptr[i]); + reorder(index_convert_CSR2Triplet_host_ + row_ptr[i], ind_temp, row_ptr[i + 1] - row_ptr[i]); + std::sort(col_idx + row_ptr[i], col_idx + row_ptr[i + 1]); + } + } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) + if(execution_mode_ == EVLOSER::ExecutionMode::CUDA) { + checkGpuErrors(evloserGpuMemcpy(index_convert_CSR2Triplet_device_, + index_convert_CSR2Triplet_host_, + nnz_ * sizeof(int), + evloserMemcpyHostToDevice)); + checkGpuErrors(evloserGpuMemcpy(index_convert_extra_Diag2CSR_device_, + index_convert_extra_Diag2CSR_host_, + n_ * sizeof(int), + evloserMemcpyHostToDevice)); + } +#endif + delete[] nnz_each_row_tmp; +} + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +// Error checking utility for CUDA +// KS: might later become part of src/Utils, putting it here for now +template +void hiopLinSolverSymSparseEVLOSER::hiopCheckGpuError(T result, const char* const file, int const line) +{ + if(result) { + nlp_->log->printf(hovError, "GPU error at %s:%d, error# %d\n", file, line, result); + assert(false); + } +} +#endif +} // namespace hiop diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp new file mode 100644 index 0000000..849acae --- /dev/null +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp @@ -0,0 +1,166 @@ +// +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file hiopLinSolverSparseEVLOSER.hpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#ifndef HIOP_LINSOLVER_EVLOSER +#define HIOP_LINSOLVER_EVLOSER + +#include "hiopLinSolver.hpp" +#include "hiopMatrixSparseTriplet.hpp" +#include "EVLOSER/evloser_execution_mode.hpp" +#include + +/** Implements the sparse linear solver class using the EVLOSER interface + * to the embedded EVLOSER backend. + * + * @ingroup LinearSolvers + */ + +namespace EVLOSER +{ +// Forward declaration of inner IR class +class IterativeRefinement; +class MatrixCsr; +class RefactorizationSolver; +} // namespace EVLOSER + +namespace hiop +{ + +class hiopLinSolverSymSparseEVLOSER : public hiopLinSolverSymSparse +{ +public: + // constructor + hiopLinSolverSymSparseEVLOSER(const int& n, const int& nnz, hiopNlpFormulation* nlp); + virtual ~hiopLinSolverSymSparseEVLOSER(); + + /** + * @brief Triggers a refactorization of the matrix, if necessary. + * Overload from base class. + * In this case, KLU (SuiteSparse) is used to refactor + */ + virtual int matrixChanged(); + + /** + * @brief Solves a linear system. + * + * @param x is on entry the right hand side(s) of the system to be solved. + * + * @post On exit `x` is overwritten with the solution(s). + */ + virtual bool solve(hiopVector& x_); + + /** Multiple rhs not supported yet */ + virtual bool solve(hiopMatrix& /* x */) + { + assert(false && "not yet supported"); + return false; + } + +protected: + const EVLOSER::ExecutionMode execution_mode_; ///< Selected EVLOSER execution path + EVLOSER::RefactorizationSolver* solver_; + + int m_; ///< number of rows of the whole matrix + int n_; ///< number of cols of the whole matrix + int nnz_; ///< number of nonzeros in the matrix + + // Mapping on the host + int* index_convert_CSR2Triplet_host_; + int* index_convert_extra_Diag2CSR_host_; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + // Mapping on the device + int* index_convert_CSR2Triplet_device_; + int* index_convert_extra_Diag2CSR_device_; +#endif + + // Algorithm control flags + int factorizationSetupSucc_; + bool is_first_call_; + + hiopMatrixSparse* M_host_{nullptr}; ///< Host mirror for the KKT matrix + + /** called the very first time a matrix is factored. Perform KLU + * factorization, allocate all aux variables + * + * @note Converts HiOp triplet matrix to CSR format. + */ + virtual void firstCall(); + + /** + * @brief Updates matrix values from HiOp object. + * + * @note This function maps data from HiOp supplied matrix M_ to data structures + * used by the linear solver. + */ + void update_matrix_values(); + + /** Function to compute nnz and set row pointers */ + void compute_nnz(); + /** Function to compute column indices and matrix values arrays */ + void set_csr_indices_values(); + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + template + void hiopCheckGpuError(T result, const char* const file, int const line); +#endif +}; + +} // namespace hiop + +#endif diff --git a/src/Optimization/hiopDualsUpdater.cpp b/src/Optimization/hiopDualsUpdater.cpp index 0ec4bbf..a66ec5a 100644 --- a/src/Optimization/hiopDualsUpdater.cpp +++ b/src/Optimization/hiopDualsUpdater.cpp @@ -71,9 +71,13 @@ #ifdef HIOP_USE_PARDISO #include "hiopLinSolverSparsePARDISO.hpp" #endif -#ifdef HIOP_USE_RESOLVE +#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) #include "hiopLinSolverSparseReSolve.hpp" #endif + +#ifdef HIOP_USE_EVLOSER +#include "hiopLinSolverSparseEVLOSER.hpp" +#endif #ifdef HIOP_USE_GINKGO #include "hiopLinSolverSparseGinkgo.hpp" #endif @@ -375,6 +379,20 @@ bool hiopDualsLsqUpdateLinsysAugSparse::instantiate_linear_solver(const char* li // compute mode CPU ///////////////////////////////////////////////////////////////////////////////////////// assert(nullptr == lin_sys_); +#ifdef HIOP_USE_EVLOSER + if(linear_solver == "evloser") { + if(fact_acceptor == "inertia_correction") { + nlp_->log->printf(hovError, + "LSQ linear solver with EVLOSER does not support inertia correction. " + "Please set option 'fact_acceptor' to 'inertia_free'.\n"); + assert(false); + return false; + } + + ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: EVLOSER on CPU "; + lin_sys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + } +#endif // HIOP_USE_EVLOSER if(linear_solver == "ma57" || linear_solver == "auto") { #ifdef HIOP_USE_COINHSL ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: MA57 "; @@ -426,25 +444,35 @@ bool hiopDualsLsqUpdateLinsysAugSparse::instantiate_linear_solver(const char* li // Under gpu compute_mode, which is work in progress, the initialization should be done only using // GPU sparse linear solvers. -#ifdef HIOP_USE_RESOLVE +#if defined(HIOP_USE_RESOLVE) || defined(HIOP_USE_EVLOSER) if(compute_mode == "gpu") { - assert((linear_solver == "resolve" || linear_solver == "auto") && + assert((linear_solver == "resolve" || linear_solver == "evloser" || linear_solver == "auto") && "the value for duals_init_linear_solver_sparse is invalid and should have been corrected during " "options processing"); } if(fact_acceptor == "inertia_correction") { nlp_->log->printf(hovError, - "LSQ linear solver with ReSolve does not support inertia correction. " + "LSQ linear solver with ReSolve or EVLOSER does not support inertia correction. " "Please set option 'fact_acceptor' to 'inertia_free'.\n"); assert(false); return false; } - // This is our first choice on the device. + +#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) + // ReSolve remains the first choice when it is available. if(linear_solver == "resolve" || linear_solver == "auto") { ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: ReSolve "; lin_sys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); } -#else // of #ifdef HIOP_USE_RESOLVE +#endif + +#ifdef HIOP_USE_EVLOSER + if(nullptr == lin_sys_ && (linear_solver == "evloser" || linear_solver == "auto")) { + ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: EVLOSER "; + lin_sys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + } +#endif +#else // no ReSolve or EVLOSER support // under compute mode gpu, at this point we don't have a sparse linear solver if(compute_mode == "gpu") { if(linear_solver == "auto") { diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp index e9af09d..332887f 100644 --- a/src/Optimization/hiopKKTLinSysSparse.cpp +++ b/src/Optimization/hiopKKTLinSysSparse.cpp @@ -57,9 +57,13 @@ #ifdef HIOP_USE_PARDISO #include "hiopLinSolverSparsePARDISO.hpp" #endif -#ifdef HIOP_USE_RESOLVE +#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) #include "hiopLinSolverSparseReSolve.hpp" #endif + +#ifdef HIOP_USE_EVLOSER +#include "hiopLinSolverSparseEVLOSER.hpp" +#endif #ifdef HIOP_USE_GINKGO #include "hiopLinSolverSparseGinkgo.hpp" #endif @@ -310,6 +314,22 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXYcYd::determineAndCreateLi //////////////////////////////////////////////////////////////////////////////////////////////// assert(nullptr == linSys_); +#ifdef HIOP_USE_EVLOSER + if(linear_solver == "evloser") { + linsol_actual = "EVLOSER"; + linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + + auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); + if(fact_acceptor_ic) { + nlp_->log->printf(hovError, + "KKT_SPARSE_XYcYd linsys with EVLOSER does not support inertia correction. " + "Please set option 'fact_acceptor' to 'inertia_free'.\n"); + assert(false); + return nullptr; + } + } +#endif // HIOP_USE_EVLOSER + if(linear_solver == "ma57" || linear_solver == "auto") { #ifdef HIOP_USE_COINHSL linsol_actual = "MA57"; @@ -363,7 +383,10 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXYcYd::determineAndCreateLi if((nullptr == linSys_ && linear_solver == "auto") || linear_solver == "resolve") { #if defined(HIOP_USE_RESOLVE) + // Only build the ReSolve solver object when CUDA is enabled. +#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) linSys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); +#endif linsol_actual = "ReSolve"; auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); if(fact_acceptor_ic) { @@ -376,6 +399,22 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXYcYd::determineAndCreateLi #endif } + // EVLOSER has its own solver object but uses this same sparse KKT selection point. + if(nullptr == linSys_ && linear_solver == "evloser") { +#if defined(HIOP_USE_EVLOSER) + linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + linsol_actual = "EVLOSER"; + auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); + if(fact_acceptor_ic) { + nlp_->log->printf(hovError, + "KKT_SPARSE_XYcYd linsys with EVLOSER does not support inertia correction. " + "Please set option 'fact_acceptor' to 'inertia_free'.\n"); + assert(false); + return nullptr; + } +#endif + } + if((nullptr == linSys_ && linear_solver == "auto") || linear_solver == "strumpack") { #if defined(HIOP_USE_STRUMPACK) linSys_ = new hiopLinSolverSymSparseSTRUMPACK(n, nnz, nlp_); @@ -686,7 +725,23 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL ///////////////////////////////////////////////////////////////////////////////////////////// // CPU compute mode ///////////////////////////////////////////////////////////////////////////////////////////// - if(linear_solver == "ma57" || linear_solver == "auto") { + +#ifdef HIOP_USE_EVLOSER + if(linear_solver == "evloser") { + actual_lin_solver = "EVLOSER"; + linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + + auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); + if(fact_acceptor_ic) { + nlp_->log->printf(hovError, + "KKT_SPARSE_XDYcYd linsys with EVLOSER does not support inertia correction. " + "Please set option 'fact_acceptor' to 'inertia_free'.\n"); + assert(false); + return nullptr; + } + } +#endif // HIOP_USE_EVLOSER +if(linear_solver == "ma57" || linear_solver == "auto") { #ifdef HIOP_USE_COINHSL linSys_ = new hiopLinSolverSymSparseMA57(n, nnz, nlp_); actual_lin_solver = "MA57"; @@ -745,7 +800,10 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL if(linear_solver == "resolve" || linear_solver == "auto") { #if defined(HIOP_USE_RESOLVE) actual_lin_solver = "ReSolve"; + // Only build the ReSolve solver object when CUDA is enabled. +#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) linSys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); +#endif auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); if(fact_acceptor_ic) { nlp_->log->printf(hovError, @@ -757,6 +815,22 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL #endif } // end resolve + // EVLOSER has its own solver object but uses this same sparse KKT selection point. + if(nullptr == linSys_ && linear_solver == "evloser") { +#if defined(HIOP_USE_EVLOSER) + actual_lin_solver = "EVLOSER"; + linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); + if(fact_acceptor_ic) { + nlp_->log->printf(hovError, + "KKT_SPARSE_XDYcYd linsys with EVLOSER does not support inertia correction. " + "Please set option 'fact_acceptor' to 'inertia_free'.\n"); + assert(false); + return nullptr; + } +#endif + } // end evloser + if(nullptr == linSys_ && (linear_solver == "strumpack" || linear_solver == "auto")) { #if defined(HIOP_USE_STRUMPACK) actual_lin_solver = "STRUMPACK"; @@ -816,7 +890,10 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL if(linear_solver == "resolve" || linear_solver == "auto") { #if defined(HIOP_USE_RESOLVE) + // Only build the ReSolve solver object when CUDA is enabled. +#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) linSys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); +#endif nlp_->log->printf(hovScalars, "KKT_SPARSE_XDYcYd linsys: alloc ReSolve size %d (%d cons) (gpu)\n", n, neq + nineq); auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); if(fact_acceptor_ic) { @@ -828,6 +905,23 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL } #endif } // end resolve + + // EVLOSER has its own solver object but uses this same sparse KKT selection point. + if(nullptr == linSys_ && linear_solver == "evloser") { +#if defined(HIOP_USE_EVLOSER) + linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + nlp_->log->printf(hovScalars, "KKT_SPARSE_XDYcYd linsys: alloc EVLOSER size %d (%d cons) (gpu)\n", n, neq + nineq); + auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); + if(fact_acceptor_ic) { + nlp_->log->printf(hovError, + "KKT_SPARSE_XDYcYd linsys with EVLOSER does not support inertia correction. " + "Please set option 'fact_acceptor' to 'inertia_free'.\n"); + assert(false); + return nullptr; + } +#endif + } + } // end of compute mode gpu } assert(linSys_ && "KKT_SPARSE_XDYcYd linsys: cannot instantiate backend linear solver"); diff --git a/src/Utils/hiopOptions.cpp b/src/Utils/hiopOptions.cpp index ec63f68..05c2481 100644 --- a/src/Utils/hiopOptions.cpp +++ b/src/Utils/hiopOptions.cpp @@ -922,12 +922,13 @@ void hiopOptionsNLP::register_options() // - 'gpu' compute mode: work in progress { - vector range{"auto", "ma57", "pardiso", "strumpack", "resolve", "ginkgo", "cusolver-chol"}; + // EVLOSER is a separate sparse solver option while the old ReSolve option stays available. + vector range{"auto", "ma57", "pardiso", "strumpack", "resolve", "evloser", "ginkgo", "cusolver-chol"}; register_str_option("linear_solver_sparse", "auto", range, - "Selects among MA57, PARDISO, STRUMPACK, cuSOLVER's Cholesky or LU, and GINKGO for the " + "Selects among MA57, PARDISO, STRUMPACK, ReSolve, EVLOSER, cuSOLVER's Cholesky or LU, and GINKGO for the " "sparse linear solves."); } @@ -936,12 +937,13 @@ void hiopOptionsNLP::register_options() // - when GPU mode is on, STRUMPACK is chosen by 'auto' if available // - choosing option ma57 or pardiso with GPU being on, it results in no device being used in the linear solve! { - vector range{"auto", "ma57", "pardiso", "resolve", "strumpack", "ginkgo"}; + // EVLOSER is also valid for dual initialization through the same sparse solver path. + vector range{"auto", "ma57", "pardiso", "resolve", "evloser", "strumpack", "ginkgo"}; register_str_option("duals_init_linear_solver_sparse", "auto", range, - "Selects among MA57, PARDISO, cuSOLVER, STRUMPACK, and GINKGO for the sparse linear solves."); + "Selects among MA57, PARDISO, ReSolve, EVLOSER, cuSOLVER, STRUMPACK, and GINKGO for the sparse linear solves."); } // choose hardware backend for the Ginkgo solver to run on. @@ -1402,7 +1404,7 @@ void hiopOptionsNLP::ensure_consistence() auto kkt_linsys = GetString("KKTLinsys"); auto sol_sp = GetString("linear_solver_sparse"); if(kkt_linsys == "full") { - if(sol_sp != "resolve" && sol_sp != "pardiso" && sol_sp != "strumpack" && sol_sp != "auto") { + if(sol_sp != "resolve" && sol_sp != "evloser" && sol_sp != "pardiso" && sol_sp != "strumpack" && sol_sp != "auto") { if(is_user_defined("linear_solver_sparse")) { log_printf(hovWarning, "The option 'linear_solver_sparse=%s' is not valid with option 'KKTLinsys=full'. " @@ -1425,6 +1427,30 @@ void hiopOptionsNLP::ensure_consistence() } } + +#ifndef HIOP_USE_EVLOSER + if(sol_sp == "evloser") { + if(is_user_defined("linear_solver_sparse")) { + log_printf(hovWarning, + "The option 'linear_solver_sparse=%s' is not valid because HiOp was built without EVLOSER support." + " Will use 'linear_solver_sparse=auto'.\n", + GetString("linear_solver_sparse").c_str()); + } + set_val("linear_solver_sparse", "auto"); + } + + if(GetString("duals_init_linear_solver_sparse") == "evloser") { + if(is_user_defined("duals_init_linear_solver_sparse")) { + log_printf( + hovWarning, + "The option 'duals_init_linear_solver_sparse=%s' is not valid because HiOp was built without EVLOSER support." + " Will use 'duals_init_linear_solver_sparse=auto'.\n", + GetString("duals_init_linear_solver_sparse").c_str()); + } + set_val("duals_init_linear_solver_sparse", "auto"); + } +#endif // HIOP_USE_EVLOSER + #ifndef HIOP_USE_CUDA if(sol_sp == "resolve" || sol_sp == "cusolver-chol") { if(is_user_defined("linear_solver_sparse")) { @@ -1559,7 +1585,9 @@ void hiopOptionsNLP::ensure_consistence() } set_val("fact_acceptor", "inertia_free"); } - } else if(GetString("linear_solver_sparse") == "strumpack" || GetString("linear_solver_sparse") == "resolve") { + // EVLOSER follows the same inertia-free fact_acceptor rule as the other sparse direct solvers. + } else if(GetString("linear_solver_sparse") == "strumpack" || GetString("linear_solver_sparse") == "resolve" || + GetString("linear_solver_sparse") == "evloser") { if(GetString("fact_acceptor") == "inertia_correction") { if(is_user_defined("fact_acceptor") && is_user_defined("linear_solver_sparse")) { log_printf(hovWarning, diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index bb4cc01..0d84551 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -71,3 +71,25 @@ target_link_libraries(test_pcg PRIVATE HiOp::HiOp) add_executable(test_bicgstab ${testBiCGStab_SRC}) target_link_libraries(test_bicgstab PRIVATE HiOp::HiOp) + +if(HIOP_USE_EVLOSER AND NOT HIOP_USE_GPU) + add_executable( + testEVLOSERKLUCPU + LinAlg/testEVLOSERKLUCPU.cpp + ) + + target_include_directories( + testEVLOSERKLUCPU + PRIVATE + ${PROJECT_SOURCE_DIR}/src/LinAlg/EVLOSER + ) + + + target_link_libraries( + testEVLOSERKLUCPU + PRIVATE + EVLOSER + KLU + ) + +endif() diff --git a/tests/LinAlg/testEVLOSERKLUCPU.cpp b/tests/LinAlg/testEVLOSERKLUCPU.cpp new file mode 100644 index 0000000..fa38fdb --- /dev/null +++ b/tests/LinAlg/testEVLOSERKLUCPU.cpp @@ -0,0 +1,427 @@ +#include "MatrixCsr.hpp" +#include "RefactorizationSolver.hpp" + +#include +#include +#include +#include +#include +#include + +namespace +{ + +constexpr double absolute_tolerance = 1e-10; +constexpr double relative_tolerance = 1e-8; + +bool nearly_equal(double actual, double expected) +{ + return std::abs(actual - expected) <= + absolute_tolerance + + relative_tolerance * std::max(std::abs(actual), std::abs(expected)); +} + +bool vectors_equal(const std::vector& actual, + const std::vector& expected) +{ + if(actual.size() != expected.size()) { + return false; + } + + for(std::size_t i = 0; i < actual.size(); ++i) { + if(!nearly_equal(actual[i], expected[i])) { + return false; + } + } + + return true; +} + +bool load_matrix(EVLOSER::RefactorizationSolver& solver, + int n, + const std::vector& rowptr, + const std::vector& colind, + const std::vector& values) +{ + if(static_cast(rowptr.size()) != n + 1 || + colind.size() != values.size()) { + return false; + } + + EVLOSER::MatrixCsr* matrix = solver.mat_A_csr(); + matrix->allocate_size(n); + matrix->allocate_nnz(static_cast(values.size())); + + std::copy(rowptr.begin(), rowptr.end(), matrix->host_irows()); + std::copy(colind.begin(), colind.end(), matrix->host_jcols()); + std::copy(values.begin(), values.end(), matrix->host_vals()); + + solver.set_nnz(static_cast(values.size())); + solver.ordering() = 1; + solver.fact() = "klu"; + solver.refact() = "klu"; + solver.use_ir() = "no"; + solver.set_silent_output(false); + + return true; +} + +bool factorize(EVLOSER::RefactorizationSolver& solver) +{ + return solver.setup_factorization() == 0 && + solver.factorize() == 0; +} + +bool test_analyze_factor_solve() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 5, 7}; + const std::vector colind{0, 1, 0, 1, 2, 1, 2}; + const std::vector values{4.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0}; + + if(!load_matrix(solver, 3, rowptr, colind, values) || + !factorize(solver)) { + return false; + } + + std::vector rhs{6.0, 10.0, 8.0}; + + return solver.triangular_solve(rhs.data(), 0.0) && + vectors_equal(rhs, {1.0, 2.0, 3.0}); +} + +bool test_repeated_solve() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 5, 7}; + const std::vector colind{0, 1, 0, 1, 2, 1, 2}; + const std::vector values{4.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0}; + + if(!load_matrix(solver, 3, rowptr, colind, values) || + !factorize(solver)) { + return false; + } + + std::vector first_rhs{6.0, 10.0, 8.0}; + if(!solver.triangular_solve(first_rhs.data(), 0.0) || + !vectors_equal(first_rhs, {1.0, 2.0, 3.0})) { + return false; + } + + std::vector second_rhs{-3.5, 2.5, 4.5}; + + return solver.triangular_solve(second_rhs.data(), 0.0) && + vectors_equal(second_rhs, {-1.0, 0.5, 2.0}); +} + +bool test_value_only_refactor() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 5, 7}; + const std::vector colind{0, 1, 0, 1, 2, 1, 2}; + const std::vector initial_values{4.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0}; + const std::vector updated_values{5.0, 1.0, 1.0, 4.0, 1.0, 1.0, 3.0}; + + if(!load_matrix(solver, 3, rowptr, colind, initial_values) || + !factorize(solver)) { + return false; + } + + std::copy(updated_values.begin(), + updated_values.end(), + solver.mat_A_csr()->host_vals()); + + solver.setup_refactorization(); + + if(solver.refactorize() != 0) { + return false; + } + + std::vector rhs{7.0, 12.0, 11.0}; + + return solver.triangular_solve(rhs.data(), 0.0) && + vectors_equal(rhs, {1.0, 2.0, 3.0}); +} + +bool test_singular_factorization() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 1, 2, 3}; + const std::vector colind{0, 1, 2}; + const std::vector values{1.0, 0.0, 1.0}; + + return load_matrix(solver, 3, rowptr, colind, values) && + solver.setup_factorization() == 0 && + solver.factorize() != 0; +} + +bool test_invalid_csr_structure() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 1, 3}; + const std::vector colind{0, 1, 2}; + const std::vector values{1.0, 1.0, 1.0}; + + return load_matrix(solver, 3, rowptr, colind, values) && + solver.setup_factorization() != 0; +} + +bool test_null_rhs() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 1, 2, 3}; + const std::vector colind{0, 1, 2}; + const std::vector values{2.0, 3.0, 4.0}; + + return load_matrix(solver, 3, rowptr, colind, values) && + factorize(solver) && + !solver.triangular_solve(nullptr, 0.0); +} + +bool test_nonfinite_matrix_values() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 1, 2, 3}; + const std::vector colind{0, 1, 2}; + const std::vector values{ + 1.0, + std::numeric_limits::quiet_NaN(), + 1.0}; + + return load_matrix(solver, 3, rowptr, colind, values) && + solver.setup_factorization() != 0; +} + +bool test_nonfinite_solution() +{ + EVLOSER::RefactorizationSolver solver(1, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 1}; + const std::vector colind{0}; + const std::vector values{1e-200}; + + if(!load_matrix(solver, 1, rowptr, colind, values) || + !factorize(solver)) { + return false; + } + + std::vector rhs{1e200}; + + return !solver.triangular_solve(rhs.data(), 0.0); +} + + +bool test_refactor_accepted() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 5, 7}; + const std::vector colind{0, 1, 0, 1, 2, 1, 2}; + const std::vector initial_values{ + 4.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0}; + const std::vector updated_values{ + 5.0, 1.0, 1.0, 4.0, 1.0, 1.0, 3.0}; + + if(!load_matrix(solver, 3, rowptr, colind, initial_values) || + !factorize(solver)) { + return false; + } + + std::copy(updated_values.begin(), + updated_values.end(), + solver.mat_A_csr()->host_vals()); + + solver.setup_refactorization(); + + if(solver.refactorize() != 0) { + return false; + } + + std::vector rhs{7.0, 12.0, 11.0}; + + return solver.triangular_solve(rhs.data(), 0.0) && + vectors_equal(rhs, {1.0, 2.0, 3.0}) && + solver.last_klu_recovery_action() == + EVLOSER::RefactorizationSolver:: + KluRecoveryAction::RefactorAccepted; +} + +bool test_refactor_retained_after_comparison() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 5, 7}; + const std::vector colind{0, 1, 0, 1, 2, 1, 2}; + const std::vector initial_values{ + 4.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0}; + const std::vector updated_values{ + 5.0, 1.0, 1.0, 4.0, 1.0, 1.0, 3.0}; + + if(!load_matrix(solver, 3, rowptr, colind, initial_values) || + !factorize(solver)) { + return false; + } + + std::copy(updated_values.begin(), + updated_values.end(), + solver.mat_A_csr()->host_vals()); + + solver.setup_refactorization(); + + /* + * Force comparison with a fresh numeric factorization. Since both + * candidates should have comparable residuals, retain the refactored + * factors. + */ + solver.klu_suspicious_residual_threshold() = -1.0; + + if(solver.refactorize() != 0) { + return false; + } + + std::vector rhs{7.0, 12.0, 11.0}; + + return solver.triangular_solve(rhs.data(), 0.0) && + vectors_equal(rhs, {1.0, 2.0, 3.0}) && + solver.last_klu_recovery_action() == + EVLOSER::RefactorizationSolver:: + KluRecoveryAction::RefactorRetained; +} + +bool test_failed_refactor_recovered_by_full_factorization() +{ + EVLOSER::RefactorizationSolver solver(2, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 4}; + const std::vector colind{0, 1, 0, 1}; + + /* + * The updated matrix retains the sparsity pattern but invalidates the + * previous numerical pivot. A fresh factorization can choose a new pivot. + */ + const std::vector initial_values{ + 10.0, 1.0, + 1.0, 1.0}; + + const std::vector updated_values{ + 0.0, 1.0, + 1.0, 10.0}; + + if(!load_matrix(solver, 2, rowptr, colind, initial_values) || + !factorize(solver)) { + return false; + } + + std::copy(updated_values.begin(), + updated_values.end(), + solver.mat_A_csr()->host_vals()); + + solver.setup_refactorization(); + + /* + * Refactorization failure is recoverable, so refactorize() permits the + * subsequent solve to attempt a fresh numeric factorization. + */ + if(solver.refactorize() != 0) { + return false; + } + + std::vector rhs{-1.0, -8.0}; + + return solver.triangular_solve(rhs.data(), 0.0) && + vectors_equal(rhs, {2.0, -1.0}) && + solver.last_klu_recovery_action() == + EVLOSER::RefactorizationSolver:: + KluRecoveryAction::FullFactorAccepted; +} + +bool test_unrecoverable_refactor_failure() +{ + EVLOSER::RefactorizationSolver solver(2, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 4}; + const std::vector colind{0, 1, 0, 1}; + const std::vector initial_values{ + 10.0, 1.0, + 1.0, 1.0}; + const std::vector singular_values{ + 1.0, 1.0, + 1.0, 1.0}; + + if(!load_matrix(solver, 2, rowptr, colind, initial_values) || + !factorize(solver)) { + return false; + } + + std::copy(singular_values.begin(), + singular_values.end(), + solver.mat_A_csr()->host_vals()); + + solver.setup_refactorization(); + + if(solver.refactorize() != 0) { + return false; + } + + std::vector rhs{2.0, 2.0}; + + return !solver.triangular_solve(rhs.data(), 0.0) && + solver.last_klu_recovery_action() == + EVLOSER::RefactorizationSolver:: + KluRecoveryAction::Failed; +} + +} // namespace + +int main() +{ + struct TestCase + { + const char* name; + bool (*run)(); + }; + + const TestCase tests[] = { + {"analyze/factor/solve", test_analyze_factor_solve}, + {"repeated solve", test_repeated_solve}, + {"value-only refactor", test_value_only_refactor}, + {"singular factorization", test_singular_factorization}, + {"invalid CSR structure", test_invalid_csr_structure}, + {"null RHS", test_null_rhs}, + {"non-finite matrix values", test_nonfinite_matrix_values}, + {"non-finite solution", test_nonfinite_solution}, + {"refactor accepted", test_refactor_accepted}, + {"refactor retained after fresh comparison", + test_refactor_retained_after_comparison}, + {"failed refactor recovered by full factorization", + test_failed_refactor_recovered_by_full_factorization}, + {"unrecoverable refactor failure", + test_unrecoverable_refactor_failure}, + }; + + int failures = 0; + + for(const TestCase& test : tests) { + const bool passed = test.run(); + std::cout << (passed ? "[PASS] " : "[FAIL] ") << test.name << "\n"; + + if(!passed) { + ++failures; + } + } + + if(failures != 0) { + std::cout << failures << " EVLOSER KLU CPU test(s) failed.\n"; + return 1; + } + + std::cout << "All EVLOSER KLU CPU tests passed.\n"; + return 0; +}