From 03f3c2af96e8789cf09407b1567ac6af80746bd6 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:00 -0400 Subject: [PATCH 01/28] Add EVLOSER sparse solver scaffold --- src/LinAlg/CMakeLists.txt | 7 + src/LinAlg/hiopLinSolverSparseEVLOSER.cpp | 559 ++++++++++++++++++++++ src/LinAlg/hiopLinSolverSparseEVLOSER.hpp | 161 +++++++ 3 files changed, 727 insertions(+) create mode 100644 src/LinAlg/hiopLinSolverSparseEVLOSER.cpp create mode 100644 src/LinAlg/hiopLinSolverSparseEVLOSER.hpp diff --git a/src/LinAlg/CMakeLists.txt b/src/LinAlg/CMakeLists.txt index 8710490..6a2a3d9 100644 --- a/src/LinAlg/CMakeLists.txt +++ b/src/LinAlg/CMakeLists.txt @@ -12,6 +12,7 @@ set(hiopLinAlg_INTERFACE_HEADERS hiopLinSolverSparseSTRUMPACK.hpp hiopLinSolverSparsePARDISO.hpp hiopLinSolverSparseReSolve.hpp + hiopLinSolverSparseEVLOSER.hpp hiopLinSolverUMFPACKZ.hpp hiopLinSolverCholCuSparse.hpp hiopMatrix.hpp @@ -101,6 +102,10 @@ set(hiopLinAlg_CUSOLVER_LU_SRC hiopLinSolverSparseReSolve.cpp ) +set(hiopLinAlg_EVLOSER_SRC + hiopLinSolverSparseEVLOSER.cpp +) + set(hiopLinAlg_CUSOLVER_CHOL_SRC hiopLinSolverCholCuSparse.cpp ) @@ -151,7 +156,9 @@ if(HIOP_SPARSE) if(HIOP_USE_RESOLVE) add_subdirectory(ReSolve) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_CUSOLVER_LU_SRC}) + list(APPEND hiopLinAlg_SRC ${hiopLinAlg_EVLOSER_SRC}) set_source_files_properties(${hiopLinAlg_CUSOLVER_LU_SRC} PROPERTIES LANGUAGE CUDA) + set_source_files_properties(${hiopLinAlg_EVLOSER_SRC} PROPERTIES LANGUAGE CUDA) endif(HIOP_USE_RESOLVE) if(HIOP_USE_CUDA) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_CUSOLVER_CHOL_SRC}) diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp new file mode 100644 index 0000000..2ae0384 --- /dev/null +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp @@ -0,0 +1,559 @@ +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file hiopLinSolverSparseReSolve.cpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#include "hiopLinSolverSparseEVLOSER.hpp" +#include +#include +#include + +#include "hiop_blasdefs.hpp" +#include "KrylovSolverKernels.h" + +#include "cusparse_v2.h" +#include +#include + +#define checkCudaErrors(val) hiopCheckCudaError((val), __FILE__, __LINE__) + +/** + * @brief Map elements of one array to the other + * + * for(int k = 0; k < nnz_; k++) { + * vals[k] = M_->M()[index_convert_CSR2Triplet_host_[k]]; + * } + * + */ +template +__global__ void mapArraysKernelEVLOSER(T* dst, const T* src, const I* mapidx, I n) +{ + I tid = blockDim.x * blockIdx.x + threadIdx.x; + + if(tid < n) { + dst[tid] = src[mapidx[tid]]; + } +} + +/** + * @brief Map elements of one array to the other + * + * for(int i = 0; i < n_; i++) { + * if(index_convert_extra_Diag2CSR_host_[i] != -1) + * vals[index_convert_extra_Diag2CSR_host_[i]] += M_->M()[M_->numberOfNonzeros() - n_ + i]; + * } + * + */ +template +__global__ void addToArrayKernelEVLOSER(T* dst, const T* src, const I* mapidx, I n, I nnz) +{ + I tid = blockDim.x * blockIdx.x + threadIdx.x; + + if(tid < n) { + if(mapidx[tid] != -1) dst[mapidx[tid]] += src[nnz - n + tid]; + } +} + +namespace hiop +{ +hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const int& nnz, hiopNlpFormulation* nlp) + : hiopLinSolverSymSparse(n, nnz, nlp), + index_convert_CSR2Triplet_host_{nullptr}, + index_convert_extra_Diag2CSR_host_{nullptr}, + index_convert_CSR2Triplet_device_{nullptr}, + index_convert_extra_Diag2CSR_device_{nullptr}, + n_{n}, + nnz_{0}, + factorizationSetupSucc_{0}, + is_first_call_{true} +{ + // Create ReSolve solver and allocate rhs temporary storage + solver_ = new ReSolve::RefactorizationSolver(n); + + // If memory space is device, allocate host mirror for HiOp's KKT matrix in triplet format + if(nlp_->options->GetString("mem_space") == "device") { + M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz); + } + + // Set verbosity of ReSolve based on HiOp verbosity + if(nlp_->options->GetInteger("verbosity_level") >= 3) { + solver_->set_silent_output(false); + } + + // Select matrix ordering + int ordering = 1; + std::string ord = nlp_->options->GetString("linear_solver_sparse_ordering"); + if(ord == "amd_ssparse") { + ordering = 0; + } else if(ord == "colamd_ssparse") { + ordering = 1; + } else { + nlp_->log->printf(hovWarning, "Ordering %s not compatible with cuSOLVER LU, using default ...\n", ord.c_str()); + ordering = 1; + } + solver_->ordering() = ordering; + nlp_->log->printf(hovSummary, "Ordering: %d\n", solver_->ordering()); + + // Select factorization + std::string fact; + fact = nlp_->options->GetString("resolve_factorization"); + if(fact != "klu") { + nlp_->log->printf(hovWarning, "Factorization %s not compatible with cuSOLVER LU, using default ...\n", fact.c_str()); + fact = "klu"; + } + solver_->fact() = fact; + nlp_->log->printf(hovSummary, "Factorization: %s\n", solver_->fact().c_str()); + + // Select refactorization + std::string refact; + refact = nlp_->options->GetString("resolve_refactorization"); + if(refact != "glu" && refact != "rf") { + nlp_->log->printf(hovWarning, "Refactorization %s not compatible with cuSOLVER LU, using default ...\n", refact.c_str()); + refact = "glu"; + } + solver_->refact() = refact; + nlp_->log->printf(hovSummary, "Refactorization: %s\n", solver_->refact().c_str()); + + // by default, dont use iterative refinement + std::string use_ir; + int maxit_test = nlp_->options->GetInteger("ir_inner_maxit"); + + if((maxit_test < 0) || (maxit_test > 1000)) { + nlp_->log->printf(hovWarning, + "Wrong maxit value: %d. Use int maxit value between 0 and 1000. Setting default (50) ...\n", + maxit_test); + maxit_test = 50; + } + use_ir = "no"; + if(maxit_test > 0) { + use_ir = "yes"; + solver_->enable_iterative_refinement(); + solver_->ir()->maxit() = maxit_test; + } + if(use_ir == "yes") { + if((refact == "rf")) { + solver_->ir()->restart() = nlp_->options->GetInteger("ir_inner_restart"); + + if((solver_->ir()->restart() < 0) || (solver_->ir()->restart() > 100)) { + nlp_->log->printf(hovWarning, + "Wrong restart value: %d. Use int restart value between 1 and 100. Setting default (20) ...\n", + solver_->ir()->restart()); + solver_->ir()->restart() = 20; + } + + solver_->ir()->tol() = nlp_->options->GetNumeric("ir_inner_tol"); + if((solver_->ir()->tol() < 0) || (solver_->ir()->tol() > 1)) { + nlp_->log->printf(hovWarning, + "Wrong tol value: %e. Use double tol value between 0 and 1. Setting default (1e-12) ...\n", + solver_->ir()->tol()); + solver_->ir()->tol() = 1e-12; + } + solver_->ir()->orth_option() = nlp_->options->GetString("ir_inner_gs_scheme"); + + /* 0) "Standard" GMRES and FGMRES (Saad and Schultz, 1986, Saad, 1992) use Modified Gram-Schmidt ("mgs") to keep the + * Krylov vectors orthogonal. Modified Gram-Schmidt requires k synchronization (due to inner products) in iteration k + * and this becomes a scaling bottleneck for GPU-accelerated implementation and it becomes even more pronouced for + * MPI+GPU-acceleration. Modified Gram-Schidt can be replaced by a different scheme. + * + * 1) One can use Classical Gram-Schmidt ("cgs") which is numerically unstable or reorthogonalized Classical + * Gram-Schmidt ("cgs2"), which is numerically stable and requires 3 synchrnozations and each iteration. + * Reorthogonalized Classical Gram-Schmidt makes two passes of Classical Gram-Schmidt. And two passes are enough to get + * vectors orthogonal to machine precision (Bjorck 1967). + * + * 2) An alternative is a low-sych version (Swirydowicz and Thomas, 2020), which reformulates Modified Gram-Schmidt to + * be a (very small) triangular solve. It requires extra storage for the matrix used in triangular solve (kxk at + * iteration k), but only two sycnhronizations are needed per iteration. The inner producats are performed in bulk, + * which quarantees better GPU utilization. The second synchronization comes from normalizing the vector and can be + * eliminated if the norm is postponed to the next iteration, but also makes code more complicated. This is why we use + * two-synch method ("mgs_two_synch") + * + * 3) A recently submitted paper by Stephen Thomas (Thomas 202*) takes the triangular solve idea further and uses a + * different approximation for the inverse of a triangular matrix. It requires two (very small) triangular solves and + * two sychroniztions (if the norm is NOT delayed). It also guarantees that the vectors are orthogonal to the machine + * epsilon, as in cgs2. Since Stephen's paper is named "post modern GMRES", we call this Gram-Schmidt scheme "mgs_pm". + */ + if(solver_->ir()->orth_option() != "mgs" && solver_->ir()->orth_option() != "cgs2" && + solver_->ir()->orth_option() != "mgs_two_synch" && solver_->ir()->orth_option() != "mgs_pm") { + nlp_->log->printf( + hovWarning, + "mgs option : %s is wrong. Use 'mgs', 'cgs2', 'mgs_two_synch' or 'mgs_pm'. Switching to default (mgs) ...\n", + use_ir.c_str()); + solver_->ir()->orth_option() = "mgs"; + } + + solver_->ir()->conv_cond() = nlp_->options->GetInteger("ir_inner_conv_cond"); + + if((solver_->ir()->conv_cond() < 0) || (solver_->ir()->conv_cond() > 2)) { + nlp_->log->printf(hovWarning, + "Wrong IR convergence condition: %d. Use int value: 0, 1 or 2. Setting default (0) ...\n", + solver_->ir()->conv_cond()); + solver_->ir()->conv_cond() = 0; + } + + } else { + nlp_->log->printf(hovWarning, "Currently, inner iterative refinement works ONLY with cuSolverRf ... \n"); + use_ir = "no"; + } + } + solver_->use_ir() = use_ir; + nlp_->log->printf(hovSummary, "Use IR: %s\n", solver_->use_ir().c_str()); +} // constructor + +hiopLinSolverSymSparseEVLOSER::~hiopLinSolverSymSparseEVLOSER() +{ + delete solver_; + + // If memory space is device, delete allocated host mirrors + if(nlp_->options->GetString("mem_space") == "device") { + delete M_host_; + } + + // Delete CSR <--> triplet mappings + delete[] index_convert_CSR2Triplet_host_; + delete[] index_convert_extra_Diag2CSR_host_; + checkCudaErrors(cudaFree(index_convert_CSR2Triplet_device_)); + checkCudaErrors(cudaFree(index_convert_extra_Diag2CSR_device_)); +} + +int hiopLinSolverSymSparseEVLOSER::matrixChanged() +{ + assert(n_ == M_->n() && M_->n() == M_->m()); + assert(n_ > 0); + + nlp_->runStats.linsolv.tmFactTime.start(); + + if(is_first_call_) { + firstCall(); + } else { + update_matrix_values(); + } + + if(factorizationSetupSucc_ == 0) { + int retval = solver_->factorize(); + if(retval == -1) { + nlp_->log->printf(hovWarning, "Numeric klu factorization failed. Regularizing ...\n"); + // This is not a catastrophic failure + // The matrix is singular so return -1 to regularaize! + return -1; + } else { // Numeric was succesfull so now can set up + solver_->setup_refactorization(); + factorizationSetupSucc_ = 1; + nlp_->log->printf(hovScalars, "Numeric klu factorization succesful! \n"); + } + } else { // factorizationSetupSucc_ == 1 + solver_->refactorize(); + } + + nlp_->runStats.linsolv.tmFactTime.stop(); + return 0; +} + +bool hiopLinSolverSymSparseEVLOSER::solve(hiopVector& x) +{ + assert(n_ == M_->n() && M_->n() == M_->m()); + assert(n_ > 0); + assert(x.get_size() == M_->n()); + + nlp_->runStats.linsolv.tmTriuSolves.start(); + + // Set IR tolerance + double ir_tol = nlp_->options->GetNumeric("ir_inner_tol"); + + std::string mem_space = nlp_->options->GetString("mem_space"); + double* dx = x.local_data(); + + bool retval = solver_->triangular_solve(dx, ir_tol, mem_space); + if(!retval) { + nlp_->log->printf(hovError, // catastrophic failure + "ReSolve triangular solver failed\n"); + } + + nlp_->runStats.linsolv.tmTriuSolves.stop(); + return true; +} + +void hiopLinSolverSymSparseEVLOSER::firstCall() +{ + assert(n_ == M_->n() && M_->n() == M_->m()); + assert(n_ > 0); + + // If the matrix is on device, copy it to the host mirror + std::string mem_space = nlp_->options->GetString("mem_space"); + if(mem_space == "device") { + checkCudaErrors(cudaMemcpy(M_host_->M(), M_->M(), sizeof(double) * M_->numberOfNonzeros(), cudaMemcpyDeviceToHost)); + checkCudaErrors( + cudaMemcpy(M_host_->i_row(), M_->i_row(), sizeof(index_type) * M_->numberOfNonzeros(), cudaMemcpyDeviceToHost)); + checkCudaErrors( + cudaMemcpy(M_host_->j_col(), M_->j_col(), sizeof(index_type) * M_->numberOfNonzeros(), cudaMemcpyDeviceToHost)); + } + + // Transfer triplet to CSR form + + // Allocate row pointers and compute number of nonzeros. + solver_->mat_A_csr()->allocate_size(n_); + compute_nnz(); + solver_->set_nnz(nnz_); + + // Allocate column indices and matrix values + solver_->mat_A_csr()->allocate_nnz(nnz_); + + // Set column indices and matrix values. + set_csr_indices_values(); + + // Copy matrix to device + solver_->mat_A_csr()->update_from_host_mirror(); + + if(solver_->use_ir() == "yes") { + solver_->setup_iterative_refinement_matrix(n_, nnz_); + } + /* + * initialize matrix factorization + */ + if(solver_->setup_factorization() < 0) { + nlp_->log->printf(hovError, // catastrophic failure + "Symbolic factorization failed!\n"); + return; + }; + is_first_call_ = false; +} + +/// nnz_ is number of nonzeros in CSR matrix +/// M_->numberOfNonzeros() is number of zeros in symmetric triplet matrix +void hiopLinSolverSymSparseEVLOSER::update_matrix_values() +{ + std::string mem_space = nlp_->options->GetString("mem_space"); + if(mem_space == "device") { + double* csr_vals = solver_->mat_A_csr()->get_vals(); + double* coo_vals = M_->M(); + int coo_nnz = M_->numberOfNonzeros(); + + const int blocksize = 512; + int gridsize = (nnz_ + blocksize - 1) / blocksize; + mapArraysKernelEVLOSER<<>>(csr_vals, coo_vals, index_convert_CSR2Triplet_device_, nnz_); + + gridsize = (n_ + blocksize - 1) / blocksize; + addToArrayKernelEVLOSER + <<>>(csr_vals, coo_vals, index_convert_extra_Diag2CSR_device_, n_, coo_nnz); + + // If factorization was not successful, we need a copy of values on the host + if(factorizationSetupSucc_ == 0) + checkCudaErrors(cudaMemcpy(solver_->mat_A_csr()->get_vals_host(), + solver_->mat_A_csr()->get_vals(), + sizeof(double) * nnz_, + cudaMemcpyDeviceToHost)); + + } else { + // KKT matrix is on the host + double* vals = solver_->mat_A_csr()->get_vals_host(); + // update matrix + for(int k = 0; k < nnz_; k++) { + vals[k] = M_->M()[index_convert_CSR2Triplet_host_[k]]; + } + for(int i = 0; i < n_; i++) { + if(index_convert_extra_Diag2CSR_host_[i] != -1) + vals[index_convert_extra_Diag2CSR_host_[i]] += M_->M()[M_->numberOfNonzeros() - n_ + i]; + } + checkCudaErrors(cudaMemcpy(solver_->mat_A_csr()->get_vals(), + solver_->mat_A_csr()->get_vals_host(), + sizeof(double) * nnz_, + cudaMemcpyHostToDevice)); + } +} + +/// @pre Data is either on the host or the host mirror is synced with the device +void hiopLinSolverSymSparseEVLOSER::compute_nnz() +{ + // + // compute nnz in each row + // + int* row_ptr = solver_->mat_A_csr()->get_irows_host(); + + // If the data is on device, fetch it from the host mirror + hiopMatrixSparse* M_host = nullptr; + std::string mem_space = nlp_->options->GetString("mem_space"); + if(mem_space == "host" || mem_space == "default") { + M_host = M_; + } else if(mem_space == "device") { + M_host = M_host_; + } else { + nlp_->log->printf(hovError, "Memory space %s incompatible with ReSolve.\n", mem_space.c_str()); + } + + // off-diagonal part + row_ptr[0] = 0; + for(int k = 0; k < M_host->numberOfNonzeros() - n_; k++) { + if(M_host->i_row()[k] != M_host->j_col()[k]) { + row_ptr[M_host->i_row()[k] + 1]++; + row_ptr[M_host->j_col()[k] + 1]++; + nnz_ += 2; + } + } + // diagonal part + for(int i = 0; i < n_; i++) { + row_ptr[i + 1]++; + nnz_ += 1; + } + // get correct row ptr index + for(int i = 1; i < n_ + 1; i++) { + row_ptr[i] += row_ptr[i - 1]; + } + assert(nnz_ == row_ptr[n_]); +} + +/// @pre Data is either on the host or the host mirror is synced with the device +void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() +{ + // If the data is on device, fetch it from the host mirror + hiopMatrixSparse* M_host = nullptr; + std::string mem_space = nlp_->options->GetString("mem_space"); + if(mem_space == "host" || mem_space == "default") { + M_host = M_; + } else if(mem_space == "device") { + M_host = M_host_; + } else { + nlp_->log->printf(hovError, "Memory space %s incompatible with ReSolve.\n", mem_space.c_str()); + } + + // + // set correct col index and value + // + const int* row_ptr = solver_->mat_A_csr()->get_irows_host(); + int* col_idx = solver_->mat_A_csr()->get_jcols_host(); + double* vals = solver_->mat_A_csr()->get_vals_host(); + + index_convert_CSR2Triplet_host_ = new int[nnz_]; + index_convert_extra_Diag2CSR_host_ = new int[n_]; + checkCudaErrors(cudaMalloc(&index_convert_CSR2Triplet_device_, nnz_ * sizeof(int))); + checkCudaErrors(cudaMalloc(&index_convert_extra_Diag2CSR_device_, n_ * sizeof(int))); + + int* nnz_each_row_tmp = new int[n_]{0}; + int total_nnz_tmp{0}, nnz_tmp{0}, rowID_tmp, colID_tmp; + + for(int k = 0; k < n_; k++) { + index_convert_extra_Diag2CSR_host_[k] = -1; + } + + for(int k = 0; k < M_host->numberOfNonzeros() - n_; k++) { + rowID_tmp = M_host->i_row()[k]; + colID_tmp = M_host->j_col()[k]; + if(rowID_tmp == colID_tmp) { + nnz_tmp = nnz_each_row_tmp[rowID_tmp] + row_ptr[rowID_tmp]; + col_idx[nnz_tmp] = colID_tmp; + vals[nnz_tmp] = M_host->M()[k]; + index_convert_CSR2Triplet_host_[nnz_tmp] = k; + + vals[nnz_tmp] += M_host->M()[M_host->numberOfNonzeros() - n_ + rowID_tmp]; + index_convert_extra_Diag2CSR_host_[rowID_tmp] = nnz_tmp; + + nnz_each_row_tmp[rowID_tmp]++; + total_nnz_tmp++; + } else { + nnz_tmp = nnz_each_row_tmp[rowID_tmp] + row_ptr[rowID_tmp]; + col_idx[nnz_tmp] = colID_tmp; + vals[nnz_tmp] = M_host->M()[k]; + index_convert_CSR2Triplet_host_[nnz_tmp] = k; + + nnz_tmp = nnz_each_row_tmp[colID_tmp] + row_ptr[colID_tmp]; + col_idx[nnz_tmp] = rowID_tmp; + vals[nnz_tmp] = M_host->M()[k]; + index_convert_CSR2Triplet_host_[nnz_tmp] = k; + + nnz_each_row_tmp[rowID_tmp]++; + nnz_each_row_tmp[colID_tmp]++; + total_nnz_tmp += 2; + } + } + // correct the missing dia_gonal term + for(int i = 0; i < n_; i++) { + if(nnz_each_row_tmp[i] != row_ptr[i + 1] - row_ptr[i]) { + assert(nnz_each_row_tmp[i] == row_ptr[i + 1] - row_ptr[i] - 1); + nnz_tmp = nnz_each_row_tmp[i] + row_ptr[i]; + col_idx[nnz_tmp] = i; + vals[nnz_tmp] = M_host->M()[M_host->numberOfNonzeros() - n_ + i]; + index_convert_CSR2Triplet_host_[nnz_tmp] = M_host->numberOfNonzeros() - n_ + i; + total_nnz_tmp += 1; + + std::vector ind_temp(row_ptr[i + 1] - row_ptr[i]); + std::iota(ind_temp.begin(), ind_temp.end(), 0); + std::sort(ind_temp.begin(), ind_temp.end(), [&](int a, int b) { + return col_idx[a + row_ptr[i]] < col_idx[b + row_ptr[i]]; + }); + + reorder(vals + row_ptr[i], ind_temp, row_ptr[i + 1] - row_ptr[i]); + reorder(index_convert_CSR2Triplet_host_ + row_ptr[i], ind_temp, row_ptr[i + 1] - row_ptr[i]); + std::sort(col_idx + row_ptr[i], col_idx + row_ptr[i + 1]); + } + } + checkCudaErrors(cudaMemcpy(index_convert_CSR2Triplet_device_, + index_convert_CSR2Triplet_host_, + nnz_ * sizeof(int), + cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(index_convert_extra_Diag2CSR_device_, + index_convert_extra_Diag2CSR_host_, + n_ * sizeof(int), + cudaMemcpyHostToDevice)); + delete[] nnz_each_row_tmp; +} + +// Error checking utility for CUDA +// KS: might later become part of src/Utils, putting it here for now +template +void hiopLinSolverSymSparseEVLOSER::hiopCheckCudaError(T result, const char* const file, int const line) +{ + if(result) { + nlp_->log->printf(hovError, "CUDA error at %s:%d, error# %d\n", file, line, result); + assert(false); + } +} + +} // namespace hiop diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp new file mode 100644 index 0000000..72202de --- /dev/null +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp @@ -0,0 +1,161 @@ +// +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file hiopLinSolverSparseReSolve.hpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#ifndef HIOP_LINSOLVER_EVLOSER +#define HIOP_LINSOLVER_EVLOSER + +#include "hiopLinSolver.hpp" +#include "hiopMatrixSparseTriplet.hpp" +#include + +/** implements the linear solver class using nvidia_ cuSolver (GLU + * refactorization) + * + * @ingroup LinearSolvers + */ + +namespace ReSolve +{ +// Forward declaration of inner IR class +class IterativeRefinement; +class MatrixCsr; +class RefactorizationSolver; +} // namespace ReSolve + +namespace hiop +{ + +class hiopLinSolverSymSparseEVLOSER : public hiopLinSolverSymSparse +{ +public: + // constructor + hiopLinSolverSymSparseEVLOSER(const int& n, const int& nnz, hiopNlpFormulation* nlp); + virtual ~hiopLinSolverSymSparseEVLOSER(); + + /** + * @brief Triggers a refactorization of the matrix, if necessary. + * Overload from base class. + * In this case, KLU (SuiteSparse) is used to refactor + */ + virtual int matrixChanged(); + + /** + * @brief Solves a linear system. + * + * @param x is on entry the right hand side(s) of the system to be solved. + * + * @post On exit `x` is overwritten with the solution(s). + */ + virtual bool solve(hiopVector& x_); + + /** Multiple rhs not supported yet */ + virtual bool solve(hiopMatrix& /* x */) + { + assert(false && "not yet supported"); + return false; + } + +protected: + ReSolve::RefactorizationSolver* solver_; + + int m_; ///< number of rows of the whole matrix + int n_; ///< number of cols of the whole matrix + int nnz_; ///< number of nonzeros in the matrix + + // Mapping on the host + int* index_convert_CSR2Triplet_host_; + int* index_convert_extra_Diag2CSR_host_; + + // Mapping on the device + int* index_convert_CSR2Triplet_device_; + int* index_convert_extra_Diag2CSR_device_; + + // Algorithm control flags + int factorizationSetupSucc_; + bool is_first_call_; + + hiopMatrixSparse* M_host_{nullptr}; ///< Host mirror for the KKT matrix + + /* private function: creates a cuSolver data structure from KLU data + * structures. */ + + /** called the very first time a matrix is factored. Perform KLU + * factorization, allocate all aux variables + * + * @note Converts HiOp triplet matrix to CSR format. + */ + virtual void firstCall(); + + /** + * @brief Updates matrix values from HiOp object. + * + * @note This function maps data from HiOp supplied matrix M_ to data structures + * used by the linear solver. + */ + void update_matrix_values(); + + /** Function to compute nnz and set row pointers */ + void compute_nnz(); + /** Function to compute column indices and matrix values arrays */ + void set_csr_indices_values(); + + template + void hiopCheckCudaError(T result, const char* const file, int const line); +}; + +} // namespace hiop + +#endif From 6a5b291d0cd0f5ac66b982ad2067caabf8b8cf5d Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:01 -0400 Subject: [PATCH 02/28] Add EVLOSER sparse solver option --- src/Utils/hiopOptions.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/Utils/hiopOptions.cpp b/src/Utils/hiopOptions.cpp index ec63f68..77cbdc7 100644 --- a/src/Utils/hiopOptions.cpp +++ b/src/Utils/hiopOptions.cpp @@ -922,7 +922,7 @@ void hiopOptionsNLP::register_options() // - 'gpu' compute mode: work in progress { - vector range{"auto", "ma57", "pardiso", "strumpack", "resolve", "ginkgo", "cusolver-chol"}; + vector range{"auto", "ma57", "pardiso", "strumpack", "resolve", "evloser", "ginkgo", "cusolver-chol"}; register_str_option("linear_solver_sparse", "auto", @@ -936,7 +936,7 @@ void hiopOptionsNLP::register_options() // - when GPU mode is on, STRUMPACK is chosen by 'auto' if available // - choosing option ma57 or pardiso with GPU being on, it results in no device being used in the linear solve! { - vector range{"auto", "ma57", "pardiso", "resolve", "strumpack", "ginkgo"}; + vector range{"auto", "ma57", "pardiso", "resolve", "evloser", "strumpack", "ginkgo"}; register_str_option("duals_init_linear_solver_sparse", "auto", @@ -1402,7 +1402,7 @@ void hiopOptionsNLP::ensure_consistence() auto kkt_linsys = GetString("KKTLinsys"); auto sol_sp = GetString("linear_solver_sparse"); if(kkt_linsys == "full") { - if(sol_sp != "resolve" && sol_sp != "pardiso" && sol_sp != "strumpack" && sol_sp != "auto") { + if(sol_sp != "resolve" && sol_sp != "evloser" && sol_sp != "pardiso" && sol_sp != "strumpack" && sol_sp != "auto") { if(is_user_defined("linear_solver_sparse")) { log_printf(hovWarning, "The option 'linear_solver_sparse=%s' is not valid with option 'KKTLinsys=full'. " @@ -1426,7 +1426,7 @@ void hiopOptionsNLP::ensure_consistence() } #ifndef HIOP_USE_CUDA - if(sol_sp == "resolve" || sol_sp == "cusolver-chol") { + if(sol_sp == "resolve" || sol_sp == "evloser" || sol_sp == "cusolver-chol") { if(is_user_defined("linear_solver_sparse")) { log_printf(hovWarning, "The option 'linear_solver_sparse=%s' is not valid without CUDA support enabled." @@ -1559,7 +1559,8 @@ void hiopOptionsNLP::ensure_consistence() } set_val("fact_acceptor", "inertia_free"); } - } else if(GetString("linear_solver_sparse") == "strumpack" || GetString("linear_solver_sparse") == "resolve") { + } else if(GetString("linear_solver_sparse") == "strumpack" || GetString("linear_solver_sparse") == "resolve" || + GetString("linear_solver_sparse") == "evloser") { if(GetString("fact_acceptor") == "inertia_correction") { if(is_user_defined("fact_acceptor") && is_user_defined("linear_solver_sparse")) { log_printf(hovWarning, From a06e10b63a3188e4e6d78d3ac13196c56c4f3802 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:01 -0400 Subject: [PATCH 03/28] Wire EVLOSER through sparse KKT systems --- src/Optimization/hiopDualsUpdater.cpp | 7 +++- src/Optimization/hiopKKTLinSysSparse.cpp | 47 ++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/src/Optimization/hiopDualsUpdater.cpp b/src/Optimization/hiopDualsUpdater.cpp index 0ec4bbf..45d62e4 100644 --- a/src/Optimization/hiopDualsUpdater.cpp +++ b/src/Optimization/hiopDualsUpdater.cpp @@ -73,6 +73,7 @@ #endif #ifdef HIOP_USE_RESOLVE #include "hiopLinSolverSparseReSolve.hpp" +#include "hiopLinSolverSparseEVLOSER.hpp" #endif #ifdef HIOP_USE_GINKGO #include "hiopLinSolverSparseGinkgo.hpp" @@ -428,7 +429,7 @@ bool hiopDualsLsqUpdateLinsysAugSparse::instantiate_linear_solver(const char* li #ifdef HIOP_USE_RESOLVE if(compute_mode == "gpu") { - assert((linear_solver == "resolve" || linear_solver == "auto") && + assert((linear_solver == "resolve" || linear_solver == "evloser" || linear_solver == "auto") && "the value for duals_init_linear_solver_sparse is invalid and should have been corrected during " "options processing"); } @@ -444,6 +445,10 @@ bool hiopDualsLsqUpdateLinsysAugSparse::instantiate_linear_solver(const char* li ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: ReSolve "; lin_sys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); } + if(linear_solver == "evloser") { + ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: EVLOSER "; + lin_sys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + } #else // of #ifdef HIOP_USE_RESOLVE // under compute mode gpu, at this point we don't have a sparse linear solver if(compute_mode == "gpu") { diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp index e9af09d..ce8f843 100644 --- a/src/Optimization/hiopKKTLinSysSparse.cpp +++ b/src/Optimization/hiopKKTLinSysSparse.cpp @@ -59,6 +59,7 @@ #endif #ifdef HIOP_USE_RESOLVE #include "hiopLinSolverSparseReSolve.hpp" +#include "hiopLinSolverSparseEVLOSER.hpp" #endif #ifdef HIOP_USE_GINKGO #include "hiopLinSolverSparseGinkgo.hpp" @@ -376,6 +377,21 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXYcYd::determineAndCreateLi #endif } + if(nullptr == linSys_ && linear_solver == "evloser") { +#if defined(HIOP_USE_RESOLVE) + linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + linsol_actual = "EVLOSER"; + auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); + if(fact_acceptor_ic) { + nlp_->log->printf(hovError, + "KKT_SPARSE_XYcYd linsys with EVLOSER does not support inertia correction. " + "Please set option 'fact_acceptor' to 'inertia_free'.\n"); + assert(false); + return nullptr; + } +#endif + } + if((nullptr == linSys_ && linear_solver == "auto") || linear_solver == "strumpack") { #if defined(HIOP_USE_STRUMPACK) linSys_ = new hiopLinSolverSymSparseSTRUMPACK(n, nnz, nlp_); @@ -757,6 +773,21 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL #endif } // end resolve + if(nullptr == linSys_ && linear_solver == "evloser") { +#if defined(HIOP_USE_RESOLVE) + actual_lin_solver = "EVLOSER"; + linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); + if(fact_acceptor_ic) { + nlp_->log->printf(hovError, + "KKT_SPARSE_XDYcYd linsys with EVLOSER does not support inertia correction. " + "Please set option 'fact_acceptor' to 'inertia_free'.\n"); + assert(false); + return nullptr; + } +#endif + } // end evloser + if(nullptr == linSys_ && (linear_solver == "strumpack" || linear_solver == "auto")) { #if defined(HIOP_USE_STRUMPACK) actual_lin_solver = "STRUMPACK"; @@ -828,6 +859,22 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL } #endif } // end resolve + + if(nullptr == linSys_ && linear_solver == "evloser") { +#if defined(HIOP_USE_RESOLVE) + linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + nlp_->log->printf(hovScalars, "KKT_SPARSE_XDYcYd linsys: alloc EVLOSER size %d (%d cons) (gpu)\n", n, neq + nineq); + auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); + if(fact_acceptor_ic) { + nlp_->log->printf(hovError, + "KKT_SPARSE_XDYcYd linsys with EVLOSER does not support inertia correction. " + "Please set option 'fact_acceptor' to 'inertia_free'.\n"); + assert(false); + return nullptr; + } +#endif + } + } // end of compute mode gpu } assert(linSys_ && "KKT_SPARSE_XDYcYd linsys: cannot instantiate backend linear solver"); From 68f99f08001e60126d67e7b0fdbe3f5566e09d30 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:01 -0400 Subject: [PATCH 04/28] Add EVLOSER sparse solver coverage --- src/Drivers/Sparse/CMakeLists.txt | 1 + src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp | 41 ++++++++++++++++--- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt index a1506b2..a08f353 100644 --- a/src/Drivers/Sparse/CMakeLists.txt +++ b/src/Drivers/Sparse/CMakeLists.txt @@ -86,6 +86,7 @@ endif(HIOP_USE_GINKGO) if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_CUDA AND HIOP_USE_RESOLVE) add_test(NAME NlpSparseRaja2_1 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-resolve_cuda_glu") add_test(NAME NlpSparseRaja2_2 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-resolve_cuda_rf") + add_test(NAME NlpSparseRaja2_3 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-evloser_cuda_rf") endif() add_test(NAME NlpSparse3_1 COMMAND ${RUNCMD} "$" "500" "-selfcheck") diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp index d82aa66..579837d 100644 --- a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp @@ -20,6 +20,7 @@ static bool parse_arguments(int argc, bool& inertia_free, bool& use_resolve_cuda_glu, bool& use_resolve_cuda_rf, + bool& use_evloser_cuda_rf, bool& use_ginkgo, bool& use_ginkgo_cuda, bool& use_ginkgo_hip) @@ -29,6 +30,7 @@ static bool parse_arguments(int argc, inertia_free = false; use_resolve_cuda_glu = false; use_resolve_cuda_rf = false; + use_evloser_cuda_rf = false; use_ginkgo = false; use_ginkgo_cuda = false; use_ginkgo_hip = false; @@ -47,6 +49,8 @@ static bool parse_arguments(int argc, use_resolve_cuda_glu = true; } else if(std::string(argv[4]) == "-resolve_cuda_rf") { use_resolve_cuda_rf = true; + } else if(std::string(argv[4]) == "-evloser_cuda_rf") { + use_evloser_cuda_rf = true; } else if(std::string(argv[4]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[4]) == "-ginkgo_cuda") { @@ -72,6 +76,8 @@ static bool parse_arguments(int argc, use_resolve_cuda_glu = true; } else if(std::string(argv[3]) == "-resolve_cuda_rf") { use_resolve_cuda_rf = true; + } else if(std::string(argv[3]) == "-evloser_cuda_rf") { + use_evloser_cuda_rf = true; } else if(std::string(argv[3]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[3]) == "-ginkgo_cuda") { @@ -97,6 +103,8 @@ static bool parse_arguments(int argc, use_resolve_cuda_glu = true; } else if(std::string(argv[2]) == "-resolve_cuda_rf") { use_resolve_cuda_rf = true; + } else if(std::string(argv[2]) == "-evloser_cuda_rf") { + use_evloser_cuda_rf = true; } else if(std::string(argv[2]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[2]) == "-ginkgo_cuda") { @@ -122,6 +130,8 @@ static bool parse_arguments(int argc, use_resolve_cuda_glu = true; } else if(std::string(argv[1]) == "-resolve_cuda_rf") { use_resolve_cuda_rf = true; + } else if(std::string(argv[1]) == "-evloser_cuda_rf") { + use_evloser_cuda_rf = true; } else if(std::string(argv[1]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[1]) == "-ginkgo_cuda") { @@ -153,21 +163,33 @@ static bool parse_arguments(int argc, printf("Using default instead of ReSolve ...\n"); use_resolve_cuda_rf = false; } + if(use_evloser_cuda_rf) { + printf("HiOp built without CUDA support. "); + printf("Using default instead of EVLOSER ...\n"); + use_evloser_cuda_rf = false; + } #endif // If ReSolve was selected, but inertia free approach was not, add inertia-free - if((use_resolve_cuda_glu || use_resolve_cuda_rf) && !(inertia_free)) { + if((use_resolve_cuda_glu || use_resolve_cuda_rf || use_evloser_cuda_rf) && !(inertia_free)) { inertia_free = true; printf("LU solver from ReSolve library requires inertia free approach. "); printf("Enabling now ...\n"); } - if(use_resolve_cuda_glu && use_resolve_cuda_rf) { + if(use_resolve_cuda_glu && (use_resolve_cuda_rf || use_evloser_cuda_rf)) { use_resolve_cuda_rf = false; - printf("You can select either GLU or Rf refactorization with ReSolve, not both. "); + use_evloser_cuda_rf = false; + printf("You can select either GLU or Rf refactorization, not both. "); printf("Using default GLU refactorization ...\n"); } + if(use_resolve_cuda_rf && use_evloser_cuda_rf) { + use_evloser_cuda_rf = false; + printf("You can select either ReSolve or EVLOSER, not both. "); + printf("Using ReSolve ...\n"); + } + // If Ginkgo is not available, de-select it. #ifndef HIOP_USE_GINKGO if(use_ginkgo) { @@ -203,6 +225,9 @@ static void usage(const char* exeName) printf( " '-use_resolve_cuda_rf' : use ReSolve linear solver with KLU factorization and cusolverRf refactorization " "[optional]\n"); + printf( + " '-evloser_cuda_rf' : use EVLOSER linear solver with KLU factorization and cusolverRf refactorization " + "[optional]\n"); printf(" '-ginkgo': use GINKGO linear solver [optional]\n"); } @@ -235,6 +260,7 @@ int main(int argc, char** argv) bool inertia_free = false; bool use_resolve_cuda_glu = false; bool use_resolve_cuda_rf = false; + bool use_evloser_cuda_rf = false; bool use_ginkgo = false; bool use_ginkgo_cuda = false; bool use_ginkgo_hip = false; @@ -245,6 +271,7 @@ int main(int argc, char** argv) inertia_free, use_resolve_cuda_glu, use_resolve_cuda_rf, + use_evloser_cuda_rf, use_ginkgo, use_ginkgo_cuda, use_ginkgo_hip)) { @@ -270,8 +297,12 @@ int main(int argc, char** argv) // only support cusolverLU right now, 2023.02.28 // lsq initialization of the duals fails for this example since the Jacobian is rank deficient // use zero initialization - nlp.options->SetStringValue("linear_solver_sparse", "resolve"); - if(use_resolve_cuda_rf) { + if(use_evloser_cuda_rf) { + nlp.options->SetStringValue("linear_solver_sparse", "evloser"); + } else { + nlp.options->SetStringValue("linear_solver_sparse", "resolve"); + } + if(use_resolve_cuda_rf || use_evloser_cuda_rf) { nlp.options->SetStringValue("resolve_refactorization", "rf"); nlp.options->SetIntegerValue("ir_inner_maxit", 20); nlp.options->SetIntegerValue("ir_outer_maxit", 0); From bde1eedc862984e05b3e44e726f17f13939d15df Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:01 -0400 Subject: [PATCH 05/28] Add EVLOSER sparse driver option --- src/Drivers/Sparse/NlpSparseEx1Driver.cpp | 24 +++++++++++++++--- src/Drivers/Sparse/NlpSparseEx2Driver.cpp | 30 ++++++++++++++++++----- 2 files changed, 44 insertions(+), 10 deletions(-) diff --git a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp index 4136490..242f9dd 100644 --- a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp @@ -16,6 +16,7 @@ static bool parse_arguments(int argc, bool& self_check, bool& use_pardiso, bool& use_cusolver, + bool& use_evloser, bool& use_ginkgo, bool& use_ginkgo_cuda, bool& use_ginkgo_hip, @@ -23,6 +24,7 @@ static bool parse_arguments(int argc, { self_check = false; use_pardiso = false; + use_evloser = false; use_ginkgo = false; use_ginkgo_cuda = false; use_ginkgo_cuda = false; @@ -50,6 +52,8 @@ static bool parse_arguments(int argc, use_pardiso = true; } else if(std::string(argv[4]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[4]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[4]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[4]) == "-ginkgo_cuda") { @@ -70,6 +74,8 @@ static bool parse_arguments(int argc, use_pardiso = true; } else if(std::string(argv[3]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[3]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[3]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[3]) == "-ginkgo_cuda") { @@ -90,6 +96,8 @@ static bool parse_arguments(int argc, use_pardiso = true; } else if(std::string(argv[2]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[2]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[2]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[2]) == "-ginkgo_cuda") { @@ -116,7 +124,7 @@ static bool parse_arguments(int argc, scal = 1.0; } - if(use_cusolver && use_pardiso) { + if((use_cusolver || use_evloser) && use_pardiso) { printf("Selected both, cuSOLVER and Pardiso. "); printf("You can select only one linear solver.\n\n"); return false; @@ -140,10 +148,11 @@ static bool parse_arguments(int argc, // If HiOp is built without CUDA de-select cuSOLVER. #ifndef HIOP_USE_RESOLVE - if(use_cusolver) { + if(use_cusolver || use_evloser) { printf("HiOp built without support for ReSolve. "); printf("Using default linear solver ...\n"); use_cusolver = false; + use_evloser = false; } #endif @@ -164,6 +173,7 @@ static void usage(const char* exeName) " '-pardiso' or '-cusolver': use Pardiso or cuSOLVER " "as the linear solver [optional]\n"); printf(" '-cusolver': use cuSOLVER as the linear solver [optional]\n"); + printf(" '-evloser': use EVLOSER as the linear solver [optional]\n"); printf(" '-fr': force to reset feasibility in the 1st iteration [optional]\n"); printf( " '-selfcheck': compares the optimal objective with a previously saved value for the " @@ -189,6 +199,7 @@ int main(int argc, char** argv) bool selfCheck = false; bool use_pardiso = false; bool use_cusolver = false; + bool use_evloser = false; bool use_ginkgo = false; bool use_ginkgo_cuda = false; bool use_ginkgo_hip = false; @@ -203,6 +214,7 @@ int main(int argc, char** argv) selfCheck, use_pardiso, use_cusolver, + use_evloser, use_ginkgo, use_ginkgo_cuda, use_ginkgo_hip, @@ -230,10 +242,14 @@ int main(int argc, char** argv) if(use_pardiso) { nlp.options->SetStringValue("linear_solver_sparse", "pardiso"); } - if(use_cusolver) { + if(use_cusolver || use_evloser) { nlp.options->SetStringValue("duals_init", "zero"); nlp.options->SetStringValue("linsol_mode", "speculative"); - nlp.options->SetStringValue("linear_solver_sparse", "resolve"); + if(use_evloser) { + nlp.options->SetStringValue("linear_solver_sparse", "evloser"); + } else { + nlp.options->SetStringValue("linear_solver_sparse", "resolve"); + } nlp.options->SetStringValue("resolve_refactorization", "rf"); nlp.options->SetIntegerValue("ir_inner_maxit", 100); nlp.options->SetNumericValue("ir_inner_tol", 1e-8); diff --git a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp index e664635..620a1c5 100644 --- a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp @@ -16,6 +16,7 @@ static bool parse_arguments(int argc, bool& inertia_free, bool& use_cusolver, bool& use_resolve, + bool& use_evloser, bool& use_ginkgo, bool& use_ginkgo_cuda, bool& use_ginkgo_hip) @@ -25,6 +26,7 @@ static bool parse_arguments(int argc, inertia_free = false; use_cusolver = false; use_resolve = false; + use_evloser = false; use_ginkgo = false; use_ginkgo_cuda = false; use_ginkgo_cuda = false; @@ -41,6 +43,8 @@ static bool parse_arguments(int argc, inertia_free = true; } else if(std::string(argv[4]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[4]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[4]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[4]) == "-ginkgo_cuda") { @@ -64,6 +68,8 @@ static bool parse_arguments(int argc, inertia_free = true; } else if(std::string(argv[3]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[3]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[3]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[3]) == "-ginkgo_cuda") { @@ -87,6 +93,8 @@ static bool parse_arguments(int argc, inertia_free = true; } else if(std::string(argv[2]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[2]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[2]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[2]) == "-ginkgo_cuda") { @@ -110,6 +118,8 @@ static bool parse_arguments(int argc, inertia_free = true; } else if(std::string(argv[1]) == "-cusolver") { use_cusolver = true; + } else if(std::string(argv[1]) == "-evloser") { + use_evloser = true; } else if(std::string(argv[1]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[1]) == "-ginkgo_cuda") { @@ -131,24 +141,25 @@ static bool parse_arguments(int argc, // If CUDA is not available, de-select cuSOLVER #ifndef HIOP_USE_CUDA - if(use_cusolver) { + if(use_cusolver || use_evloser) { printf("HiOp built without CUDA support. "); - printf("Using default instead of cuSOLVER ...\n"); + printf("Using default instead of cuSOLVER/EVLOSER ...\n"); use_cusolver = false; + use_evloser = false; } #endif // Use cuSOLVER's LU factorization, if it was configured #ifdef HIOP_USE_RESOLVE - if(use_cusolver) { + if(use_cusolver || use_evloser) { use_resolve = true; } #endif // If cuSOLVER was selected, but inertia free approach was not, add inertia-free - if(use_cusolver && !(inertia_free)) { + if((use_cusolver || use_evloser) && !(inertia_free)) { inertia_free = true; - printf("LU solver from cuSOLVER library requires inertia free approach. "); + printf("LU solver from ReSolve library requires inertia free approach. "); printf("Enabling now ...\n"); } @@ -182,6 +193,7 @@ static void usage(const char* exeName) " '-selfcheck': compares the optimal objective with a previously saved value for the " "problem specified by 'problem_size'. [optional]\n"); printf(" '-cusolver': use cuSOLVER linear solver [optional]\n"); + printf(" '-evloser': use EVLOSER linear solver [optional]\n"); printf(" '-ginkgo': use GINKGO linear solver [optional]\n"); } @@ -206,6 +218,7 @@ int main(int argc, char** argv) bool inertia_free = false; bool use_cusolver = false; bool use_resolve = false; + bool use_evloser = false; bool use_ginkgo = false; bool use_ginkgo_cuda = false; bool use_ginkgo_hip = false; @@ -216,6 +229,7 @@ int main(int argc, char** argv) inertia_free, use_cusolver, use_resolve, + use_evloser, use_ginkgo, use_ginkgo_cuda, use_ginkgo_hip)) { @@ -246,7 +260,11 @@ int main(int argc, char** argv) if(use_resolve) { nlp.options->SetStringValue("duals_init", "zero"); nlp.options->SetStringValue("linsol_mode", "speculative"); - nlp.options->SetStringValue("linear_solver_sparse", "resolve"); + if(use_evloser) { + nlp.options->SetStringValue("linear_solver_sparse", "evloser"); + } else { + nlp.options->SetStringValue("linear_solver_sparse", "resolve"); + } nlp.options->SetStringValue("resolve_refactorization", "rf"); nlp.options->SetStringValue("compute_mode", "hybrid"); nlp.options->SetIntegerValue("ir_outer_maxit", 0); From fef13b16b89d439d54a8c12568ae4944cac80222 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:01 -0400 Subject: [PATCH 06/28] Clean EVLOSER sparse solver wrapper --- src/LinAlg/hiopLinSolverSparseEVLOSER.cpp | 19 ++++++++++--------- src/LinAlg/hiopLinSolverSparseEVLOSER.hpp | 6 +++--- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp index 2ae0384..6f25601 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp @@ -46,7 +46,7 @@ // endorsement purposes. /** - * @file hiopLinSolverSparseReSolve.cpp + * @file hiopLinSolverSparseEVLOSER.cpp * * @author Kasia Swirydowicz , PNNL * @author Slaven Peles , ORNL @@ -112,12 +112,13 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const index_convert_extra_Diag2CSR_host_{nullptr}, index_convert_CSR2Triplet_device_{nullptr}, index_convert_extra_Diag2CSR_device_{nullptr}, + m_{n}, n_{n}, nnz_{0}, factorizationSetupSucc_{0}, is_first_call_{true} { - // Create ReSolve solver and allocate rhs temporary storage + // Create embedded ReSolve refactorization solver for the EVLOSER wrapper solver_ = new ReSolve::RefactorizationSolver(n); // If memory space is device, allocate host mirror for HiOp's KKT matrix in triplet format @@ -125,7 +126,7 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz); } - // Set verbosity of ReSolve based on HiOp verbosity + // Set embedded solver verbosity based on HiOp verbosity if(nlp_->options->GetInteger("verbosity_level") >= 3) { solver_->set_silent_output(false); } @@ -138,7 +139,7 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const } else if(ord == "colamd_ssparse") { ordering = 1; } else { - nlp_->log->printf(hovWarning, "Ordering %s not compatible with cuSOLVER LU, using default ...\n", ord.c_str()); + nlp_->log->printf(hovWarning, "Ordering %s not compatible with EVLOSER sparse solver, using default ...\n", ord.c_str()); ordering = 1; } solver_->ordering() = ordering; @@ -148,7 +149,7 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const std::string fact; fact = nlp_->options->GetString("resolve_factorization"); if(fact != "klu") { - nlp_->log->printf(hovWarning, "Factorization %s not compatible with cuSOLVER LU, using default ...\n", fact.c_str()); + nlp_->log->printf(hovWarning, "Factorization %s not compatible with EVLOSER sparse solver, using default ...\n", fact.c_str()); fact = "klu"; } solver_->fact() = fact; @@ -158,7 +159,7 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const std::string refact; refact = nlp_->options->GetString("resolve_refactorization"); if(refact != "glu" && refact != "rf") { - nlp_->log->printf(hovWarning, "Refactorization %s not compatible with cuSOLVER LU, using default ...\n", refact.c_str()); + nlp_->log->printf(hovWarning, "Refactorization %s not compatible with EVLOSER sparse solver, using default ...\n", refact.c_str()); refact = "glu"; } solver_->refact() = refact; @@ -315,7 +316,7 @@ bool hiopLinSolverSymSparseEVLOSER::solve(hiopVector& x) bool retval = solver_->triangular_solve(dx, ir_tol, mem_space); if(!retval) { nlp_->log->printf(hovError, // catastrophic failure - "ReSolve triangular solver failed\n"); + "EVLOSER triangular solve failed\n"); } nlp_->runStats.linsolv.tmTriuSolves.stop(); @@ -426,7 +427,7 @@ void hiopLinSolverSymSparseEVLOSER::compute_nnz() } else if(mem_space == "device") { M_host = M_host_; } else { - nlp_->log->printf(hovError, "Memory space %s incompatible with ReSolve.\n", mem_space.c_str()); + nlp_->log->printf(hovError, "Memory space %s incompatible with EVLOSER.\n", mem_space.c_str()); } // off-diagonal part @@ -461,7 +462,7 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() } else if(mem_space == "device") { M_host = M_host_; } else { - nlp_->log->printf(hovError, "Memory space %s incompatible with ReSolve.\n", mem_space.c_str()); + nlp_->log->printf(hovError, "Memory space %s incompatible with EVLOSER.\n", mem_space.c_str()); } // diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp index 72202de..dc0e86f 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp @@ -47,7 +47,7 @@ // endorsement purposes. /** - * @file hiopLinSolverSparseReSolve.hpp + * @file hiopLinSolverSparseEVLOSER.hpp * * @author Kasia Swirydowicz , PNNL * @author Slaven Peles , ORNL @@ -61,8 +61,8 @@ #include "hiopMatrixSparseTriplet.hpp" #include -/** implements the linear solver class using nvidia_ cuSolver (GLU - * refactorization) +/** Implements the sparse linear solver class using the EVLOSER interface + * to the embedded ReSolve backend. * * @ingroup LinearSolvers */ From dfe6c71c2d573c7e2348cedae70aea095daf3484 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:01 -0400 Subject: [PATCH 07/28] Add EVLOSER sparse driver tests --- src/Drivers/Sparse/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt index a08f353..0c1e2c2 100644 --- a/src/Drivers/Sparse/CMakeLists.txt +++ b/src/Drivers/Sparse/CMakeLists.txt @@ -56,6 +56,9 @@ add_test(NAME NlpSparse1_2 COMMAND ${RUNCMD} "$" " if(HIOP_USE_CUDA) add_test(NAME NlpSparse1_3 COMMAND ${RUNCMD} "$" "500" "-cusolver" "-selfcheck") endif(HIOP_USE_CUDA) +if(HIOP_USE_CUDA AND HIOP_USE_RESOLVE) + add_test(NAME NlpSparse1_EVLOSER COMMAND ${RUNCMD} "$" "500" "-evloser" "-selfcheck") +endif() if(HIOP_USE_PARDISO) add_test(NAME NlpSparse1_4 COMMAND ${RUNCMD} "$" "500" "-pardiso" "-selfcheck") endif(HIOP_USE_PARDISO) @@ -73,6 +76,9 @@ add_test(NAME NlpSparse2_2 COMMAND ${RUNCMD} "$" " if(HIOP_USE_CUDA) add_test(NAME NlpSparse2_3 COMMAND ${RUNCMD} "$" "500" "-cusolver" "-inertiafree" "-selfcheck") endif(HIOP_USE_CUDA) +if(HIOP_USE_CUDA AND HIOP_USE_RESOLVE) + add_test(NAME NlpSparse2_EVLOSER COMMAND ${RUNCMD} "$" "500" "-evloser" "-inertiafree" "-selfcheck") +endif() if(HIOP_USE_GINKGO) add_test(NAME NlpSparse2_4 COMMAND ${RUNCMD} "$" "500" "-ginkgo" "-inertiafree" "-selfcheck") if(HIOP_USE_CUDA) From 5e91f10a4d1bc29ed522f1b1d4777bddb2ab9bed Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:01 -0400 Subject: [PATCH 08/28] Add isolated EVLOSER embedded backend --- src/LinAlg/CMakeLists.txt | 11 +- src/LinAlg/EVLOSER/CMakeLists.txt | 22 + src/LinAlg/EVLOSER/IterativeRefinement.cpp | 697 +++++++++++++++++ src/LinAlg/EVLOSER/IterativeRefinement.hpp | 176 +++++ src/LinAlg/EVLOSER/KrylovSolverKernels.cu | 215 ++++++ src/LinAlg/EVLOSER/KrylovSolverKernels.h | 70 ++ src/LinAlg/EVLOSER/MatrixCsr.cpp | 143 ++++ src/LinAlg/EVLOSER/MatrixCsr.hpp | 56 ++ src/LinAlg/EVLOSER/RefactorizationSolver.cpp | 764 +++++++++++++++++++ src/LinAlg/EVLOSER/RefactorizationSolver.hpp | 244 ++++++ src/LinAlg/EVLOSER/resolve_cusolver_defs.hpp | 132 ++++ src/LinAlg/hiopLinSolverSparseEVLOSER.cpp | 7 +- src/LinAlg/hiopLinSolverSparseEVLOSER.hpp | 6 +- 13 files changed, 2532 insertions(+), 11 deletions(-) create mode 100644 src/LinAlg/EVLOSER/CMakeLists.txt create mode 100644 src/LinAlg/EVLOSER/IterativeRefinement.cpp create mode 100644 src/LinAlg/EVLOSER/IterativeRefinement.hpp create mode 100644 src/LinAlg/EVLOSER/KrylovSolverKernels.cu create mode 100644 src/LinAlg/EVLOSER/KrylovSolverKernels.h create mode 100644 src/LinAlg/EVLOSER/MatrixCsr.cpp create mode 100644 src/LinAlg/EVLOSER/MatrixCsr.hpp create mode 100644 src/LinAlg/EVLOSER/RefactorizationSolver.cpp create mode 100644 src/LinAlg/EVLOSER/RefactorizationSolver.hpp create mode 100644 src/LinAlg/EVLOSER/resolve_cusolver_defs.hpp diff --git a/src/LinAlg/CMakeLists.txt b/src/LinAlg/CMakeLists.txt index 6a2a3d9..482d0f1 100644 --- a/src/LinAlg/CMakeLists.txt +++ b/src/LinAlg/CMakeLists.txt @@ -154,11 +154,12 @@ if(HIOP_SPARSE) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_PARDISO_SRC}) endif(HIOP_USE_PARDISO) if(HIOP_USE_RESOLVE) - add_subdirectory(ReSolve) + add_subdirectory(ReSolve) + add_subdirectory(EVLOSER) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_CUSOLVER_LU_SRC}) - list(APPEND hiopLinAlg_SRC ${hiopLinAlg_EVLOSER_SRC}) + list(APPEND hiopLinAlg_SRC ${hiopLinAlg_EVLOSER_SRC}) set_source_files_properties(${hiopLinAlg_CUSOLVER_LU_SRC} PROPERTIES LANGUAGE CUDA) - set_source_files_properties(${hiopLinAlg_EVLOSER_SRC} PROPERTIES LANGUAGE CUDA) + set_source_files_properties(${hiopLinAlg_EVLOSER_SRC} PROPERTIES LANGUAGE CUDA) endif(HIOP_USE_RESOLVE) if(HIOP_USE_CUDA) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_CUSOLVER_CHOL_SRC}) @@ -227,7 +228,7 @@ install( add_library(hiopLinAlg OBJECT ${hiopLinAlg_SRC}) if(HIOP_USE_RESOLVE) - target_link_libraries(hiop_tpl INTERFACE ReSolve) - install(TARGETS ReSolve EXPORT hiop-targets) + target_link_libraries(hiop_tpl INTERFACE ReSolve EVLOSER) + install(TARGETS ReSolve EVLOSER EXPORT hiop-targets) endif() target_link_libraries(hiopLinAlg PRIVATE hiop_tpl) diff --git a/src/LinAlg/EVLOSER/CMakeLists.txt b/src/LinAlg/EVLOSER/CMakeLists.txt new file mode 100644 index 0000000..aecb530 --- /dev/null +++ b/src/LinAlg/EVLOSER/CMakeLists.txt @@ -0,0 +1,22 @@ +# Build EVLOSER embedded backend library + +set(EVLOSER_SRC + RefactorizationSolver.cpp + MatrixCsr.cpp + IterativeRefinement.cpp + KrylovSolverKernels.cu +) +set(EVLOSER_HEADERS + RefactorizationSolver.hpp + MatrixCsr.hpp + IterativeRefinement.hpp +) + +set_source_files_properties(${EVLOSER_SRC} PROPERTIES LANGUAGE CUDA) + +add_library(EVLOSER STATIC ${EVLOSER_SRC}) +target_include_directories(EVLOSER INTERFACE + $ + $ +) +target_link_libraries(EVLOSER PRIVATE KLU hiop_cuda) diff --git a/src/LinAlg/EVLOSER/IterativeRefinement.cpp b/src/LinAlg/EVLOSER/IterativeRefinement.cpp new file mode 100644 index 0000000..d219e59 --- /dev/null +++ b/src/LinAlg/EVLOSER/IterativeRefinement.cpp @@ -0,0 +1,697 @@ +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file IterativeRefinement.cpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#include "IterativeRefinement.hpp" + +#include "hiop_blasdefs.hpp" +#include "KrylovSolverKernels.h" + +#include "klu.h" +#include "cusparse_v2.h" +#include +#include +#include +#include + +#define checkCudaErrors(val) resolveCheckCudaError((val), __FILE__, __LINE__) + +namespace EVLOSER +{ + +// Default constructor +IterativeRefinement::IterativeRefinement() {} + +// Parametrized constructor +IterativeRefinement::IterativeRefinement(int restart, double tol, int maxit) + : restart_{restart}, + maxit_{maxit}, + tol_{tol} +{} + +IterativeRefinement::~IterativeRefinement() +{ + cusparseDestroySpMat(mat_A_); + // free GPU variables that belong to this class and are not shared with CUSOLVER class + cudaFree(mv_buffer_); + cudaFree(d_V_); + cudaFree(d_Z_); + cudaFree(d_rvGPU_); + cudaFree(d_Hcolumn_); + + if(orth_option_ == "cgs2") { + cudaFree(d_H_col_); + } + // delete all CPU GMRES variables + delete[] h_H_; + + if(orth_option_ == "mgs_two_synch" || orth_option_ == "mgs_pm") { + delete[] h_L_; + delete[] h_rv_; + } + delete[] h_c_; + delete[] h_s_; + delete[] h_rs_; + + if(orth_option_ == "mgs_pm" || orth_option_ == "cgs2") { + delete[] h_aux_; + } +} + +int IterativeRefinement::setup_system_matrix(int n, int nnz, int* dia, int* dja, double* da) +{ + dia_ = dia; + dja_ = dja; + da_ = da; + n_ = n; + nnz_ = nnz; + checkCudaErrors(cusparseCreateCsr(&mat_A_, + n, + n, + nnz, + dia_, + dja_, + da_, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, + CUDA_R_64F)); + return 0; +} + +int IterativeRefinement::setup(cusparseHandle_t cusparse_handle, + cublasHandle_t cublas_handle, + cusolverRfHandle_t cusolverrf_handle, + int n, + double* d_T, + int* d_P, + int* d_Q, + double* devx, + double* devr) +{ + cusparse_handle_ = cusparse_handle; + cublas_handle_ = cublas_handle; + cusolverrf_handle_ = cusolverrf_handle; + assert(n_ == n && "Size of the linear system incorrectly set in the iterative refinement class!\n"); + + // only set pointers + d_T_ = d_T; + d_P_ = d_P; + d_Q_ = d_Q; + + // setup matvec + + cusparseCreateDnVec(&vec_x_, n_, devx, CUDA_R_64F); + cusparseCreateDnVec(&vec_Ax_, n_, devr, CUDA_R_64F); + size_t buffer_size; + checkCudaErrors(cusparseSpMV_bufferSize(cusparse_handle_, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &(minusone_), + mat_A_, + vec_x_, + &(one_), + vec_Ax_, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + &buffer_size)); + + cudaDeviceSynchronize(); + checkCudaErrors(cudaMalloc(&mv_buffer_, buffer_size)); + + // allocate space for the GPU + + checkCudaErrors(cudaMalloc(&(d_V_), n_ * (restart_ + 1) * sizeof(double))); + checkCudaErrors(cudaMalloc(&(d_Z_), n_ * (restart_ + 1) * sizeof(double))); + checkCudaErrors(cudaMalloc(&(d_rvGPU_), 2 * (restart_ + 1) * sizeof(double))); + checkCudaErrors(cudaMalloc(&(d_Hcolumn_), 2 * (restart_ + 1) * (restart_ + 1) * sizeof(double))); + + // and for the CPU + + h_H_ = new double[restart_ * (restart_ + 1)]; + h_c_ = new double[restart_]; // needed for givens + h_s_ = new double[restart_]; // same + h_rs_ = new double[restart_ + 1]; // for residual norm history + + // for specific orthogonalization options, need a little more memory + if(orth_option_ == "mgs_two_synch" || orth_option_ == "mgs_pm") { + h_L_ = new double[restart_ * (restart_ + 1)]; + h_rv_ = new double[restart_ + 1]; + } + + if(orth_option_ == "cgs2") { + h_aux_ = new double[restart_ + 1]; + checkCudaErrors(cudaMalloc(&(d_H_col_), (restart_ + 1) * sizeof(double))); + } + + if(orth_option_ == "mgs_pm") { + h_aux_ = new double[restart_ + 1]; + } + return 0; +} + +double IterativeRefinement::getFinalResidalNorm() { return final_residual_norm_; } + +double IterativeRefinement::getInitialResidalNorm() { return initial_residual_norm_; } + +double IterativeRefinement::getBNorm() { return bnorm_; } + +int IterativeRefinement::getFinalNumberOfIterations() { return fgmres_iters_; } + +double IterativeRefinement::matrixAInfNrm() +{ + double nrm; + evloser_matrix_row_sums(n_, nnz_, dia_, da_, d_Z_); + cusolverSpDnrminf(cusolver_handle_, n_, d_Z_, &nrm, mv_buffer_ /* at least 8192 bytes */); + return nrm; +} + +double IterativeRefinement::vectorInfNrm(int n, double* d_v) +{ + double nrm; + + cusolverSpDnrminf(cusolver_handle_, n, d_v, &nrm, mv_buffer_ /* at least 8192 bytes */); + return nrm; +} + +void IterativeRefinement::fgmres(double* d_x, double* d_b) +{ + int outer_flag = 1; + int notconv = 1; + int i = 0; + int it = 0; + int j; + int k; + int k1; + + double t; + double rnorm; + double bnorm; + // double rnorm_aux; + double tolrel; + // V[0] = b-A*x_0 + cudaMemcpy(&(d_V_[0]), d_b, sizeof(double) * n_, cudaMemcpyDeviceToDevice); + + cudaMatvec(d_x, d_V_, "residual"); + + rnorm = 0.0; + cublasDdot(cublas_handle_, n_, d_b, 1, d_b, 1, &bnorm); + cublasDdot(cublas_handle_, n_, d_V_, 1, d_V_, 1, &rnorm); + // rnorm = ||V_1|| + rnorm = sqrt(rnorm); + bnorm = sqrt(bnorm); + bnorm_ = bnorm; + while(outer_flag) { + // check if maybe residual is already small enough? + if(it == 0) { + tolrel = tol_ * rnorm; + if(fabs(tolrel) < 1e-16) { + tolrel = 1e-16; + } + } + int exit_cond = 0; + if(conv_cond() == 0) { + exit_cond = ((fabs(rnorm - ZERO) <= EPSILON)); + } else { + if(conv_cond() == 1) { + exit_cond = ((fabs(rnorm - ZERO) <= EPSILON) || (rnorm < tol_)); + } else { + if(conv_cond() == 2) { + exit_cond = ((fabs(rnorm - ZERO) <= EPSILON) || (rnorm < (tol_ * bnorm))); + } + } + } + if(exit_cond) { + outer_flag = 0; + final_residual_norm_ = rnorm; + initial_residual_norm_ = rnorm; + fgmres_iters_ = 0; + break; + } + + // normalize first vector + t = 1.0 / rnorm; + cublasDscal(cublas_handle_, n_, &t, d_V_, 1); + + // initialize norm history + + h_rs_[0] = rnorm; + initial_residual_norm_ = rnorm; + i = -1; + notconv = 1; + + while((notconv) && (it < maxit_)) { + i++; + it++; + // Z_i = (LU)^{-1}*V_i + cudaMemcpy(&d_Z_[i * n_], &d_V_[i * n_], sizeof(double) * n_, cudaMemcpyDeviceToDevice); + checkCudaErrors(cusolverRfSolve(cusolverrf_handle_, d_P_, d_Q_, 1, d_T_, n_, &d_Z_[i * n_], n_)); + cudaDeviceSynchronize(); + // V_{i+1}=A*Z_i + cudaMatvec(&d_Z_[i * n_], &d_V_[(i + 1) * n_], "matvec"); + // orthogonalize V[i+1], form a column of h_L + GramSchmidt(i); + + if(i != 0) { + for(int k = 1; k <= i; k++) { + k1 = k - 1; + t = h_H_[i * (restart_ + 1) + k1]; + h_H_[i * (restart_ + 1) + k1] = h_c_[k1] * t + h_s_[k1] * h_H_[i * (restart_ + 1) + k]; + h_H_[i * (restart_ + 1) + k] = -h_s_[k1] * t + h_c_[k1] * h_H_[i * (restart_ + 1) + k]; + } + } // if i!=0 + + double Hii = h_H_[i * (restart_ + 1) + i]; + double Hii1 = h_H_[(i) * (restart_ + 1) + i + 1]; + double gam = sqrt(Hii * Hii + Hii1 * Hii1); + + if(fabs(gam - ZERO) <= EPSILON) { + gam = EPSMAC; + } + + /* next Given's rotation */ + h_c_[i] = Hii / gam; + h_s_[i] = Hii1 / gam; + h_rs_[i + 1] = -h_s_[i] * h_rs_[i]; + h_rs_[i] = h_c_[i] * h_rs_[i]; + + h_H_[(i) * (restart_ + 1) + (i)] = h_c_[i] * Hii + h_s_[i] * Hii1; + h_H_[(i) * (restart_ + 1) + (i + 1)] = h_c_[i] * Hii1 - h_s_[i] * Hii; + + // residual norm estimate + rnorm = fabs(h_rs_[i + 1]); + // check convergence + if(i + 1 >= restart_ || rnorm <= tolrel || it >= maxit_) { + notconv = 0; + } + } // inner while + + // solve tri system + h_rs_[i] = h_rs_[i] / h_H_[i * (restart_ + 1) + i]; + for(int ii = 2; ii <= i + 1; ii++) { + k = i - ii + 1; + k1 = k + 1; + t = h_rs_[k]; + for(j = k1; j <= i; j++) { + t -= h_H_[j * (restart_ + 1) + k] * h_rs_[j]; + } + h_rs_[k] = t / h_H_[k * (restart_ + 1) + k]; + } + + // get solution + for(j = 0; j <= i; j++) { + cublasDaxpy(cublas_handle_, n_, &h_rs_[j], &d_Z_[j * n_], 1, d_x, 1); + } + + /* test solution */ + + if(rnorm <= tolrel || it >= maxit_) { + // rnorm_aux = rnorm; + outer_flag = 0; + } + + cudaMemcpy(&d_V_[0], d_b, sizeof(double) * n_, cudaMemcpyDeviceToDevice); + cudaMatvec(d_x, d_V_, "residual"); + + rnorm = 0.0; + cublasDdot(cublas_handle_, n_, d_V_, 1, d_V_, 1, &rnorm); + // rnorm = ||V_1|| + rnorm = sqrt(rnorm); + + if(!outer_flag) { + final_residual_norm_ = rnorm; + fgmres_iters_ = it; + } + } // outer while +} + +// b-Ax +void IterativeRefinement::cudaMatvec(double* d_x, double* d_b, std::string option) +{ + cusparseCreateDnVec(&vec_x_, n_, d_x, CUDA_R_64F); + cusparseCreateDnVec(&vec_Ax_, n_, d_b, CUDA_R_64F); + if(option == "residual") { + // b = b-Ax + cusparseSpMV(cusparse_handle_, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &minusone_, + mat_A_, + vec_x_, + &one_, + vec_Ax_, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + mv_buffer_); + } else { + // just b = A*x + cusparseSpMV(cusparse_handle_, + CUSPARSE_OPERATION_NON_TRANSPOSE, + &one_, + mat_A_, + vec_x_, + &zero_, + vec_Ax_, + CUDA_R_64F, + CUSPARSE_SPMV_CSR_ALG2, + mv_buffer_); + } + cusparseDestroyDnVec(vec_x_); + cusparseDestroyDnVec(vec_Ax_); +} + +void IterativeRefinement::GramSchmidt(int i) +{ + double t; + const double one = 1.0; + const double minusone = -1.0; + const double zero = 0.0; + double s; + int sw = 0; + if(orth_option_ == "mgs") { + sw = 0; + } else { + if(orth_option_ == "cgs2") { + sw = 1; + } else { + if(orth_option_ == "mgs_two_synch") { + sw = 2; + } else { + if(orth_option_ == "mgs_pm") { + sw = 3; + } else { + // display error message and set sw = 0; + /* + nlp_->log->printf(hovWarning, + "Wrong Gram-Schmidt option. Setting default (modified Gram-Schmidt, mgs) ...\n"); + */ + sw = 0; + } + } + } + } + + switch(sw) { + case 0: // mgs + + for(int j = 0; j <= i; ++j) { + t = 0.0; + cublasDdot(cublas_handle_, n_, &d_V_[j * n_], 1, &d_V_[(i + 1) * n_], 1, &t); + + h_H_[i * (restart_ + 1) + j] = t; + t *= -1.0; + + cublasDaxpy(cublas_handle_, n_, &t, &d_V_[j * n_], 1, &d_V_[(i + 1) * n_], 1); + } + t = 0.0; + cublasDdot(cublas_handle_, n_, &d_V_[(i + 1) * n_], 1, &d_V_[(i + 1) * n_], 1, &t); + + // set the last entry in Hessenberg matrix + t = sqrt(t); + h_H_[(i) * (restart_ + 1) + i + 1] = t; + if(t != 0.0) { + t = 1.0 / t; + cublasDscal(cublas_handle_, n_, &t, &d_V_[(i + 1) * n_], 1); + } else { + assert(0 && "Iterative refinement failed, Krylov vector with zero norm\n"); + } + break; + + case 1: // cgs2 + // Hcol = V(:,1:i)^T *V(:,i+1); + cublasDgemv(cublas_handle_, CUBLAS_OP_T, n_, i + 1, &one_, d_V_, n_, &d_V_[(i + 1) * n_], 1, &zero_, d_H_col_, 1); + // V(:,i+1) = V(:, i+1) - V(:,1:i)*Hcol + cublasDgemv(cublas_handle_, CUBLAS_OP_N, n_, i + 1, &minusone_, d_V_, n_, d_H_col_, 1, &one_, &d_V_[n_ * (i + 1)], 1); + // copy H_col to aux, we will need it later + + cudaMemcpy(h_aux_, d_H_col_, sizeof(double) * (i + 1), cudaMemcpyDeviceToHost); + + // Hcol = V(:,1:i)*V(:,i+1); + cublasDgemv(cublas_handle_, CUBLAS_OP_T, n_, i + 1, &one_, d_V_, n_, &d_V_[(i + 1) * n_], 1, &zero_, d_H_col_, 1); + // V(:,i+1) = V(:, i+1) - V(:,1:i)*Hcol + + cublasDgemv(cublas_handle_, CUBLAS_OP_N, n_, i + 1, &minusone_, d_V_, n_, d_H_col_, 1, &one_, &d_V_[n_ * (i + 1)], 1); + // copy H_col to H + + cudaMemcpy(&h_H_[i * (restart_ + 1)], d_H_col_, sizeof(double) * (i + 1), cudaMemcpyDeviceToHost); + // add both pieces together (unstable otherwise, careful here!!) + for(int j = 0; j <= i; ++j) { + h_H_[i * (restart_ + 1) + j] += h_aux_[j]; + } + t = 0.0; + cublasDdot(cublas_handle_, n_, &d_V_[(i + 1) * n_], 1, &d_V_[(i + 1) * n_], 1, &t); + + // set the last entry in Hessenberg matrix + t = sqrt(t); + h_H_[(i) * (restart_ + 1) + i + 1] = t; + if(t != 0.0) { + t = 1.0 / t; + cublasDscal(cublas_handle_, n_, &t, &d_V_[(i + 1) * n_], 1); + } else { + assert(0 && "Iterative refinement failed, Krylov vector with zero norm\n"); + } + break; + // the two low synch schemes + case 2: + // KS: the kernels are limited by the size of the shared memory on the GPU. If too many vectors in Krylov space, use + // standard cublas routines. V[1:i]^T[V[i] w] + if(i < 200) { + evloser_mass_inner_product_two_vectors(n_, i, &d_V_[i * n_], &d_V_[(i + 1) * n_], d_V_, d_rvGPU_); + } else { + cublasDgemm(cublas_handle_, + CUBLAS_OP_T, + CUBLAS_OP_N, + i + 1, // m + 2, // n + n_, // k + &one, // alpha + d_V_, // A + n_, // lda + &d_V_[i * n_], // B + n_, // ldb + &zero, + d_rvGPU_, // c + i + 1); // ldc + } + // copy rvGPU to L + cudaMemcpy(&h_L_[(i) * (restart_ + 1)], d_rvGPU_, (i + 1) * sizeof(double), cudaMemcpyDeviceToHost); + + cudaMemcpy(h_rv_, &d_rvGPU_[i + 1], (i + 1) * sizeof(double), cudaMemcpyDeviceToHost); + + for(int j = 0; j <= i; ++j) { + h_H_[(i) * (restart_ + 1) + j] = 0.0; + } + // triangular solve + for(int j = 0; j <= i; ++j) { + h_H_[(i) * (restart_ + 1) + j] = h_rv_[j]; + s = 0.0; + for(int k = 0; k < j; ++k) { + s += h_L_[j * (restart_ + 1) + k] * h_H_[(i) * (restart_ + 1) + k]; + } // for k + h_H_[(i) * (restart_ + 1) + j] -= s; + } // for j + + cudaMemcpy(d_Hcolumn_, &h_H_[(i) * (restart_ + 1)], (i + 1) * sizeof(double), cudaMemcpyHostToDevice); + // again, use std cublas functions if Krylov space is too large + if(i < 200) { + evloser_mass_axpy(n_, i, d_V_, &d_V_[(i + 1) * n_], d_Hcolumn_); + } else { + cublasDgemm(cublas_handle_, + CUBLAS_OP_N, + CUBLAS_OP_N, + n_, // m + 1, // n + i + 1, // k + &minusone, // alpha + d_V_, // A + n_, // lda + d_Hcolumn_, // B + i + 1, // ldb + &one, + &d_V_[(i + 1) * n_], // c + n_); // ldc + } + // normalize (second synch) + t = 0.0; + cublasDdot(cublas_handle_, n_, &d_V_[(i + 1) * n_], 1, &d_V_[(i + 1) * n_], 1, &t); + + // set the last entry in Hessenberg matrix + t = sqrt(t); + h_H_[(i) * (restart_ + 1) + i + 1] = t; + if(t != 0.0) { + t = 1.0 / t; + cublasDscal(cublas_handle_, n_, &t, &d_V_[(i + 1) * n_], 1); + } else { + assert(0 && "Iterative refinement failed, Krylov vector with zero norm\n"); + } + break; + + case 3: // two synch Gauss-Seidel mgs, SUPER STABLE + // according to unpublisjed work by ST + // L is where we keep the triangular matrix(L is ON THE CPU) + // if Krylov space is too large, use std cublas (because out of shared mmory) + if(i < 200) { + evloser_mass_inner_product_two_vectors(n_, i, &d_V_[i * n_], &d_V_[(i + 1) * n_], d_V_, d_rvGPU_); + } else { + cublasDgemm(cublas_handle_, + CUBLAS_OP_T, + CUBLAS_OP_N, + i + 1, // m + 2, // n + n_, // k + &one, // alpha + d_V_, // A + n_, // lda + &d_V_[i * n_], // B + n_, // ldb + &zero, + d_rvGPU_, // c + i + 1); // ldc + } + // copy rvGPU to L + cudaMemcpy(&h_L_[(i) * (restart_ + 1)], d_rvGPU_, (i + 1) * sizeof(double), cudaMemcpyDeviceToHost); + + cudaMemcpy(h_rv_, &d_rvGPU_[i + 1], (i + 1) * sizeof(double), cudaMemcpyDeviceToHost); + + for(int j = 0; j <= i; ++j) { + h_H_[(i) * (restart_ + 1) + j] = 0.0; + } + // triangular solve + for(int j = 0; j <= i; ++j) { + h_H_[(i) * (restart_ + 1) + j] = h_rv_[j]; + s = 0.0; + for(int k = 0; k < j; ++k) { + s += h_L_[j * (restart_ + 1) + k] * h_H_[(i) * (restart_ + 1) + k]; + } // for k + h_H_[(i) * (restart_ + 1) + j] -= s; + } // for j + + // now compute h_rv = L^T h_H + double h; + for(int j = 0; j <= i; ++j) { + // go through COLUMN OF L + h_rv_[j] = 0.0; + for(int k = j + 1; k <= i; ++k) { + h = h_L_[k * (restart_ + 1) + j]; + h_rv_[j] += h_H_[(i) * (restart_ + 1) + k] * h; + } + } + + // and do one more tri solve with L^T: h_aux = (I-L)^{-1}h_rv + for(int j = 0; j <= i; ++j) { + h_aux_[j] = h_rv_[j]; + s = 0.0; + for(int k = 0; k < j; ++k) { + s += h_L_[j * (restart_ + 1) + k] * h_aux_[k]; + } // for k + h_aux_[j] -= s; + } // for j + + // and now subtract that from h_H + for(int j = 0; j <= i; ++j) { + h_H_[(i) * (restart_ + 1) + j] -= h_aux_[j]; + } + cudaMemcpy(d_Hcolumn_, &h_H_[(i) * (restart_ + 1)], (i + 1) * sizeof(double), cudaMemcpyHostToDevice); + // if Krylov space too large, use std cublas routines + if(i < 200) { + evloser_mass_axpy(n_, i, d_V_, &d_V_[(i + 1) * n_], d_Hcolumn_); + } else { + cublasDgemm(cublas_handle_, + CUBLAS_OP_N, + CUBLAS_OP_N, + n_, // m + 1, // n + i + 1, // k + &minusone, // alpha + d_V_, // A + n_, // lda + d_Hcolumn_, // B + i + 1, // ldb + &one, + &d_V_[(i + 1) * n_], // c + n_); // ldc + } + // normalize (second synch) + t = 0.0; + cublasDdot(cublas_handle_, n_, &d_V_[(i + 1) * n_], 1, &d_V_[(i + 1) * n_], 1, &t); + + // set the last entry in Hessenberg matrix + t = sqrt(t); + h_H_[(i) * (restart_ + 1) + i + 1] = t; + if(t != 0.0) { + t = 1.0 / t; + cublasDscal(cublas_handle_, n_, &t, &d_V_[(i + 1) * n_], 1); + } else { + assert(0 && "Iterative refinement failed, Krylov vector with zero norm\n"); + } + break; + + default: + assert(0 && "Iterative refinement failed, wrong orthogonalization.\n"); + break; + } // switch +} // GramSchmidt + +// Error checking utility for CUDA +// KS: might later become part of src/Utils, putting it here for now +template +void IterativeRefinement::resolveCheckCudaError(T result, const char* const file, int const line) +{ +#ifdef DEBUG + if(result) { + fprintf(stdout, "CUDA error at %s:%d, error# %d\n", file, line, result); + assert(false); + } +#endif +} + +} // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/IterativeRefinement.hpp b/src/LinAlg/EVLOSER/IterativeRefinement.hpp new file mode 100644 index 0000000..b7c1767 --- /dev/null +++ b/src/LinAlg/EVLOSER/IterativeRefinement.hpp @@ -0,0 +1,176 @@ +/** + * @file IterativeRefinement.hpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#pragma once + +#include "klu.h" +#include "resolve_cusolver_defs.hpp" +#include + +namespace EVLOSER +{ + +constexpr double ZERO = 0.0; +constexpr double EPSILON = 1.0e-18; +constexpr double EPSMAC = 1.0e-16; + +/** + * @brief Iterative refinement class + * + */ +class IterativeRefinement +{ +public: + IterativeRefinement(); + IterativeRefinement(int restart, double tol, int maxit); + ~IterativeRefinement(); + int setup(cusparseHandle_t cusparse_handle, + cublasHandle_t cublas_handle, + cusolverRfHandle_t cusolverrf_handle, + int n, + double* d_T, + int* d_P, + int* d_Q, + double* devx, + double* devr); + + int getFinalNumberOfIterations(); + double getFinalResidalNorm(); + double getInitialResidalNorm(); + double getBNorm(); + // this is public on purpose, can be used internally or outside, to compute the residual. + void fgmres(double* d_x, double* d_b); + void set_tol(double tol) { tol_ = tol; } ///< Set tolerance for the Krylov solver + + /** + * @brief Set the up system matrix object mat_A_ of type cusparseSpMatDescr_t + * + * @param n - size of the matrix + * @param nnz - number of nonzeros in the matrix + * @param irow - array of row pointers + * @param jcol - array of column indices + * @param val - array of sparse matrix values + * + * @return int + * + * @pre Arrays `irow`, `jcol` and `val` are on the device. + */ + int setup_system_matrix(int n, int nnz, int* irow, int* jcol, double* val); + + // Simple accessors + int& maxit() { return maxit_; } + + double& tol() { return tol_; } + + std::string& orth_option() { return orth_option_; } + + int& restart() { return restart_; } + + int& conv_cond() { return conv_cond_; } + +private: + // Krylov vectors + double* d_V_{nullptr}; + double* d_Z_{nullptr}; + + double final_residual_norm_; + double initial_residual_norm_; + double bnorm_; + int fgmres_iters_; + + // Solver parameters + int restart_; + int maxit_; + double tol_; + int conv_cond_; ///< convergence condition, can be 0, 1, 2 for IR + std::string orth_option_; + + // System matrix data + int n_; + int nnz_; + int* dia_{nullptr}; + int* dja_{nullptr}; + double* da_{nullptr}; + cusparseSpMatDescr_t mat_A_{nullptr}; + + // Matrix-vector product data + cusparseDnVecDescr_t vec_x_{nullptr}; + cusparseDnVecDescr_t vec_Ax_{nullptr}; + + // CUDA libraries handles - MUST BE SET AT INIT + cusparseHandle_t cusparse_handle_{nullptr}; + cublasHandle_t cublas_handle_{nullptr}; + cusolverRfHandle_t cusolverrf_handle_{nullptr}; + cusolverSpHandle_t cusolver_handle_{nullptr}; + + // GPU data (?) + double* d_T_{nullptr}; + int* d_P_{nullptr}; + int* d_Q_{nullptr}; + + double* d_rvGPU_{nullptr}; + double* d_Hcolumn_{nullptr}; + double* d_H_col_{nullptr}; + void* mv_buffer_{nullptr}; ///< SpMV buffer + + // CPU: + double* h_L_{nullptr}; + double* h_H_{nullptr}; + double* h_rv_{nullptr}; + // for givens rotations + double* h_c_{nullptr}; + double* h_s_{nullptr}; + // for Hessenberg system + double* h_rs_{nullptr}; + // neded in some of the orthogonalization methods + double* h_aux_{nullptr}; + + // TODO: Something needs to be done with this :) + const double minusone_ = -1.0; + const double one_ = 1.0; + const double zero_ = 0.0; + + /** + * @brief orthogonalize i+1 vector against i vectors already orthogonal + * + * Private function needed for FGMRES. + * + * @param[in] i - number of orthogonal vectors + */ + void GramSchmidt(int i); + + /** + * @brief matvec black-box: b = b - A*d_x if option is "residual" and b=A*x + * if option is "matvec" + * + * @param d_x + * @param d_b + * @param option + * + * @todo Document d_x and d_b; are both of them modified in this function? + */ + void cudaMatvec(double* d_x, double* d_b, std::string option); + + // KS: needed for testing -- condider delating later + double matrixAInfNrm(); + double vectorInfNrm(int n, double* d_v); + // end of testing + + /** + * @brief Check for CUDA errors. + * + * @tparam T - type of the result + * @param result - result value + * @param file - file name where the error occured + * @param line - line at which the error occured + */ + template + void resolveCheckCudaError(T result, const char* const file, int const line); +}; + +} // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/KrylovSolverKernels.cu b/src/LinAlg/EVLOSER/KrylovSolverKernels.cu new file mode 100644 index 0000000..6ad5ee1 --- /dev/null +++ b/src/LinAlg/EVLOSER/KrylovSolverKernels.cu @@ -0,0 +1,215 @@ +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file KrylovSolverKernels.cu + * + * @author Kasia Swirydowicz , PNNL + */ +#include "KrylovSolverKernels.h" +#define maxk 1024 +#define Tv5 1024 +//computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1 +__global__ void evloser_MassIPTwoVec_kernel(const double* __restrict__ u1, + const double* __restrict__ u2, + const double* __restrict__ v, + double* result, + const int k, + const int N) +{ + int t = threadIdx.x; + int bsize = blockDim.x; + + // assume T threads per thread block (and k reductions to be performed) + volatile __shared__ double s_tmp1[Tv5]; + + volatile __shared__ double s_tmp2[Tv5]; + // map between thread index space and the problem index space + int j = blockIdx.x; + s_tmp1[t] = 0.0f; + s_tmp2[t] = 0.0f; + int nn = t; + double can1, can2, cbn; + + while(nn < N) { + can1 = u1[nn]; + can2 = u2[nn]; + + cbn = v[N * j + nn]; + s_tmp1[t] += can1 * cbn; + s_tmp2[t] += can2 * cbn; + + nn += bsize; + } + + __syncthreads(); + + if(Tv5 >= 1024) { + if(t < 512) { + s_tmp1[t] += s_tmp1[t + 512]; + s_tmp2[t] += s_tmp2[t + 512]; + } + __syncthreads(); + } + if(Tv5 >= 512) { + if(t < 256) { + s_tmp1[t] += s_tmp1[t + 256]; + s_tmp2[t] += s_tmp2[t + 256]; + } + __syncthreads(); + } + { + if(t < 128) { + s_tmp1[t] += s_tmp1[t + 128]; + s_tmp2[t] += s_tmp2[t + 128]; + } + __syncthreads(); + } + { + if(t < 64) { + s_tmp1[t] += s_tmp1[t + 64]; + s_tmp2[t] += s_tmp2[t + 64]; + } + __syncthreads(); + } + + if(t < 32) { + s_tmp1[t] += s_tmp1[t + 32]; + s_tmp2[t] += s_tmp2[t + 32]; + + s_tmp1[t] += s_tmp1[t + 16]; + s_tmp2[t] += s_tmp2[t + 16]; + + s_tmp1[t] += s_tmp1[t + 8]; + s_tmp2[t] += s_tmp2[t + 8]; + + s_tmp1[t] += s_tmp1[t + 4]; + s_tmp2[t] += s_tmp2[t + 4]; + + s_tmp1[t] += s_tmp1[t + 2]; + s_tmp2[t] += s_tmp2[t + 2]; + + s_tmp1[t] += s_tmp1[t + 1]; + s_tmp2[t] += s_tmp2[t + 1]; + } + if(t == 0) { + result[blockIdx.x] = s_tmp1[0]; + result[blockIdx.x + k] = s_tmp2[0]; + } +} + + +//mass AXPY i.e y = y - x*alpha where alpha is [k x 1], needed in 1 and 2 synch GMRES + +__global__ void evloser_massAxpy3_kernel(int N, + int k, + const double* x_data, + double* y_data, + const double* alpha) { + + unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned int t = threadIdx.x; + __shared__ double s_alpha[maxk]; + if(t < k) { + s_alpha[t] = alpha[t]; + } + __syncthreads(); + + if(i < N) { + double temp = 0.0f; + for(int j = 0; j < k; ++j) { + temp += x_data[j * N + i] * s_alpha[j]; + } + y_data[i] -= temp; + } +} + +__global__ void evloser_matrixInfNormPart1(const int n, + const int nnz, + const int* a_ia, + const double* a_val, + double* result) { + + // one thread per row, pass through rows + // and sum + // can be done through atomics + //\sum_{j=1}^m abs(a_{ij}) + + int idx = blockIdx.x*blockDim.x + threadIdx.x; + while (idx < n){ + double sum = 0.0f; + for (int i = a_ia[idx]; i < a_ia[idx+1]; ++i) { + sum = sum + fabs(a_val[i]); + } + result[idx] = sum; + idx += (blockDim.x*gridDim.x); + } +} + + +void evloser_mass_inner_product_two_vectors(int n, + int i, + double* vec1, + double* vec2, + double* mvec, + double* result) +{ + evloser_MassIPTwoVec_kernel<<>>(vec1, vec2, mvec, result, i + 1, n); +} +void evloser_mass_axpy(int n, int i, double* x, double* y, double* alpha) +{ + evloser_massAxpy3_kernel<<<(n + 384 - 1) / 384, 384>>>(n, i + 1, x, y, alpha); +} + +void evloser_matrix_row_sums(int n, + int nnz, + int* a_ia, + double* a_val, + double* result) +{ + evloser_matrixInfNormPart1<<<1000,1024>>>(n, nnz, a_ia, a_val, result); +} diff --git a/src/LinAlg/EVLOSER/KrylovSolverKernels.h b/src/LinAlg/EVLOSER/KrylovSolverKernels.h new file mode 100644 index 0000000..2f46000 --- /dev/null +++ b/src/LinAlg/EVLOSER/KrylovSolverKernels.h @@ -0,0 +1,70 @@ +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file src/LinAlg/KrylovSolverKernels.h + * + * @author Kasia Swirydowicz , PNNL + * + */ + + +void evloser_mass_inner_product_two_vectors(int n, + int i, + double* vec1, + double* vec2, + double* mvec, + double* result); +void evloser_mass_axpy(int n, int i, double* x, double* y, double* alpha); + +//needed for matrix inf nrm +void evloser_matrix_row_sums(int n, + int nnz, + int* a_ia, + double* a_val, + double* result); + diff --git a/src/LinAlg/EVLOSER/MatrixCsr.cpp b/src/LinAlg/EVLOSER/MatrixCsr.cpp new file mode 100644 index 0000000..3220bec --- /dev/null +++ b/src/LinAlg/EVLOSER/MatrixCsr.cpp @@ -0,0 +1,143 @@ +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file MatrixCsr.cpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#include "hiop_blasdefs.hpp" +#include "MatrixCsr.hpp" + +#include "cusparse_v2.h" +#include +#include +#include +#include +#include + +#define checkCudaErrors(val) resolveCheckCudaError((val), __FILE__, __LINE__) + +namespace EVLOSER +{ + +MatrixCsr::MatrixCsr() {} + +MatrixCsr::~MatrixCsr() +{ + if(n_ == 0) return; + + clear_data(); +} + +void MatrixCsr::allocate_size(int n) +{ + n_ = n; + checkCudaErrors(cudaMalloc(&irows_, (n_ + 1) * sizeof(int))); + irows_host_ = new int[n_ + 1]{0}; +} + +void MatrixCsr::allocate_nnz(int nnz) +{ + nnz_ = nnz; + checkCudaErrors(cudaMalloc(&jcols_, nnz_ * sizeof(int))); + checkCudaErrors(cudaMalloc(&vals_, nnz_ * sizeof(double))); + jcols_host_ = new int[nnz_]{0}; + vals_host_ = new double[nnz_]{0}; +} + +void MatrixCsr::clear_data() +{ + checkCudaErrors(cudaFree(irows_)); + checkCudaErrors(cudaFree(jcols_)); + checkCudaErrors(cudaFree(vals_)); + + irows_ = nullptr; + jcols_ = nullptr; + vals_ = nullptr; + + delete[] irows_host_; + delete[] jcols_host_; + delete[] vals_host_; + + irows_host_ = nullptr; + jcols_host_ = nullptr; + vals_host_ = nullptr; + + n_ = 0; + nnz_ = 0; +} + +void MatrixCsr::update_from_host_mirror() +{ + checkCudaErrors(cudaMemcpy(irows_, irows_host_, sizeof(int) * (n_ + 1), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(jcols_, jcols_host_, sizeof(int) * nnz_, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(vals_, vals_host_, sizeof(double) * nnz_, cudaMemcpyHostToDevice)); +} + +void MatrixCsr::copy_to_host_mirror() +{ + checkCudaErrors(cudaMemcpy(irows_host_, irows_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(jcols_host_, jcols_, sizeof(int) * nnz_, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(vals_host_, vals_, sizeof(double) * nnz_, cudaMemcpyDeviceToHost)); +} + +// Error checking utility for CUDA +// KS: might later become part of src/Utils, putting it here for now +template +void MatrixCsr::resolveCheckCudaError(T result, const char* const file, int const line) +{ + if(result) { + std::cout << "CUDA error at " << file << ":" << line << " error# " << result << "\n"; + assert(false); + } +} + +} // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/MatrixCsr.hpp b/src/LinAlg/EVLOSER/MatrixCsr.hpp new file mode 100644 index 0000000..f34f40e --- /dev/null +++ b/src/LinAlg/EVLOSER/MatrixCsr.hpp @@ -0,0 +1,56 @@ +#pragma once + +namespace EVLOSER +{ + +class MatrixCsr +{ +public: + MatrixCsr(); + ~MatrixCsr(); + void allocate_size(int n); + void allocate_nnz(int nnz); + void clear_data(); + + int* get_irows() { return irows_; } + + const int* get_irows() const { return irows_; } + + int* get_jcols() { return jcols_; } + + double* get_vals() { return vals_; } + + int* get_irows_host() { return irows_host_; } + + int* get_jcols_host() { return jcols_host_; } + + double* get_vals_host() { return vals_host_; } + + void update_from_host_mirror(); + void copy_to_host_mirror(); + +private: + int n_{0}; + int nnz_{0}; + + int* irows_{nullptr}; + int* jcols_{nullptr}; + double* vals_{nullptr}; + + int* irows_host_{nullptr}; + int* jcols_host_{nullptr}; + double* vals_host_{nullptr}; + + /** + * @brief Check for CUDA errors. + * + * @tparam T - type of the result + * @param result - result value + * @param file - file name where the error occured + * @param line - line at which the error occured + */ + template + void resolveCheckCudaError(T result, const char* const file, int const line); +}; + +} // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp new file mode 100644 index 0000000..5b0b13f --- /dev/null +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -0,0 +1,764 @@ +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file RefactorizationSolver.cpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#include "MatrixCsr.hpp" +#include "IterativeRefinement.hpp" +#include "RefactorizationSolver.hpp" + +#include "klu.h" +#include "cusparse_v2.h" +#include +#include +#include +#include + +#define checkCudaErrors(val) resolveCheckCudaError((val), __FILE__, __LINE__) + +namespace EVLOSER +{ + +RefactorizationSolver::RefactorizationSolver(int n) + : n_(n) +{ + mat_A_csr_ = new MatrixCsr(); + + // handles + cusparseCreate(&handle_); + cusolverSpCreate(&handle_cusolver_); + cublasCreate(&handle_cublas_); + + // descriptors + cusparseCreateMatDescr(&descr_A_); + cusparseSetMatType(descr_A_, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descr_A_, CUSPARSE_INDEX_BASE_ZERO); + + // Allocate host mirror for the solution vector + hostx_ = new double[n_]; + + // Allocate solution and rhs vectors + checkCudaErrors(cudaMalloc(&devx_, n_ * sizeof(double))); + checkCudaErrors(cudaMalloc(&devr_, n_ * sizeof(double))); +} + +RefactorizationSolver::~RefactorizationSolver() +{ + if(iterative_refinement_enabled_) delete ir_; + delete mat_A_csr_; + + // Delete workspaces and handles + cudaFree(d_work_); + cusparseDestroy(handle_); + cusolverSpDestroy(handle_cusolver_); + cublasDestroy(handle_cublas_); + cusparseDestroyMatDescr(descr_A_); + + // Delete host mirror for the solution vector + delete[] hostx_; + + // Delete residual and solution vectors + cudaFree(devr_); + cudaFree(devx_); + + // Delete matrix descriptor used in cuSolverGLU setup + if(cusolver_glu_enabled_) { + cusparseDestroyMatDescr(descr_M_); + cusolverSpDestroyGluInfo(info_M_); + } + + if(cusolver_rf_enabled_) { + cudaFree(d_P_); + cudaFree(d_Q_); + cudaFree(d_T_); + } + + klu_free_symbolic(&Symbolic_, &Common_); + klu_free_numeric(&Numeric_, &Common_); + delete[] mia_; + delete[] mja_; +} + +void RefactorizationSolver::enable_iterative_refinement() +{ + ir_ = new IterativeRefinement(); + if(ir_ != nullptr) iterative_refinement_enabled_ = true; +} + +// TODO: Refactor to only pass mat_A_csr_ to setup_system_matrix; n and nnz can be read from mat_A_csr_ +void RefactorizationSolver::setup_iterative_refinement_matrix(int n, int nnz) +{ + ir_->setup_system_matrix(n, nnz, mat_A_csr_->get_irows(), mat_A_csr_->get_jcols(), mat_A_csr_->get_vals()); +} + +// TODO: Can this function be merged with setup_iterative_refinement_matrix ? +void RefactorizationSolver::configure_iterative_refinement(cusparseHandle_t cusparse_handle, + cublasHandle_t cublas_handle, + cusolverRfHandle_t cusolverrf_handle, + int n, + double* d_T, + int* d_P, + int* d_Q, + double* devx, + double* devr) +{ + ir_->setup(cusparse_handle, cublas_handle, cusolverrf_handle, n, d_T, d_P, d_Q, devx, devr); +} + +int RefactorizationSolver::setup_factorization() +{ + int* row_ptr = mat_A_csr_->get_irows_host(); + int* col_idx = mat_A_csr_->get_jcols_host(); + + if(fact_ == "klu") { + /* initialize KLU setup parameters, dont factorize yet */ + initializeKLU(); + + /*perform KLU but only the symbolic analysis (important) */ + klu_free_symbolic(&Symbolic_, &Common_); + klu_free_numeric(&Numeric_, &Common_); + Symbolic_ = klu_analyze(n_, row_ptr, col_idx, &Common_); + + if(Symbolic_ == nullptr) { + return -1; + } + } else { // for future + assert(0 && "Only KLU is available for the first factorization.\n"); + } + return 0; +} + +int RefactorizationSolver::factorize() +{ + Numeric_ = klu_factor(mat_A_csr_->get_irows_host(), + mat_A_csr_->get_jcols_host(), + mat_A_csr_->get_vals_host(), + Symbolic_, + &Common_); + return (Numeric_ == nullptr) ? -1 : 0; +} + +void RefactorizationSolver::setup_refactorization() +{ + if(refact_ == "glu") { + initializeCusolverGLU(); + refactorizationSetupCusolverGLU(); + } else if(refact_ == "rf") { + initializeCusolverRf(); + refactorizationSetupCusolverRf(); + if(use_ir_ == "yes") { + configure_iterative_refinement(handle_, handle_cublas_, handle_rf_, n_, d_T_, d_P_, d_Q_, devx_, devr_); + } + } else { // for future - + assert(0 && "Only glu and rf refactorizations available.\n"); + } +} + +int RefactorizationSolver::refactorize() +{ + if(refact_ == "glu") { + sp_status_ = cusolverSpDgluReset(handle_cusolver_, + n_, + /* A is original matrix */ + nnz_, + descr_A_, + mat_A_csr_->get_vals(), + mat_A_csr_->get_irows(), + mat_A_csr_->get_jcols(), + info_M_); + sp_status_ = cusolverSpDgluFactor(handle_cusolver_, info_M_, d_work_); + } else { + if(refact_ == "rf") { + sp_status_ = cusolverRfResetValues(n_, + nnz_, + mat_A_csr_->get_irows(), + mat_A_csr_->get_jcols(), + mat_A_csr_->get_vals(), + d_P_, + d_Q_, + handle_rf_); + cudaDeviceSynchronize(); + sp_status_ = cusolverRfRefactor(handle_rf_); + } + } + return 0; +} + +bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string memspace) +{ + if(refact_ == "glu") { + double* devx = nullptr; + if(memspace == "device") { + checkCudaErrors(cudaMemcpy(devr_, dx, sizeof(double) * n_, cudaMemcpyDeviceToDevice)); + devx = dx; + } else { + checkCudaErrors(cudaMemcpy(devr_, dx, sizeof(double) * n_, cudaMemcpyHostToDevice)); + devx = devx_; + } + sp_status_ = cusolverSpDgluSolve(handle_cusolver_, + n_, + /* A is original matrix */ + nnz_, + descr_A_, + mat_A_csr_->get_vals(), + mat_A_csr_->get_irows(), + mat_A_csr_->get_jcols(), + devr_, /* right hand side */ + devx, /* left hand side, local pointer */ + &ite_refine_succ_, + &r_nrminf_, + info_M_, + d_work_); + if(sp_status_ != 0 && !silent_output_) { + std::cout << "GLU solve failed with status: " << sp_status_ << "\n"; + return false; + } + if(memspace == "device") { + // do nothing + } else { + checkCudaErrors(cudaMemcpy(dx, devx_, sizeof(double) * n_, cudaMemcpyDeviceToHost)); + } + return true; + } + + if(refact_ == "rf") { + // First solve is performed on CPU + if(is_first_solve_) { + double* hostx = nullptr; + if(memspace == "device") { + checkCudaErrors(cudaMemcpy(hostx_, dx, sizeof(double) * n_, cudaMemcpyDeviceToHost)); + hostx = hostx_; + } else { + hostx = dx; + } + int ok = klu_solve(Symbolic_, Numeric_, n_, 1, hostx, &Common_); // replace dx with hostx + klu_free_numeric(&Numeric_, &Common_); + klu_free_symbolic(&Symbolic_, &Common_); + is_first_solve_ = false; + if(memspace == "device") { + checkCudaErrors(cudaMemcpy(dx, hostx, sizeof(double) * n_, cudaMemcpyHostToDevice)); + } else { + // do nothing + } + return true; + } + + double* devx = nullptr; + if(memspace == "device") { + devx = dx; + checkCudaErrors(cudaMemcpy(devr_, dx, sizeof(double) * n_, cudaMemcpyDeviceToDevice)); + } else { + checkCudaErrors(cudaMemcpy(devx_, dx, sizeof(double) * n_, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(devr_, devx_, sizeof(double) * n_, cudaMemcpyDeviceToDevice)); + devx = devx_; + } + + // Each next solve is performed on GPU + sp_status_ = cusolverRfSolve(handle_rf_, + d_P_, + d_Q_, + 1, + d_T_, + n_, + devx, // replace devx_ with local pointer devx + n_); + if(sp_status_ != 0) { + if(!silent_output_) std::cout << "Rf solve failed with status: " << sp_status_ << "\n"; + return false; + } + + if(use_ir_ == "yes") { + // Set tolerance based on barrier parameter mu + ir_->set_tol(tol); + + ir_->fgmres(devx, devr_); // replace devx_ with local pointer devx + if(!silent_output_ && (ir_->getFinalResidalNorm() > tol * ir_->getBNorm())) { + std::cout << "[Warning] Iterative refinement did not converge!\n"; + std::cout << "\t Iterative refinement tolerance " << tol << "\n"; + std::cout << "\t Relative solution error " << ir_->getFinalResidalNorm() / ir_->getBNorm() << "\n"; + std::cout << "\t fgmres: init residual norm: " << ir_->getInitialResidalNorm() << "\n" + << "\t final residual norm: " << ir_->getFinalResidalNorm() << "\n" + << "\t number of iterations: " << ir_->getFinalNumberOfIterations() << "\n"; + } + } + if(memspace == "device") { + // do nothing + } else { + checkCudaErrors(cudaMemcpy(dx, devx_, sizeof(double) * n_, cudaMemcpyDeviceToHost)); + } + return true; + } + + if(!silent_output_) { + std::cout << "Unknown refactorization " << refact_ << ", exiting\n"; + } + return false; +} + +// helper private function needed for format conversion +int RefactorizationSolver::createM(const int n, + const int /* nnzL */, + const int* Lp, + const int* Li, + const int /* nnzU */, + const int* Up, + const int* Ui) +{ + int row; + for(int i = 0; i < n; ++i) { + // go through EACH COLUMN OF L first + for(int j = Lp[i]; j < Lp[i + 1]; ++j) { + row = Li[j]; + // BUT dont count diagonal twice, important + if(row != i) { + mia_[row + 1]++; + } + } + // then each column of U + for(int j = Up[i]; j < Up[i + 1]; ++j) { + row = Ui[j]; + mia_[row + 1]++; + } + } + // then organize mia_; + mia_[0] = 0; + for(int i = 1; i < n + 1; i++) { + mia_[i] += mia_[i - 1]; + } + + std::vector Mshifts(n, 0); + for(int i = 0; i < n; ++i) { + // go through EACH COLUMN OF L first + for(int j = Lp[i]; j < Lp[i + 1]; ++j) { + row = Li[j]; + if(row != i) { + // place (row, i) where it belongs! + mja_[mia_[row] + Mshifts[row]] = i; + Mshifts[row]++; + } + } + // each column of U next + for(int j = Up[i]; j < Up[i + 1]; ++j) { + row = Ui[j]; + mja_[mia_[row] + Mshifts[row]] = i; + Mshifts[row]++; + } + } + return 0; +} + +int RefactorizationSolver::initializeKLU() +{ + klu_defaults(&Common_); + + // TODO: consider making this a part of setup options so that user can + // set up these values. For now, we keep them hard-wired. + Common_.btf = 0; + Common_.ordering = ordering_; // COLAMD=1; AMD=0 + Common_.tol = 0.1; + Common_.scale = -1; + Common_.halt_if_singular = 1; + + return 0; +} + +int RefactorizationSolver::initializeCusolverGLU() +{ + cusparseCreateMatDescr(&descr_M_); + cusparseSetMatType(descr_M_, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descr_M_, CUSPARSE_INDEX_BASE_ZERO); + + // info (data structure where factorization is stored) + // this is done in the constructor - however, this function might be called more than once + cusolverSpDestroyGluInfo(info_M_); + cusolverSpCreateGluInfo(&info_M_); + + cusolver_glu_enabled_ = true; + return 0; +} + +int RefactorizationSolver::initializeCusolverRf() +{ + cusolverRfCreate(&handle_rf_); + + checkCudaErrors(cusolverRfSetAlgs(handle_rf_, CUSOLVERRF_FACTORIZATION_ALG2, CUSOLVERRF_TRIANGULAR_SOLVE_ALG2)); + + checkCudaErrors(cusolverRfSetMatrixFormat(handle_rf_, CUSOLVERRF_MATRIX_FORMAT_CSR, CUSOLVERRF_UNIT_DIAGONAL_STORED_L)); + + cusolverRfSetResetValuesFastMode(handle_rf_, CUSOLVERRF_RESET_VALUES_FAST_MODE_ON); + + const double boost = 1e-12; + const double zero = 1e-14; + + cusolverRfSetNumericProperties(handle_rf_, zero, boost); + + cusolver_rf_enabled_ = true; + return 0; +} + +// call if both the matrix and the nnz structure changed or if convergence is +// poor while using refactorization. +int RefactorizationSolver::refactorizationSetupCusolverGLU() +{ + // for now this ONLY WORKS if proceeded by KLU. Might be worth decoupling + // later + + // get sizes + const int nnzL = Numeric_->lnz; + const int nnzU = Numeric_->unz; + + const int nnzM = (nnzL + nnzU - n_); + + /* parse the factorization */ + + mia_ = new int[n_ + 1]{0}; + mja_ = new int[nnzM]{0}; + int* Lp = new int[n_ + 1]; + int* Li = new int[nnzL]; + // we can't use nullptr instead od Lx and Ux because it causes SEG FAULT. It + // seems like a waste of memory though. + + double* Lx = new double[nnzL]; + int* Up = new int[n_ + 1]; + int* Ui = new int[nnzU]; + + double* Ux = new double[nnzU]; + + int ok = klu_extract(Numeric_, + Symbolic_, + Lp, + Li, + Lx, + Up, + Ui, + Ux, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + &Common_); + createM(n_, nnzL, Lp, Li, nnzU, Up, Ui); + + delete[] Lp; + delete[] Li; + delete[] Lx; + delete[] Up; + delete[] Ui; + delete[] Ux; + + /* setup GLU */ + sp_status_ = cusolverSpDgluSetup(handle_cusolver_, + n_, + nnz_, + descr_A_, + mat_A_csr_->get_irows_host(), // kRowPtr_, + mat_A_csr_->get_jcols_host(), // jCol_, + Numeric_->Pnum, /* base-0 */ + Symbolic_->Q, /* base-0 */ + nnzM, /* nnzM */ + descr_M_, + mia_, + mja_, + info_M_); + + sp_status_ = cusolverSpDgluBufferSize(handle_cusolver_, info_M_, &size_M_); + assert(CUSOLVER_STATUS_SUCCESS == sp_status_); + + buffer_size_ = size_M_; + checkCudaErrors(cudaMalloc((void**)&d_work_, buffer_size_)); + + sp_status_ = cusolverSpDgluAnalysis(handle_cusolver_, info_M_, d_work_); + assert(CUSOLVER_STATUS_SUCCESS == sp_status_); + + // reset and refactor so factors are ON THE GPU + + sp_status_ = cusolverSpDgluReset(handle_cusolver_, + n_, + /* A is original matrix */ + nnz_, + descr_A_, + mat_A_csr_->get_vals(), + mat_A_csr_->get_irows(), + mat_A_csr_->get_jcols(), + info_M_); + + assert(CUSOLVER_STATUS_SUCCESS == sp_status_); + sp_status_ = cusolverSpDgluFactor(handle_cusolver_, info_M_, d_work_); + return 0; +} + +int RefactorizationSolver::refactorizationSetupCusolverRf() +{ + // for now this ONLY WORKS if preceeded by KLU. Might be worth decoupling + // later + const int nnzL = Numeric_->lnz; + const int nnzU = Numeric_->unz; + + checkCudaErrors(cudaMalloc(&d_P_, (n_) * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_Q_, (n_) * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_T_, (n_) * sizeof(double))); + + checkCudaErrors(cudaMemcpy(d_P_, Numeric_->Pnum, sizeof(int) * (n_), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_Q_, Symbolic_->Q, sizeof(int) * (n_), cudaMemcpyHostToDevice)); + + int* Lp = new int[n_ + 1]; + int* Li = new int[nnzL]; + double* Lx = new double[nnzL]; + int* Up = new int[n_ + 1]; + int* Ui = new int[nnzU]; + double* Ux = new double[nnzU]; + + int ok = klu_extract(Numeric_, + Symbolic_, + Lp, + Li, + Lx, + Up, + Ui, + Ux, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + &Common_); + + /* CSC */ + int* d_Lp; + int* d_Li; + int* d_Up; + int* d_Ui; + double* d_Lx; + double* d_Ux; + /* CSR */ + int* d_Lp_csr; + int* d_Li_csr; + int* d_Up_csr; + int* d_Ui_csr; + double* d_Lx_csr; + double* d_Ux_csr; + + /* allocate CSC */ + checkCudaErrors(cudaMalloc(&d_Lp, (n_ + 1) * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_Li, nnzL * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_Lx, nnzL * sizeof(double))); + checkCudaErrors(cudaMalloc(&d_Up, (n_ + 1) * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_Ui, nnzU * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_Ux, nnzU * sizeof(double))); + + /* allocate CSR */ + checkCudaErrors(cudaMalloc(&d_Lp_csr, (n_ + 1) * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_Li_csr, nnzL * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_Lx_csr, nnzL * sizeof(double))); + checkCudaErrors(cudaMalloc(&d_Up_csr, (n_ + 1) * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_Ui_csr, nnzU * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_Ux_csr, nnzU * sizeof(double))); + + /* copy CSC to the GPU */ + checkCudaErrors(cudaMemcpy(d_Lp, Lp, sizeof(int) * (n_ + 1), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_Li, Li, sizeof(int) * (nnzL), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_Lx, Lx, sizeof(double) * (nnzL), cudaMemcpyHostToDevice)); + + checkCudaErrors(cudaMemcpy(d_Up, Up, sizeof(int) * (n_ + 1), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_Ui, Ui, sizeof(int) * (nnzU), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_Ux, Ux, sizeof(double) * (nnzU), cudaMemcpyHostToDevice)); + + /* we dont need these any more */ + delete[] Lp; + delete[] Li; + delete[] Lx; + delete[] Up; + delete[] Ui; + delete[] Ux; + + /* now CSC to CSR using the new cuda 11 awkward way */ + size_t bufferSizeL; + size_t bufferSizeU; + + cusparseStatus_t csp = cusparseCsr2cscEx2_bufferSize(handle_, + n_, + n_, + nnzL, + d_Lx, + d_Lp, + d_Li, + d_Lx_csr, + d_Lp_csr, + d_Li_csr, + CUDA_R_64F, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + &bufferSizeL); + + csp = cusparseCsr2cscEx2_bufferSize(handle_, + n_, + n_, + nnzU, + d_Ux, + d_Up, + d_Ui, + d_Ux_csr, + d_Up_csr, + d_Ui_csr, + CUDA_R_64F, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + &bufferSizeU); + /* allocate buffers */ + + double* d_workL; + double* d_workU; + checkCudaErrors(cudaMalloc((void**)&d_workL, bufferSizeL)); + checkCudaErrors(cudaMalloc((void**)&d_workU, bufferSizeU)); + + /* actual CSC to CSR */ + + csp = cusparseCsr2cscEx2(handle_, + n_, + n_, + nnzL, + d_Lx, + d_Lp, + d_Li, + d_Lx_csr, + d_Lp_csr, + d_Li_csr, + CUDA_R_64F, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + d_workL); + + csp = cusparseCsr2cscEx2(handle_, + n_, + n_, + nnzU, + d_Ux, + d_Up, + d_Ui, + d_Ux_csr, + d_Up_csr, + d_Ui_csr, + CUDA_R_64F, + CUSPARSE_ACTION_NUMERIC, + CUSPARSE_INDEX_BASE_ZERO, + CUSPARSE_CSR2CSC_ALG1, + d_workU); + + (void)csp; // mute unused variable warnings + + /* CSC no longer needed, nor the work arrays! */ + + cudaFree(d_Lp); + cudaFree(d_Li); + cudaFree(d_Lx); + + cudaFree(d_Up); + cudaFree(d_Ui); + cudaFree(d_Ux); + + cudaFree(d_workU); + cudaFree(d_workL); + + /* actual setup */ + + sp_status_ = cusolverRfSetupDevice(n_, + nnz_, + mat_A_csr_->get_irows(), // dia_, + mat_A_csr_->get_jcols(), // dja_, + mat_A_csr_->get_vals(), // da_, + nnzL, + d_Lp_csr, + d_Li_csr, + d_Lx_csr, + nnzU, + d_Up_csr, + d_Ui_csr, + d_Ux_csr, + d_P_, + d_Q_, + handle_rf_); + cudaDeviceSynchronize(); + sp_status_ = cusolverRfAnalyze(handle_rf_); + + // clean up + cudaFree(d_Lp_csr); + cudaFree(d_Li_csr); + cudaFree(d_Lx_csr); + + cudaFree(d_Up_csr); + cudaFree(d_Ui_csr); + cudaFree(d_Ux_csr); + + return 0; +} + +// Error checking utility for CUDA +// KS: might later become part of src/Utils, putting it here for now +template +void RefactorizationSolver::resolveCheckCudaError(T result, const char* const file, int const line) +{ + if(result) { + fprintf(stdout, "CUDA error at %s:%d, error# %d\n", file, line, result); + assert(false); + } +} + +} // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp new file mode 100644 index 0000000..c6f603e --- /dev/null +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp @@ -0,0 +1,244 @@ +// +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. +// HiOp is released under the BSD 3-clause license +// (https://opensource.org/licenses/BSD-3-Clause). Please also read “Additional +// BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the disclaimer below. ii. Redistributions in +// binary form must reproduce the above copyright notice, this list of +// conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may +// be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, +// THE U.S. DEPARTMENT OF ENERGY OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +// INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. +// Department of Energy (DOE). This work was produced at Lawrence Livermore +// National Laboratory under Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National +// Security, LLC nor any of their employees, makes any warranty, express or +// implied, or assumes any liability or responsibility for the accuracy, +// completeness, or usefulness of any information, apparatus, product, or +// process disclosed, or represents that its use would not infringe +// privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or +// services by trade name, trademark, manufacturer or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or Lawrence Livermore National Security, +// LLC. The views and opinions of authors expressed herein do not necessarily +// state or reflect those of the United States Government or Lawrence Livermore +// National Security, LLC, and shall not be used for advertising or product +// endorsement purposes. + +/** + * @file RefactorizationSolver.hpp + * + * @author Kasia Swirydowicz , PNNL + * @author Slaven Peles , ORNL + * + */ + +#pragma once + +#include "klu.h" +#include "resolve_cusolver_defs.hpp" +#include + +namespace EVLOSER +{ + +class MatrixCsr; +class IterativeRefinement; + +/** + * @brief Implements refactorization solvers using KLU and cuSOLVER libraries + * + */ +class RefactorizationSolver +{ +public: + // constructor + // RefactorizationSolver(); + RefactorizationSolver(int n); + ~RefactorizationSolver(); + + void enable_iterative_refinement(); + void setup_iterative_refinement_matrix(int n, int nnz); + void configure_iterative_refinement(cusparseHandle_t cusparse_handle, + cublasHandle_t cublas_handle, + cusolverRfHandle_t cusolverrf_handle, + int n, + double* d_T, + int* d_P, + int* d_Q, + double* devx, + double* devr); + + /** + * @brief Set the number of nonzeros in system matrix. + * + * @param nnz + */ + void set_nnz(int nnz) { nnz_ = nnz; } + + IterativeRefinement* ir() { return ir_; } + + MatrixCsr* mat_A_csr() { return mat_A_csr_; } + + double* devr() { return devr_; } + + int& ordering() { return ordering_; } + + std::string& fact() { return fact_; } + + std::string& refact() { return refact_; } + + std::string& use_ir() { return use_ir_; } + + void set_silent_output(bool silent_output) { silent_output_ = silent_output; } + + /** + * @brief Set up factorization of the first linear system. + * + * @return int + */ + int setup_factorization(); + + /** + * @brief Factorize system matrix + * + * @return int - factorization status: success=0, failure=-1 + */ + int factorize(); + + /** + * @brief Set the up the refactorization + * + */ + void setup_refactorization(); + + /** + * @brief Refactorize system matrix + * + * @return int + */ + int refactorize(); + + /** + * @brief Invokes triangular solver given matrix factors + * + * @param dx + * @param tol + * @return bool + */ + bool triangular_solve(double* dx, double tol, std::string memspace); + +private: + int n_{0}; ///< Size of the linear system + int nnz_{0}; ///< Number of nonzeros in the system's matrix + + MatrixCsr* mat_A_csr_{nullptr}; ///< System matrix in nonsymmetric CSR format + IterativeRefinement* ir_{nullptr}; ///< Iterative refinement class + + bool cusolver_glu_enabled_{false}; ///< cusolverGLU on/off flag + bool cusolver_rf_enabled_{false}; ///< cusolverRf on/off flag + bool iterative_refinement_enabled_{false}; ///< Iterative refinement on/off flag + bool is_first_solve_{true}; ///< If it is first call to triangular solver + + // Options + int ordering_{-1}; + std::string fact_; + std::string refact_; + std::string use_ir_; + bool silent_output_{true}; + + /** needed for cuSolver **/ + + cusolverStatus_t sp_status_; + cusparseHandle_t handle_ = 0; + cusolverSpHandle_t handle_cusolver_ = nullptr; + cublasHandle_t handle_cublas_; + + cusparseMatDescr_t descr_A_; + cusparseMatDescr_t descr_M_; + csrluInfoHost_t info_lu_ = nullptr; + csrgluInfo_t info_M_ = nullptr; + + cusolverRfHandle_t handle_rf_ = nullptr; + size_t buffer_size_; + size_t size_M_; + double* d_work_; + int ite_refine_succ_ = 0; + double r_nrminf_; + + // KLU stuff + int klu_status_; + klu_common Common_; + klu_symbolic* Symbolic_ = nullptr; + klu_numeric* Numeric_ = nullptr; + /*pieces of M */ + int* mia_ = nullptr; + int* mja_ = nullptr; + + /* CPU data */ + double* hostx_ = nullptr; + + /* for GPU data */ + double* devx_ = nullptr; + double* devr_ = nullptr; + + /* needed for cuSolverRf */ + int* d_P_ = nullptr; + int* d_Q_ = nullptr; // permutation matrices + double* d_T_ = nullptr; + + /** + * @brief Function that computes M = (L-I) + U + * + * @param n + * @param nnzL + * @param Lp + * @param Li + * @param nnzU + * @param Up + * @param Ui + * @return int + */ + int createM(const int n, const int nnzL, const int* Lp, const int* Li, const int nnzU, const int* Up, const int* Ui); + + int initializeKLU(); + int initializeCusolverGLU(); + int initializeCusolverRf(); + + int refactorizationSetupCusolverGLU(); + int refactorizationSetupCusolverRf(); + + /** + * @brief Check for CUDA errors. + * + * @tparam T - type of the result + * @param result - result value + * @param file - file name where the error occured + * @param line - line at which the error occured + */ + template + void resolveCheckCudaError(T result, const char* const file, int const line); +}; + +} // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/resolve_cusolver_defs.hpp b/src/LinAlg/EVLOSER/resolve_cusolver_defs.hpp new file mode 100644 index 0000000..ce3b1fa --- /dev/null +++ b/src/LinAlg/EVLOSER/resolve_cusolver_defs.hpp @@ -0,0 +1,132 @@ +// +// This file is part of HiOp. For details, see https://github.com/LLNL/hiop. HiOp +// is released under the BSD 3-clause license (https://opensource.org/licenses/BSD-3-Clause). +// Please also read “Additional BSD Notice” below. +// +// Redistribution and use in source and binary forms, with or without modification, +// are permitted provided that the following conditions are met: +// i. Redistributions of source code must retain the above copyright notice, this list +// of conditions and the disclaimer below. +// ii. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the disclaimer (as noted below) in the documentation and/or +// other materials provided with the distribution. +// iii. Neither the name of the LLNS/LLNL nor the names of its contributors may be used to +// endorse or promote products derived from this software without specific prior written +// permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT +// SHALL LAWRENCE LIVERMORE NATIONAL SECURITY, LLC, THE U.S. DEPARTMENT OF ENERGY OR +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +// OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED +// AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, +// EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Additional BSD Notice +// 1. This notice is required to be provided under our contract with the U.S. Department +// of Energy (DOE). This work was produced at Lawrence Livermore National Laboratory under +// Contract No. DE-AC52-07NA27344 with the DOE. +// 2. Neither the United States Government nor Lawrence Livermore National Security, LLC +// nor any of their employees, makes any warranty, express or implied, or assumes any +// liability or responsibility for the accuracy, completeness, or usefulness of any +// information, apparatus, product, or process disclosed, or represents that its use would +// not infringe privately-owned rights. +// 3. Also, reference herein to any specific commercial products, process, or services by +// trade name, trademark, manufacturer or otherwise does not necessarily constitute or +// imply its endorsement, recommendation, or favoring by the United States Government or +// Lawrence Livermore National Security, LLC. The views and opinions of authors expressed +// herein do not necessarily state or reflect those of the United States Government or +// Lawrence Livermore National Security, LLC, and shall not be used for advertising or +// product endorsement purposes. + +/** + * @file hiop_cusolver_defs.hpp + * + * @author Kasia Swirydowicz , PNNL + * + * Contains prototypes of cuSOLVER functions not in public API. + * + */ + +#ifndef CUSOLVERDEFS_H +#define CUSOLVERDEFS_H + +#include "cusparse.h" +#include "cusolverSp.h" +#include +#include +#include +#include "cusolverSp_LOWLEVEL_PREVIEW.h" + +#include "cusolverRf.h" + +extern "C" { +/* + * prototype not in public header file + */ +struct csrgluInfo; +typedef struct csrgluInfo* csrgluInfo_t; + +cusolverStatus_t CUSOLVERAPI cusolverSpCreateGluInfo(csrgluInfo_t* info); + +cusolverStatus_t CUSOLVERAPI cusolverSpDestroyGluInfo(csrgluInfo_t info); + +cusolverStatus_t CUSOLVERAPI cusolverSpDgluSetup(cusolverSpHandle_t handle, + int m, + /* A can be base-0 or base-1 */ + int nnzA, + const cusparseMatDescr_t descrA, + const int* h_csrRowPtrA, + const int* h_csrColIndA, + const int* h_P, /* base-0 */ + const int* h_Q, /* base-0 */ + /* M can be base-0 or base-1 */ + int nnzM, + const cusparseMatDescr_t descrM, + const int* h_csrRowPtrM, + const int* h_csrColIndM, + csrgluInfo_t info); + +cusolverStatus_t CUSOLVERAPI cusolverSpDgluBufferSize(cusolverSpHandle_t handle, csrgluInfo_t info, size_t* pBufferSize); + +cusolverStatus_t CUSOLVERAPI cusolverSpDgluAnalysis(cusolverSpHandle_t handle, csrgluInfo_t info, void* workspace); + +cusolverStatus_t CUSOLVERAPI cusolverSpDgluReset(cusolverSpHandle_t handle, + int m, + /* A is original matrix */ + int nnzA, + const cusparseMatDescr_t descr_A, + const double* d_csrValA, + const int* d_csrRowPtrA, + const int* d_csrColIndA, + csrgluInfo_t info); + +cusolverStatus_t CUSOLVERAPI cusolverSpDgluFactor(cusolverSpHandle_t handle, csrgluInfo_t info, void* workspace); + +cusolverStatus_t CUSOLVERAPI cusolverSpDgluSolve(cusolverSpHandle_t handle, + int m, + /* A is original matrix */ + int nnzA, + const cusparseMatDescr_t descr_A, + const double* d_csrValA, + const int* d_csrRowPtrA, + const int* d_csrColIndA, + const double* d_b0, /* right hand side */ + double* d_x, /* left hand side */ + int* ite_refine_succ, + double* r_nrminf_ptr, + csrgluInfo_t info, + void* workspace); + +cusolverStatus_t CUSOLVERAPI cusolverSpDnrminf(cusolverSpHandle_t handle, + int n, + const double* x, + double* result, /* |x|_inf, host */ + void* d_work); /* at least 8192 bytes */ + +} // extern "C" + +#endif // CUSOLVERDEFS_H diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp index 6f25601..f35ea87 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp @@ -55,8 +55,9 @@ #include "hiopLinSolverSparseEVLOSER.hpp" #include -#include -#include +#include "EVLOSER/RefactorizationSolver.hpp" +#include "EVLOSER/MatrixCsr.hpp" +#include "EVLOSER/IterativeRefinement.hpp" #include "hiop_blasdefs.hpp" #include "KrylovSolverKernels.h" @@ -119,7 +120,7 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const is_first_call_{true} { // Create embedded ReSolve refactorization solver for the EVLOSER wrapper - solver_ = new ReSolve::RefactorizationSolver(n); + solver_ = new EVLOSER::RefactorizationSolver(n); // If memory space is device, allocate host mirror for HiOp's KKT matrix in triplet format if(nlp_->options->GetString("mem_space") == "device") { diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp index dc0e86f..29b2363 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp @@ -67,13 +67,13 @@ * @ingroup LinearSolvers */ -namespace ReSolve +namespace EVLOSER { // Forward declaration of inner IR class class IterativeRefinement; class MatrixCsr; class RefactorizationSolver; -} // namespace ReSolve +} // namespace EVLOSER namespace hiop { @@ -109,7 +109,7 @@ class hiopLinSolverSymSparseEVLOSER : public hiopLinSolverSymSparse } protected: - ReSolve::RefactorizationSolver* solver_; + EVLOSER::RefactorizationSolver* solver_; int m_; ///< number of rows of the whole matrix int n_; ///< number of cols of the whole matrix From 5904032f049e391cd6bcca4711866951959c2861 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:02 -0400 Subject: [PATCH 09/28] Add EVLOSER matrix validation guards --- src/LinAlg/EVLOSER/MatrixCsr.cpp | 58 ++++++++++++++++ src/LinAlg/EVLOSER/MatrixCsr.hpp | 4 ++ src/LinAlg/EVLOSER/RefactorizationSolver.cpp | 71 ++++++++++++++++++++ src/LinAlg/EVLOSER/RefactorizationSolver.hpp | 3 + 4 files changed, 136 insertions(+) diff --git a/src/LinAlg/EVLOSER/MatrixCsr.cpp b/src/LinAlg/EVLOSER/MatrixCsr.cpp index 3220bec..15ef4b6 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.cpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.cpp @@ -129,6 +129,64 @@ void MatrixCsr::copy_to_host_mirror() checkCudaErrors(cudaMemcpy(vals_host_, vals_, sizeof(double) * nnz_, cudaMemcpyDeviceToHost)); } +bool MatrixCsr::validate_host_structure(const char* caller, bool silent_output) const +{ + const char* caller_name = caller == nullptr ? "unknown caller" : caller; + + auto report = [&](const std::string& message) { + if(!silent_output) { + std::cout << "[EVLOSER] Invalid CSR matrix in " << caller_name << ": " << message << "\n"; + } + return false; + }; + + if(n_ <= 0) { + return report("matrix dimension must be positive"); + } + + if(nnz_ < 0) { + return report("number of nonzeros is negative"); + } + + if(irows_host_ == nullptr) { + return report("host row pointer is null"); + } + + if(irows_host_[0] != 0) { + return report("row pointer must start at zero"); + } + + for(int row = 0; row < n_; ++row) { + if(irows_host_[row] > irows_host_[row + 1]) { + return report("row pointer is not monotone"); + } + } + + if(irows_host_[n_] != nnz_) { + return report("final row pointer does not match nnz"); + } + + if(nnz_ == 0) { + return true; + } + + if(jcols_host_ == nullptr) { + return report("host column index array is null"); + } + + if(vals_host_ == nullptr) { + return report("host value array is null"); + } + + for(int k = 0; k < nnz_; ++k) { + if(jcols_host_[k] < 0 || jcols_host_[k] >= n_) { + return report("column index out of range"); + } + } + + return true; +} + // Error checking utility for CUDA // KS: might later become part of src/Utils, putting it here for now template diff --git a/src/LinAlg/EVLOSER/MatrixCsr.hpp b/src/LinAlg/EVLOSER/MatrixCsr.hpp index f34f40e..f43fb4e 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.hpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.hpp @@ -12,6 +12,10 @@ class MatrixCsr void allocate_nnz(int nnz); void clear_data(); + int n() const { return n_; } + int nnz() const { return nnz_; } + bool validate_host_structure(const char* caller, bool silent_output) const; + int* get_irows() { return irows_; } const int* get_irows() const { return irows_; } diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp index 5b0b13f..9915d5a 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -155,8 +155,67 @@ void RefactorizationSolver::configure_iterative_refinement(cusparseHandle_t cusp ir_->setup(cusparse_handle, cublas_handle, cusolverrf_handle, n, d_T, d_P, d_Q, devx, devr); } +bool RefactorizationSolver::validate_system_matrix(const char* caller) const +{ + if(mat_A_csr_ == nullptr) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid matrix in " << caller << ": matrix object is null\n"; + } + return false; + } + + return mat_A_csr_->validate_host_structure(caller, silent_output_); +} + +bool RefactorizationSolver::validate_klu_factorization(const char* caller) const +{ + if(Symbolic_ == nullptr) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid KLU factorization in " << caller << ": symbolic factor is null\n"; + } + return false; + } + + if(Numeric_ == nullptr) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid KLU factorization in " << caller << ": numeric factor is null\n"; + } + return false; + } + + if(Symbolic_->n != n_) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid KLU factorization in " << caller << ": symbolic dimension " + << Symbolic_->n << " does not match solver dimension " << n_ << "\n"; + } + return false; + } + + if(Numeric_->n != n_) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid KLU factorization in " << caller << ": numeric dimension " + << Numeric_->n << " does not match solver dimension " << n_ << "\n"; + } + return false; + } + + if(Symbolic_->Q == nullptr || Numeric_->Pnum == nullptr) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid KLU factorization in " << caller + << ": missing permutation data\n"; + } + return false; + } + + return true; +} + int RefactorizationSolver::setup_factorization() { + if(!validate_system_matrix("KLU analysis")) { + return -1; + } + int* row_ptr = mat_A_csr_->get_irows_host(); int* col_idx = mat_A_csr_->get_jcols_host(); @@ -190,6 +249,10 @@ int RefactorizationSolver::factorize() void RefactorizationSolver::setup_refactorization() { + if(!validate_system_matrix("refactorization setup")) { + return; + } + if(refact_ == "glu") { initializeCusolverGLU(); refactorizationSetupCusolverGLU(); @@ -206,6 +269,10 @@ void RefactorizationSolver::setup_refactorization() int RefactorizationSolver::refactorize() { + if(!validate_system_matrix("refactorization")) { + return -1; + } + if(refact_ == "glu") { sp_status_ = cusolverSpDgluReset(handle_cusolver_, n_, @@ -542,6 +609,10 @@ int RefactorizationSolver::refactorizationSetupCusolverGLU() int RefactorizationSolver::refactorizationSetupCusolverRf() { + if(!validate_klu_factorization("cuSOLVER RF setup")) { + return -1; + } + // for now this ONLY WORKS if preceeded by KLU. Might be worth decoupling // later const int nnzL = Numeric_->lnz; diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp index c6f603e..95d3d06 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp @@ -222,6 +222,9 @@ class RefactorizationSolver */ int createM(const int n, const int nnzL, const int* Lp, const int* Li, const int nnzU, const int* Up, const int* Ui); + bool validate_system_matrix(const char* caller) const; + bool validate_klu_factorization(const char* caller) const; + int initializeKLU(); int initializeCusolverGLU(); int initializeCusolverRf(); From 70588e9b77c3d976c36621bedc759d11f62f847d Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:02 -0400 Subject: [PATCH 10/28] Make EVLOSER CSR ownership explicit --- src/LinAlg/EVLOSER/MatrixCsr.cpp | 57 +++++++++++++++++--- src/LinAlg/EVLOSER/MatrixCsr.hpp | 26 ++++++--- src/LinAlg/EVLOSER/RefactorizationSolver.cpp | 46 ++++++++-------- src/LinAlg/hiopLinSolverSparseEVLOSER.cpp | 20 +++---- 4 files changed, 103 insertions(+), 46 deletions(-) diff --git a/src/LinAlg/EVLOSER/MatrixCsr.cpp b/src/LinAlg/EVLOSER/MatrixCsr.cpp index 15ef4b6..ab0f650 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.cpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.cpp @@ -72,13 +72,29 @@ MatrixCsr::MatrixCsr() {} MatrixCsr::~MatrixCsr() { - if(n_ == 0) return; - clear_data(); } +bool MatrixCsr::has_device_storage() const +{ + const bool size_allocated = (n_ == 0) || (irows_ != nullptr); + const bool nnz_allocated = (nnz_ == 0) || (jcols_ != nullptr && vals_ != nullptr); + return size_allocated && nnz_allocated; +} + +bool MatrixCsr::has_host_mirror() const +{ + const bool size_allocated = (n_ == 0) || (irows_host_ != nullptr); + const bool nnz_allocated = (nnz_ == 0) || (jcols_host_ != nullptr && vals_host_ != nullptr); + return size_allocated && nnz_allocated; +} + void MatrixCsr::allocate_size(int n) { + if(irows_ != nullptr || irows_host_ != nullptr) { + clear_data(); + } + n_ = n; checkCudaErrors(cudaMalloc(&irows_, (n_ + 1) * sizeof(int))); irows_host_ = new int[n_ + 1]{0}; @@ -86,7 +102,24 @@ void MatrixCsr::allocate_size(int n) void MatrixCsr::allocate_nnz(int nnz) { + if(jcols_ != nullptr || vals_ != nullptr || jcols_host_ != nullptr || vals_host_ != nullptr) { + checkCudaErrors(cudaFree(jcols_)); + checkCudaErrors(cudaFree(vals_)); + delete[] jcols_host_; + delete[] vals_host_; + + jcols_ = nullptr; + vals_ = nullptr; + jcols_host_ = nullptr; + vals_host_ = nullptr; + nnz_ = 0; + } + nnz_ = nnz; + if(nnz_ == 0) { + return; + } + checkCudaErrors(cudaMalloc(&jcols_, nnz_ * sizeof(int))); checkCudaErrors(cudaMalloc(&vals_, nnz_ * sizeof(double))); jcols_host_ = new int[nnz_]{0}; @@ -117,16 +150,28 @@ void MatrixCsr::clear_data() void MatrixCsr::update_from_host_mirror() { + assert(has_device_storage()); + assert(has_host_mirror()); + checkCudaErrors(cudaMemcpy(irows_, irows_host_, sizeof(int) * (n_ + 1), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(jcols_, jcols_host_, sizeof(int) * nnz_, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(vals_, vals_host_, sizeof(double) * nnz_, cudaMemcpyHostToDevice)); + + if(nnz_ > 0) { + checkCudaErrors(cudaMemcpy(jcols_, jcols_host_, sizeof(int) * nnz_, cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(vals_, vals_host_, sizeof(double) * nnz_, cudaMemcpyHostToDevice)); + } } void MatrixCsr::copy_to_host_mirror() { + assert(has_device_storage()); + assert(has_host_mirror()); + checkCudaErrors(cudaMemcpy(irows_host_, irows_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(jcols_host_, jcols_, sizeof(int) * nnz_, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(vals_host_, vals_, sizeof(double) * nnz_, cudaMemcpyDeviceToHost)); + + if(nnz_ > 0) { + checkCudaErrors(cudaMemcpy(jcols_host_, jcols_, sizeof(int) * nnz_, cudaMemcpyDeviceToHost)); + checkCudaErrors(cudaMemcpy(vals_host_, vals_, sizeof(double) * nnz_, cudaMemcpyDeviceToHost)); + } } bool MatrixCsr::validate_host_structure(const char* caller, bool silent_output) const diff --git a/src/LinAlg/EVLOSER/MatrixCsr.hpp b/src/LinAlg/EVLOSER/MatrixCsr.hpp index f43fb4e..bd7b6f6 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.hpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.hpp @@ -14,21 +14,33 @@ class MatrixCsr int n() const { return n_; } int nnz() const { return nnz_; } + bool has_device_storage() const; + bool has_host_mirror() const; bool validate_host_structure(const char* caller, bool silent_output) const; - int* get_irows() { return irows_; } + int* device_irows() { return irows_; } - const int* get_irows() const { return irows_; } + const int* device_irows() const { return irows_; } - int* get_jcols() { return jcols_; } + int* device_jcols() { return jcols_; } - double* get_vals() { return vals_; } + const int* device_jcols() const { return jcols_; } - int* get_irows_host() { return irows_host_; } + double* device_vals() { return vals_; } - int* get_jcols_host() { return jcols_host_; } + const double* device_vals() const { return vals_; } - double* get_vals_host() { return vals_host_; } + int* host_irows() { return irows_host_; } + + const int* host_irows() const { return irows_host_; } + + int* host_jcols() { return jcols_host_; } + + const int* host_jcols() const { return jcols_host_; } + + double* host_vals() { return vals_host_; } + + const double* host_vals() const { return vals_host_; } void update_from_host_mirror(); void copy_to_host_mirror(); diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp index 9915d5a..86f7d7b 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -138,7 +138,7 @@ void RefactorizationSolver::enable_iterative_refinement() // TODO: Refactor to only pass mat_A_csr_ to setup_system_matrix; n and nnz can be read from mat_A_csr_ void RefactorizationSolver::setup_iterative_refinement_matrix(int n, int nnz) { - ir_->setup_system_matrix(n, nnz, mat_A_csr_->get_irows(), mat_A_csr_->get_jcols(), mat_A_csr_->get_vals()); + ir_->setup_system_matrix(n, nnz, mat_A_csr_->device_irows(), mat_A_csr_->device_jcols(), mat_A_csr_->device_vals()); } // TODO: Can this function be merged with setup_iterative_refinement_matrix ? @@ -216,8 +216,8 @@ int RefactorizationSolver::setup_factorization() return -1; } - int* row_ptr = mat_A_csr_->get_irows_host(); - int* col_idx = mat_A_csr_->get_jcols_host(); + int* row_ptr = mat_A_csr_->host_irows(); + int* col_idx = mat_A_csr_->host_jcols(); if(fact_ == "klu") { /* initialize KLU setup parameters, dont factorize yet */ @@ -239,9 +239,9 @@ int RefactorizationSolver::setup_factorization() int RefactorizationSolver::factorize() { - Numeric_ = klu_factor(mat_A_csr_->get_irows_host(), - mat_A_csr_->get_jcols_host(), - mat_A_csr_->get_vals_host(), + Numeric_ = klu_factor(mat_A_csr_->host_irows(), + mat_A_csr_->host_jcols(), + mat_A_csr_->host_vals(), Symbolic_, &Common_); return (Numeric_ == nullptr) ? -1 : 0; @@ -279,18 +279,18 @@ int RefactorizationSolver::refactorize() /* A is original matrix */ nnz_, descr_A_, - mat_A_csr_->get_vals(), - mat_A_csr_->get_irows(), - mat_A_csr_->get_jcols(), + mat_A_csr_->device_vals(), + mat_A_csr_->device_irows(), + mat_A_csr_->device_jcols(), info_M_); sp_status_ = cusolverSpDgluFactor(handle_cusolver_, info_M_, d_work_); } else { if(refact_ == "rf") { sp_status_ = cusolverRfResetValues(n_, nnz_, - mat_A_csr_->get_irows(), - mat_A_csr_->get_jcols(), - mat_A_csr_->get_vals(), + mat_A_csr_->device_irows(), + mat_A_csr_->device_jcols(), + mat_A_csr_->device_vals(), d_P_, d_Q_, handle_rf_); @@ -317,9 +317,9 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string /* A is original matrix */ nnz_, descr_A_, - mat_A_csr_->get_vals(), - mat_A_csr_->get_irows(), - mat_A_csr_->get_jcols(), + mat_A_csr_->device_vals(), + mat_A_csr_->device_irows(), + mat_A_csr_->device_jcols(), devr_, /* right hand side */ devx, /* left hand side, local pointer */ &ite_refine_succ_, @@ -571,8 +571,8 @@ int RefactorizationSolver::refactorizationSetupCusolverGLU() n_, nnz_, descr_A_, - mat_A_csr_->get_irows_host(), // kRowPtr_, - mat_A_csr_->get_jcols_host(), // jCol_, + mat_A_csr_->host_irows(), // kRowPtr_, + mat_A_csr_->host_jcols(), // jCol_, Numeric_->Pnum, /* base-0 */ Symbolic_->Q, /* base-0 */ nnzM, /* nnzM */ @@ -597,9 +597,9 @@ int RefactorizationSolver::refactorizationSetupCusolverGLU() /* A is original matrix */ nnz_, descr_A_, - mat_A_csr_->get_vals(), - mat_A_csr_->get_irows(), - mat_A_csr_->get_jcols(), + mat_A_csr_->device_vals(), + mat_A_csr_->device_irows(), + mat_A_csr_->device_jcols(), info_M_); assert(CUSOLVER_STATUS_SUCCESS == sp_status_); @@ -792,9 +792,9 @@ int RefactorizationSolver::refactorizationSetupCusolverRf() sp_status_ = cusolverRfSetupDevice(n_, nnz_, - mat_A_csr_->get_irows(), // dia_, - mat_A_csr_->get_jcols(), // dja_, - mat_A_csr_->get_vals(), // da_, + mat_A_csr_->device_irows(), // dia_, + mat_A_csr_->device_jcols(), // dja_, + mat_A_csr_->device_vals(), // da_, nnzL, d_Lp_csr, d_Li_csr, diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp index f35ea87..df82d9f 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp @@ -375,7 +375,7 @@ void hiopLinSolverSymSparseEVLOSER::update_matrix_values() { std::string mem_space = nlp_->options->GetString("mem_space"); if(mem_space == "device") { - double* csr_vals = solver_->mat_A_csr()->get_vals(); + double* csr_vals = solver_->mat_A_csr()->device_vals(); double* coo_vals = M_->M(); int coo_nnz = M_->numberOfNonzeros(); @@ -389,14 +389,14 @@ void hiopLinSolverSymSparseEVLOSER::update_matrix_values() // If factorization was not successful, we need a copy of values on the host if(factorizationSetupSucc_ == 0) - checkCudaErrors(cudaMemcpy(solver_->mat_A_csr()->get_vals_host(), - solver_->mat_A_csr()->get_vals(), + checkCudaErrors(cudaMemcpy(solver_->mat_A_csr()->host_vals(), + solver_->mat_A_csr()->device_vals(), sizeof(double) * nnz_, cudaMemcpyDeviceToHost)); } else { // KKT matrix is on the host - double* vals = solver_->mat_A_csr()->get_vals_host(); + double* vals = solver_->mat_A_csr()->host_vals(); // update matrix for(int k = 0; k < nnz_; k++) { vals[k] = M_->M()[index_convert_CSR2Triplet_host_[k]]; @@ -405,8 +405,8 @@ void hiopLinSolverSymSparseEVLOSER::update_matrix_values() if(index_convert_extra_Diag2CSR_host_[i] != -1) vals[index_convert_extra_Diag2CSR_host_[i]] += M_->M()[M_->numberOfNonzeros() - n_ + i]; } - checkCudaErrors(cudaMemcpy(solver_->mat_A_csr()->get_vals(), - solver_->mat_A_csr()->get_vals_host(), + checkCudaErrors(cudaMemcpy(solver_->mat_A_csr()->device_vals(), + solver_->mat_A_csr()->host_vals(), sizeof(double) * nnz_, cudaMemcpyHostToDevice)); } @@ -418,7 +418,7 @@ void hiopLinSolverSymSparseEVLOSER::compute_nnz() // // compute nnz in each row // - int* row_ptr = solver_->mat_A_csr()->get_irows_host(); + int* row_ptr = solver_->mat_A_csr()->host_irows(); // If the data is on device, fetch it from the host mirror hiopMatrixSparse* M_host = nullptr; @@ -469,9 +469,9 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() // // set correct col index and value // - const int* row_ptr = solver_->mat_A_csr()->get_irows_host(); - int* col_idx = solver_->mat_A_csr()->get_jcols_host(); - double* vals = solver_->mat_A_csr()->get_vals_host(); + const int* row_ptr = solver_->mat_A_csr()->host_irows(); + int* col_idx = solver_->mat_A_csr()->host_jcols(); + double* vals = solver_->mat_A_csr()->host_vals(); index_convert_CSR2Triplet_host_ = new int[nnz_]; index_convert_extra_Diag2CSR_host_ = new int[n_]; From 6e5e8d08c7ea93513a9312b8021ec0016b43cdcb Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:02 -0400 Subject: [PATCH 11/28] Refactor EVLOSER KLU factor setup --- src/LinAlg/EVLOSER/RefactorizationSolver.cpp | 426 ++++++++++--------- 1 file changed, 230 insertions(+), 196 deletions(-) diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp index 86f7d7b..cd0507a 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -69,6 +69,197 @@ namespace EVLOSER { +namespace +{ + +struct KluFactorData +{ + int nnzL{0}; + int nnzU{0}; + std::vector Lp; + std::vector Li; + std::vector Lx; + std::vector Up; + std::vector Ui; + std::vector Ux; +}; + +struct HostCsrFactor +{ + std::vector rowptr; + std::vector colind; + std::vector values; +}; + +bool validate_csc_factor(const char* name, int n, int nnz, const std::vector& colptr, const std::vector& rowind, bool silent_output) +{ + auto report = [&](const std::string& message) { + if(!silent_output) { + std::cout << "[EVLOSER] Invalid KLU " << name << " factor: " << message << "\n"; + } + return false; + }; + + if(n <= 0) { + return report("factor dimension must be positive"); + } + + if(nnz < 0) { + return report("number of nonzeros is negative"); + } + + if(static_cast(colptr.size()) != n + 1) { + return report("column pointer size does not match dimension"); + } + + if(static_cast(rowind.size()) != nnz) { + return report("row index size does not match nnz"); + } + + if(colptr[0] != 0) { + return report("column pointer must start at zero"); + } + + for(int col = 0; col < n; ++col) { + if(colptr[col] > colptr[col + 1]) { + return report("column pointer is not monotone"); + } + } + + if(colptr[n] != nnz) { + return report("final column pointer does not match nnz"); + } + + for(int k = 0; k < nnz; ++k) { + if(rowind[k] < 0 || rowind[k] >= n) { + return report("row index out of range"); + } + } + + return true; +} + +bool extract_klu_factors(klu_numeric* numeric, + klu_symbolic* symbolic, + klu_common& common, + int n, + KluFactorData& factors, + bool silent_output) +{ + factors.nnzL = numeric->lnz; + factors.nnzU = numeric->unz; + + factors.Lp.assign(n + 1, 0); + factors.Li.assign(factors.nnzL, 0); + factors.Lx.assign(factors.nnzL, 0.0); + factors.Up.assign(n + 1, 0); + factors.Ui.assign(factors.nnzU, 0); + factors.Ux.assign(factors.nnzU, 0.0); + + const int ok = klu_extract(numeric, + symbolic, + factors.Lp.data(), + factors.Li.data(), + factors.Lx.data(), + factors.Up.data(), + factors.Ui.data(), + factors.Ux.data(), + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + &common); + + if(ok == 0) { + if(!silent_output) { + std::cout << "[EVLOSER] klu_extract failed while preparing cuSOLVER RF setup\n"; + } + return false; + } + + return validate_csc_factor("L", n, factors.nnzL, factors.Lp, factors.Li, silent_output) && + validate_csc_factor("U", n, factors.nnzU, factors.Up, factors.Ui, silent_output); +} + +HostCsrFactor convert_csc_to_csr(int n, + int nnz, + const std::vector& colptr, + const std::vector& rowind, + const std::vector& values) +{ + HostCsrFactor csr; + csr.rowptr.assign(n + 1, 0); + csr.colind.assign(nnz, 0); + csr.values.assign(nnz, 0.0); + + for(int col = 0; col < n; ++col) { + for(int k = colptr[col]; k < colptr[col + 1]; ++k) { + csr.rowptr[rowind[k] + 1]++; + } + } + + for(int row = 0; row < n; ++row) { + csr.rowptr[row + 1] += csr.rowptr[row]; + } + + std::vector offsets = csr.rowptr; + for(int col = 0; col < n; ++col) { + for(int k = colptr[col]; k < colptr[col + 1]; ++k) { + const int row = rowind[k]; + const int dest = offsets[row]++; + csr.colind[dest] = col; + csr.values[dest] = values[k]; + } + } + + return csr; +} + +bool validate_host_csr_factor(const char* name, int n, int nnz, const HostCsrFactor& csr, bool silent_output) +{ + auto report = [&](const std::string& message) { + if(!silent_output) { + std::cout << "[EVLOSER] Invalid host CSR " << name << " factor: " << message << "\n"; + } + return false; + }; + + if(static_cast(csr.rowptr.size()) != n + 1) { + return report("row pointer size does not match dimension"); + } + + if(static_cast(csr.colind.size()) != nnz || static_cast(csr.values.size()) != nnz) { + return report("column/value array size does not match nnz"); + } + + if(csr.rowptr[0] != 0) { + return report("row pointer must start at zero"); + } + + for(int row = 0; row < n; ++row) { + if(csr.rowptr[row] > csr.rowptr[row + 1]) { + return report("row pointer is not monotone"); + } + } + + if(csr.rowptr[n] != nnz) { + return report("final row pointer does not match nnz"); + } + + for(int k = 0; k < nnz; ++k) { + if(csr.colind[k] < 0 || csr.colind[k] >= n) { + return report("column index out of range"); + } + } + + return true; +} + +} // namespace + RefactorizationSolver::RefactorizationSolver(int n) : n_(n) { @@ -609,214 +800,57 @@ int RefactorizationSolver::refactorizationSetupCusolverGLU() int RefactorizationSolver::refactorizationSetupCusolverRf() { + // For now this path requires a prior KLU factorization. if(!validate_klu_factorization("cuSOLVER RF setup")) { return -1; } - // for now this ONLY WORKS if preceeded by KLU. Might be worth decoupling - // later - const int nnzL = Numeric_->lnz; - const int nnzU = Numeric_->unz; + KluFactorData factors; + if(!extract_klu_factors(Numeric_, Symbolic_, Common_, n_, factors, silent_output_)) { + return -1; + } - checkCudaErrors(cudaMalloc(&d_P_, (n_) * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_Q_, (n_) * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_T_, (n_) * sizeof(double))); + HostCsrFactor L_csr = convert_csc_to_csr(n_, factors.nnzL, factors.Lp, factors.Li, factors.Lx); + HostCsrFactor U_csr = convert_csc_to_csr(n_, factors.nnzU, factors.Up, factors.Ui, factors.Ux); - checkCudaErrors(cudaMemcpy(d_P_, Numeric_->Pnum, sizeof(int) * (n_), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_Q_, Symbolic_->Q, sizeof(int) * (n_), cudaMemcpyHostToDevice)); + if(!validate_host_csr_factor("L", n_, factors.nnzL, L_csr, silent_output_)) { + return -1; + } - int* Lp = new int[n_ + 1]; - int* Li = new int[nnzL]; - double* Lx = new double[nnzL]; - int* Up = new int[n_ + 1]; - int* Ui = new int[nnzU]; - double* Ux = new double[nnzU]; + if(!validate_host_csr_factor("U", n_, factors.nnzU, U_csr, silent_output_)) { + return -1; + } - int ok = klu_extract(Numeric_, - Symbolic_, - Lp, - Li, - Lx, - Up, - Ui, - Ux, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - nullptr, - &Common_); + checkCudaErrors(cudaMalloc(&d_P_, n_ * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_Q_, n_ * sizeof(int))); + checkCudaErrors(cudaMalloc(&d_T_, n_ * sizeof(double))); - /* CSC */ - int* d_Lp; - int* d_Li; - int* d_Up; - int* d_Ui; - double* d_Lx; - double* d_Ux; - /* CSR */ - int* d_Lp_csr; - int* d_Li_csr; - int* d_Up_csr; - int* d_Ui_csr; - double* d_Lx_csr; - double* d_Ux_csr; - - /* allocate CSC */ - checkCudaErrors(cudaMalloc(&d_Lp, (n_ + 1) * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_Li, nnzL * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_Lx, nnzL * sizeof(double))); - checkCudaErrors(cudaMalloc(&d_Up, (n_ + 1) * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_Ui, nnzU * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_Ux, nnzU * sizeof(double))); - - /* allocate CSR */ - checkCudaErrors(cudaMalloc(&d_Lp_csr, (n_ + 1) * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_Li_csr, nnzL * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_Lx_csr, nnzL * sizeof(double))); - checkCudaErrors(cudaMalloc(&d_Up_csr, (n_ + 1) * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_Ui_csr, nnzU * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_Ux_csr, nnzU * sizeof(double))); - - /* copy CSC to the GPU */ - checkCudaErrors(cudaMemcpy(d_Lp, Lp, sizeof(int) * (n_ + 1), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_Li, Li, sizeof(int) * (nnzL), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_Lx, Lx, sizeof(double) * (nnzL), cudaMemcpyHostToDevice)); - - checkCudaErrors(cudaMemcpy(d_Up, Up, sizeof(int) * (n_ + 1), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_Ui, Ui, sizeof(int) * (nnzU), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_Ux, Ux, sizeof(double) * (nnzU), cudaMemcpyHostToDevice)); - - /* we dont need these any more */ - delete[] Lp; - delete[] Li; - delete[] Lx; - delete[] Up; - delete[] Ui; - delete[] Ux; + checkCudaErrors(cudaMemcpy(d_P_, Numeric_->Pnum, n_ * sizeof(int), cudaMemcpyHostToDevice)); + checkCudaErrors(cudaMemcpy(d_Q_, Symbolic_->Q, n_ * sizeof(int), cudaMemcpyHostToDevice)); - /* now CSC to CSR using the new cuda 11 awkward way */ - size_t bufferSizeL; - size_t bufferSizeU; - - cusparseStatus_t csp = cusparseCsr2cscEx2_bufferSize(handle_, - n_, - n_, - nnzL, - d_Lx, - d_Lp, - d_Li, - d_Lx_csr, - d_Lp_csr, - d_Li_csr, - CUDA_R_64F, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - &bufferSizeL); - - csp = cusparseCsr2cscEx2_bufferSize(handle_, - n_, - n_, - nnzU, - d_Ux, - d_Up, - d_Ui, - d_Ux_csr, - d_Up_csr, - d_Ui_csr, - CUDA_R_64F, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - &bufferSizeU); - /* allocate buffers */ - - double* d_workL; - double* d_workU; - checkCudaErrors(cudaMalloc((void**)&d_workL, bufferSizeL)); - checkCudaErrors(cudaMalloc((void**)&d_workU, bufferSizeU)); - - /* actual CSC to CSR */ - - csp = cusparseCsr2cscEx2(handle_, - n_, - n_, - nnzL, - d_Lx, - d_Lp, - d_Li, - d_Lx_csr, - d_Lp_csr, - d_Li_csr, - CUDA_R_64F, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - d_workL); - - csp = cusparseCsr2cscEx2(handle_, - n_, - n_, - nnzU, - d_Ux, - d_Up, - d_Ui, - d_Ux_csr, - d_Up_csr, - d_Ui_csr, - CUDA_R_64F, - CUSPARSE_ACTION_NUMERIC, - CUSPARSE_INDEX_BASE_ZERO, - CUSPARSE_CSR2CSC_ALG1, - d_workU); - - (void)csp; // mute unused variable warnings - - /* CSC no longer needed, nor the work arrays! */ - - cudaFree(d_Lp); - cudaFree(d_Li); - cudaFree(d_Lx); - - cudaFree(d_Up); - cudaFree(d_Ui); - cudaFree(d_Ux); - - cudaFree(d_workU); - cudaFree(d_workL); - - /* actual setup */ - - sp_status_ = cusolverRfSetupDevice(n_, - nnz_, - mat_A_csr_->device_irows(), // dia_, - mat_A_csr_->device_jcols(), // dja_, - mat_A_csr_->device_vals(), // da_, - nnzL, - d_Lp_csr, - d_Li_csr, - d_Lx_csr, - nnzU, - d_Up_csr, - d_Ui_csr, - d_Ux_csr, - d_P_, - d_Q_, - handle_rf_); - cudaDeviceSynchronize(); - sp_status_ = cusolverRfAnalyze(handle_rf_); + sp_status_ = cusolverRfSetupHost(n_, + nnz_, + mat_A_csr_->host_irows(), + mat_A_csr_->host_jcols(), + mat_A_csr_->host_vals(), + factors.nnzL, + L_csr.rowptr.data(), + L_csr.colind.data(), + L_csr.values.data(), + factors.nnzU, + U_csr.rowptr.data(), + U_csr.colind.data(), + U_csr.values.data(), + Numeric_->Pnum, + Symbolic_->Q, + handle_rf_); + assert(CUSOLVER_STATUS_SUCCESS == sp_status_); - // clean up - cudaFree(d_Lp_csr); - cudaFree(d_Li_csr); - cudaFree(d_Lx_csr); + sp_status_ = cusolverRfAnalyze(handle_rf_); + assert(CUSOLVER_STATUS_SUCCESS == sp_status_); - cudaFree(d_Up_csr); - cudaFree(d_Ui_csr); - cudaFree(d_Ux_csr); + sp_status_ = cusolverRfRefactor(handle_rf_); + assert(CUSOLVER_STATUS_SUCCESS == sp_status_); return 0; } From ac3720312575e026da0b4f8f6d7da11d44368caa Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:02 -0400 Subject: [PATCH 12/28] Separate EVLOSER RF setup from matrix conversion --- src/LinAlg/EVLOSER/RefactorizationSolver.cpp | 104 ++++++++++++++----- src/LinAlg/EVLOSER/RefactorizationSolver.hpp | 5 + 2 files changed, 85 insertions(+), 24 deletions(-) diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp index cd0507a..7eda744 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -401,6 +401,50 @@ bool RefactorizationSolver::validate_klu_factorization(const char* caller) const return true; } +bool RefactorizationSolver::checkCusolverRfStatus(cusolverStatus_t status, const char* caller) const +{ + if(status == CUSOLVER_STATUS_SUCCESS) { + return true; + } + + if(!silent_output_) { + std::cout << "[EVLOSER] " << caller << " failed with cuSOLVER status " << status << "\n"; + } + + return false; +} + +int RefactorizationSolver::resetCusolverRfValues(const char* caller) +{ + sp_status_ = cusolverRfResetValues(n_, + nnz_, + mat_A_csr_->device_irows(), + mat_A_csr_->device_jcols(), + mat_A_csr_->device_vals(), + d_P_, + d_Q_, + handle_rf_); + + if(!checkCusolverRfStatus(sp_status_, caller)) { + return -1; + } + + checkCudaErrors(cudaDeviceSynchronize()); + return 0; +} + +int RefactorizationSolver::analyzeCusolverRf(const char* caller) +{ + sp_status_ = cusolverRfAnalyze(handle_rf_); + return checkCusolverRfStatus(sp_status_, caller) ? 0 : -1; +} + +int RefactorizationSolver::refactorizeCusolverRf(const char* caller) +{ + sp_status_ = cusolverRfRefactor(handle_rf_); + return checkCusolverRfStatus(sp_status_, caller) ? 0 : -1; +} + int RefactorizationSolver::setup_factorization() { if(!validate_system_matrix("KLU analysis")) { @@ -448,8 +492,10 @@ void RefactorizationSolver::setup_refactorization() initializeCusolverGLU(); refactorizationSetupCusolverGLU(); } else if(refact_ == "rf") { - initializeCusolverRf(); - refactorizationSetupCusolverRf(); + if(initializeCusolverRf() != 0 || refactorizationSetupCusolverRf() != 0) { + assert(false && "cuSOLVER RF setup failed."); + return; + } if(use_ir_ == "yes") { configure_iterative_refinement(handle_, handle_cublas_, handle_rf_, n_, d_T_, d_P_, d_Q_, devx_, devr_); } @@ -477,16 +523,12 @@ int RefactorizationSolver::refactorize() sp_status_ = cusolverSpDgluFactor(handle_cusolver_, info_M_, d_work_); } else { if(refact_ == "rf") { - sp_status_ = cusolverRfResetValues(n_, - nnz_, - mat_A_csr_->device_irows(), - mat_A_csr_->device_jcols(), - mat_A_csr_->device_vals(), - d_P_, - d_Q_, - handle_rf_); - cudaDeviceSynchronize(); - sp_status_ = cusolverRfRefactor(handle_rf_); + if(resetCusolverRfValues("cuSOLVER RF reset values") != 0) { + return -1; + } + if(refactorizeCusolverRf("cuSOLVER RF refactorization") != 0) { + return -1; + } } } return 0; @@ -687,18 +729,32 @@ int RefactorizationSolver::initializeCusolverGLU() int RefactorizationSolver::initializeCusolverRf() { - cusolverRfCreate(&handle_rf_); + if(!checkCusolverRfStatus(cusolverRfCreate(&handle_rf_), "cusolverRfCreate")) { + return -1; + } - checkCudaErrors(cusolverRfSetAlgs(handle_rf_, CUSOLVERRF_FACTORIZATION_ALG2, CUSOLVERRF_TRIANGULAR_SOLVE_ALG2)); + sp_status_ = cusolverRfSetAlgs(handle_rf_, CUSOLVERRF_FACTORIZATION_ALG2, CUSOLVERRF_TRIANGULAR_SOLVE_ALG2); + if(!checkCusolverRfStatus(sp_status_, "cusolverRfSetAlgs")) { + return -1; + } - checkCudaErrors(cusolverRfSetMatrixFormat(handle_rf_, CUSOLVERRF_MATRIX_FORMAT_CSR, CUSOLVERRF_UNIT_DIAGONAL_STORED_L)); + sp_status_ = cusolverRfSetMatrixFormat(handle_rf_, CUSOLVERRF_MATRIX_FORMAT_CSR, CUSOLVERRF_UNIT_DIAGONAL_STORED_L); + if(!checkCusolverRfStatus(sp_status_, "cusolverRfSetMatrixFormat")) { + return -1; + } - cusolverRfSetResetValuesFastMode(handle_rf_, CUSOLVERRF_RESET_VALUES_FAST_MODE_ON); + sp_status_ = cusolverRfSetResetValuesFastMode(handle_rf_, CUSOLVERRF_RESET_VALUES_FAST_MODE_ON); + if(!checkCusolverRfStatus(sp_status_, "cusolverRfSetResetValuesFastMode")) { + return -1; + } const double boost = 1e-12; const double zero = 1e-14; - cusolverRfSetNumericProperties(handle_rf_, zero, boost); + sp_status_ = cusolverRfSetNumericProperties(handle_rf_, zero, boost); + if(!checkCusolverRfStatus(sp_status_, "cusolverRfSetNumericProperties")) { + return -1; + } cusolver_rf_enabled_ = true; return 0; @@ -844,15 +900,15 @@ int RefactorizationSolver::refactorizationSetupCusolverRf() Numeric_->Pnum, Symbolic_->Q, handle_rf_); - assert(CUSOLVER_STATUS_SUCCESS == sp_status_); - - sp_status_ = cusolverRfAnalyze(handle_rf_); - assert(CUSOLVER_STATUS_SUCCESS == sp_status_); + if(!checkCusolverRfStatus(sp_status_, "cusolverRfSetupHost")) { + return -1; + } - sp_status_ = cusolverRfRefactor(handle_rf_); - assert(CUSOLVER_STATUS_SUCCESS == sp_status_); + if(analyzeCusolverRf("cuSOLVER RF analysis") != 0) { + return -1; + } - return 0; + return refactorizeCusolverRf("cuSOLVER RF initial refactorization"); } // Error checking utility for CUDA diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp index 95d3d06..de559dc 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp @@ -232,6 +232,11 @@ class RefactorizationSolver int refactorizationSetupCusolverGLU(); int refactorizationSetupCusolverRf(); + bool checkCusolverRfStatus(cusolverStatus_t status, const char* caller) const; + int resetCusolverRfValues(const char* caller); + int analyzeCusolverRf(const char* caller); + int refactorizeCusolverRf(const char* caller); + /** * @brief Check for CUDA errors. * From 03c35a6b7c45914bf973cb903d60fb4fc2db32c2 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:02 -0400 Subject: [PATCH 13/28] Make EVLOSER iterative refinement optional --- src/LinAlg/EVLOSER/RefactorizationSolver.cpp | 32 +++++++++++++++++--- src/LinAlg/EVLOSER/RefactorizationSolver.hpp | 2 ++ src/LinAlg/hiopLinSolverSparseEVLOSER.cpp | 3 ++ 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp index 7eda744..6541b64 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -322,13 +322,33 @@ RefactorizationSolver::~RefactorizationSolver() void RefactorizationSolver::enable_iterative_refinement() { - ir_ = new IterativeRefinement(); - if(ir_ != nullptr) iterative_refinement_enabled_ = true; + if(ir_ == nullptr) { + ir_ = new IterativeRefinement(); + } + + iterative_refinement_enabled_ = (ir_ != nullptr); +} + +void RefactorizationSolver::disable_iterative_refinement() +{ + delete ir_; + ir_ = nullptr; + iterative_refinement_enabled_ = false; + use_ir_ = "no"; +} + +bool RefactorizationSolver::iterative_refinement_active() const +{ + return iterative_refinement_enabled_ && ir_ != nullptr && use_ir_ == "yes"; } // TODO: Refactor to only pass mat_A_csr_ to setup_system_matrix; n and nnz can be read from mat_A_csr_ void RefactorizationSolver::setup_iterative_refinement_matrix(int n, int nnz) { + if(!iterative_refinement_active()) { + return; + } + ir_->setup_system_matrix(n, nnz, mat_A_csr_->device_irows(), mat_A_csr_->device_jcols(), mat_A_csr_->device_vals()); } @@ -343,6 +363,10 @@ void RefactorizationSolver::configure_iterative_refinement(cusparseHandle_t cusp double* devx, double* devr) { + if(!iterative_refinement_active()) { + return; + } + ir_->setup(cusparse_handle, cublas_handle, cusolverrf_handle, n, d_T, d_P, d_Q, devx, devr); } @@ -496,7 +520,7 @@ void RefactorizationSolver::setup_refactorization() assert(false && "cuSOLVER RF setup failed."); return; } - if(use_ir_ == "yes") { + if(iterative_refinement_active()) { configure_iterative_refinement(handle_, handle_cublas_, handle_rf_, n_, d_T_, d_P_, d_Q_, devx_, devr_); } } else { // for future - @@ -617,7 +641,7 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string return false; } - if(use_ir_ == "yes") { + if(iterative_refinement_active()) { // Set tolerance based on barrier parameter mu ir_->set_tol(tol); diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp index de559dc..6953e55 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp @@ -79,6 +79,8 @@ class RefactorizationSolver ~RefactorizationSolver(); void enable_iterative_refinement(); + void disable_iterative_refinement(); + bool iterative_refinement_active() const; void setup_iterative_refinement_matrix(int n, int nnz); void configure_iterative_refinement(cusparseHandle_t cusparse_handle, cublasHandle_t cublas_handle, diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp index df82d9f..c483ba2 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp @@ -181,6 +181,8 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const use_ir = "yes"; solver_->enable_iterative_refinement(); solver_->ir()->maxit() = maxit_test; + } else { + solver_->disable_iterative_refinement(); } if(use_ir == "yes") { if((refact == "rf")) { @@ -245,6 +247,7 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const } else { nlp_->log->printf(hovWarning, "Currently, inner iterative refinement works ONLY with cuSolverRf ... \n"); use_ir = "no"; + solver_->disable_iterative_refinement(); } } solver_->use_ir() = use_ir; From 25c0e63545cd16a3e3d7c63b207801fbd839bad6 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:02 -0400 Subject: [PATCH 14/28] Clean EVLOSER backend naming --- src/LinAlg/EVLOSER/CMakeLists.txt | 6 --- src/LinAlg/EVLOSER/IterativeRefinement.cpp | 4 +- src/LinAlg/EVLOSER/IterativeRefinement.hpp | 4 +- src/LinAlg/EVLOSER/MatrixCsr.cpp | 4 +- src/LinAlg/EVLOSER/MatrixCsr.hpp | 40 ++++++++++++++++++- src/LinAlg/EVLOSER/RefactorizationSolver.cpp | 4 +- src/LinAlg/EVLOSER/RefactorizationSolver.hpp | 21 +++++++++- ...ver_defs.hpp => evloser_cusolver_defs.hpp} | 2 +- src/LinAlg/hiopLinSolverSparseEVLOSER.cpp | 12 +++--- src/LinAlg/hiopLinSolverSparseEVLOSER.hpp | 2 +- 10 files changed, 73 insertions(+), 26 deletions(-) rename src/LinAlg/EVLOSER/{resolve_cusolver_defs.hpp => evloser_cusolver_defs.hpp} (99%) diff --git a/src/LinAlg/EVLOSER/CMakeLists.txt b/src/LinAlg/EVLOSER/CMakeLists.txt index aecb530..b7fbf9c 100644 --- a/src/LinAlg/EVLOSER/CMakeLists.txt +++ b/src/LinAlg/EVLOSER/CMakeLists.txt @@ -6,12 +6,6 @@ set(EVLOSER_SRC IterativeRefinement.cpp KrylovSolverKernels.cu ) -set(EVLOSER_HEADERS - RefactorizationSolver.hpp - MatrixCsr.hpp - IterativeRefinement.hpp -) - set_source_files_properties(${EVLOSER_SRC} PROPERTIES LANGUAGE CUDA) add_library(EVLOSER STATIC ${EVLOSER_SRC}) diff --git a/src/LinAlg/EVLOSER/IterativeRefinement.cpp b/src/LinAlg/EVLOSER/IterativeRefinement.cpp index d219e59..5e4caf3 100644 --- a/src/LinAlg/EVLOSER/IterativeRefinement.cpp +++ b/src/LinAlg/EVLOSER/IterativeRefinement.cpp @@ -65,7 +65,7 @@ #include #include -#define checkCudaErrors(val) resolveCheckCudaError((val), __FILE__, __LINE__) +#define checkCudaErrors(val) evloserCheckCudaError((val), __FILE__, __LINE__) namespace EVLOSER { @@ -684,7 +684,7 @@ void IterativeRefinement::GramSchmidt(int i) // Error checking utility for CUDA // KS: might later become part of src/Utils, putting it here for now template -void IterativeRefinement::resolveCheckCudaError(T result, const char* const file, int const line) +void IterativeRefinement::evloserCheckCudaError(T result, const char* const file, int const line) { #ifdef DEBUG if(result) { diff --git a/src/LinAlg/EVLOSER/IterativeRefinement.hpp b/src/LinAlg/EVLOSER/IterativeRefinement.hpp index b7c1767..f0d7bac 100644 --- a/src/LinAlg/EVLOSER/IterativeRefinement.hpp +++ b/src/LinAlg/EVLOSER/IterativeRefinement.hpp @@ -9,7 +9,7 @@ #pragma once #include "klu.h" -#include "resolve_cusolver_defs.hpp" +#include "evloser_cusolver_defs.hpp" #include namespace EVLOSER @@ -170,7 +170,7 @@ class IterativeRefinement * @param line - line at which the error occured */ template - void resolveCheckCudaError(T result, const char* const file, int const line); + void evloserCheckCudaError(T result, const char* const file, int const line); }; } // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/MatrixCsr.cpp b/src/LinAlg/EVLOSER/MatrixCsr.cpp index ab0f650..ffb157c 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.cpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.cpp @@ -63,7 +63,7 @@ #include #include -#define checkCudaErrors(val) resolveCheckCudaError((val), __FILE__, __LINE__) +#define checkCudaErrors(val) evloserCheckCudaError((val), __FILE__, __LINE__) namespace EVLOSER { @@ -235,7 +235,7 @@ bool MatrixCsr::validate_host_structure(const char* caller, bool silent_output) // Error checking utility for CUDA // KS: might later become part of src/Utils, putting it here for now template -void MatrixCsr::resolveCheckCudaError(T result, const char* const file, int const line) +void MatrixCsr::evloserCheckCudaError(T result, const char* const file, int const line) { if(result) { std::cout << "CUDA error at " << file << ":" << line << " error# " << result << "\n"; diff --git a/src/LinAlg/EVLOSER/MatrixCsr.hpp b/src/LinAlg/EVLOSER/MatrixCsr.hpp index bd7b6f6..5cdbe88 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.hpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.hpp @@ -8,41 +8,79 @@ class MatrixCsr public: MatrixCsr(); ~MatrixCsr(); + + /// Allocate device and host row-pointer storage for an n-by-n CSR matrix. void allocate_size(int n); + + /// Allocate device and host column-index/value storage for the current CSR matrix. void allocate_nnz(int nnz); + + /// Release all owned device and host CSR storage. void clear_data(); + /// Return the matrix dimension. int n() const { return n_; } + + /// Return the number of stored nonzeros. int nnz() const { return nnz_; } + + /// Return true when the required device CSR arrays have been allocated. bool has_device_storage() const; + + /// Return true when the required host CSR mirror arrays have been allocated. bool has_host_mirror() const; + + /** + * @brief Validate the host-side CSR structure before factorization/refactorization. + * + * Checks row-pointer monotonicity, final nnz consistency, and column-index bounds. + * + * @param caller Name of the caller used in diagnostic messages. + * @param silent_output Suppress diagnostic output when true. + * @return true if the host CSR structure is valid. + */ bool validate_host_structure(const char* caller, bool silent_output) const; + /// Return device row-pointer storage. int* device_irows() { return irows_; } + /// Return const device row-pointer storage. const int* device_irows() const { return irows_; } + /// Return device column-index storage. int* device_jcols() { return jcols_; } + /// Return const device column-index storage. const int* device_jcols() const { return jcols_; } + /// Return device value storage. double* device_vals() { return vals_; } + /// Return const device value storage. const double* device_vals() const { return vals_; } + /// Return host row-pointer mirror storage. int* host_irows() { return irows_host_; } + /// Return const host row-pointer mirror storage. const int* host_irows() const { return irows_host_; } + /// Return host column-index mirror storage. int* host_jcols() { return jcols_host_; } + /// Return const host column-index mirror storage. const int* host_jcols() const { return jcols_host_; } + /// Return host value mirror storage. double* host_vals() { return vals_host_; } + /// Return const host value mirror storage. const double* host_vals() const { return vals_host_; } + /// Copy host-side CSR arrays into device storage. void update_from_host_mirror(); + + /// Copy device CSR arrays into the host mirror. void copy_to_host_mirror(); private: @@ -66,7 +104,7 @@ class MatrixCsr * @param line - line at which the error occured */ template - void resolveCheckCudaError(T result, const char* const file, int const line); + void evloserCheckCudaError(T result, const char* const file, int const line); }; } // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp index 6541b64..8972062 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -64,7 +64,7 @@ #include #include -#define checkCudaErrors(val) resolveCheckCudaError((val), __FILE__, __LINE__) +#define checkCudaErrors(val) evloserCheckCudaError((val), __FILE__, __LINE__) namespace EVLOSER { @@ -938,7 +938,7 @@ int RefactorizationSolver::refactorizationSetupCusolverRf() // Error checking utility for CUDA // KS: might later become part of src/Utils, putting it here for now template -void RefactorizationSolver::resolveCheckCudaError(T result, const char* const file, int const line) +void RefactorizationSolver::evloserCheckCudaError(T result, const char* const file, int const line) { if(result) { fprintf(stdout, "CUDA error at %s:%d, error# %d\n", file, line, result); diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp index 6953e55..334cbca 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp @@ -57,7 +57,7 @@ #pragma once #include "klu.h" -#include "resolve_cusolver_defs.hpp" +#include "evloser_cusolver_defs.hpp" #include namespace EVLOSER @@ -78,9 +78,16 @@ class RefactorizationSolver RefactorizationSolver(int n); ~RefactorizationSolver(); + /// Enable allocation and use of iterative refinement. void enable_iterative_refinement(); + + /// Disable iterative refinement and release its owned state. void disable_iterative_refinement(); + + /// Return true when iterative refinement is enabled, allocated, and requested. bool iterative_refinement_active() const; + + /// Attach the current CSR matrix to the iterative refinement object. void setup_iterative_refinement_matrix(int n, int nnz); void configure_iterative_refinement(cusparseHandle_t cusparse_handle, cublasHandle_t cublas_handle, @@ -224,7 +231,10 @@ class RefactorizationSolver */ int createM(const int n, const int nnzL, const int* Lp, const int* Li, const int nnzU, const int* Up, const int* Ui); + /// Validate the current CSR system matrix before solver setup or refactorization. bool validate_system_matrix(const char* caller) const; + + /// Validate that KLU symbolic and numeric factors are available and dimensionally consistent. bool validate_klu_factorization(const char* caller) const; int initializeKLU(); @@ -234,9 +244,16 @@ class RefactorizationSolver int refactorizationSetupCusolverGLU(); int refactorizationSetupCusolverRf(); + /// Check and report a cuSOLVER RF status value. bool checkCusolverRfStatus(cusolverStatus_t status, const char* caller) const; + + /// Reset cuSOLVER RF values using the current device CSR matrix. int resetCusolverRfValues(const char* caller); + + /// Run cuSOLVER RF analysis on the configured RF handle. int analyzeCusolverRf(const char* caller); + + /// Run cuSOLVER RF numeric refactorization on the configured RF handle. int refactorizeCusolverRf(const char* caller); /** @@ -248,7 +265,7 @@ class RefactorizationSolver * @param line - line at which the error occured */ template - void resolveCheckCudaError(T result, const char* const file, int const line); + void evloserCheckCudaError(T result, const char* const file, int const line); }; } // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/resolve_cusolver_defs.hpp b/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp similarity index 99% rename from src/LinAlg/EVLOSER/resolve_cusolver_defs.hpp rename to src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp index ce3b1fa..7d640cd 100644 --- a/src/LinAlg/EVLOSER/resolve_cusolver_defs.hpp +++ b/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp @@ -43,7 +43,7 @@ // product endorsement purposes. /** - * @file hiop_cusolver_defs.hpp + * @file evloser_cusolver_defs.hpp * * @author Kasia Swirydowicz , PNNL * diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp index c483ba2..a9214de 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp @@ -54,13 +54,11 @@ */ #include "hiopLinSolverSparseEVLOSER.hpp" -#include #include "EVLOSER/RefactorizationSolver.hpp" #include "EVLOSER/MatrixCsr.hpp" #include "EVLOSER/IterativeRefinement.hpp" #include "hiop_blasdefs.hpp" -#include "KrylovSolverKernels.h" #include "cusparse_v2.h" #include @@ -77,7 +75,7 @@ * */ template -__global__ void mapArraysKernelEVLOSER(T* dst, const T* src, const I* mapidx, I n) +__global__ void evloser_map_arrays_kernel(T* dst, const T* src, const I* mapidx, I n) { I tid = blockDim.x * blockIdx.x + threadIdx.x; @@ -96,7 +94,7 @@ __global__ void mapArraysKernelEVLOSER(T* dst, const T* src, const I* mapidx, I * */ template -__global__ void addToArrayKernelEVLOSER(T* dst, const T* src, const I* mapidx, I n, I nnz) +__global__ void evloser_add_to_array_kernel(T* dst, const T* src, const I* mapidx, I n, I nnz) { I tid = blockDim.x * blockIdx.x + threadIdx.x; @@ -119,7 +117,7 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const factorizationSetupSucc_{0}, is_first_call_{true} { - // Create embedded ReSolve refactorization solver for the EVLOSER wrapper + // Create embedded EVLOSER refactorization solver solver_ = new EVLOSER::RefactorizationSolver(n); // If memory space is device, allocate host mirror for HiOp's KKT matrix in triplet format @@ -384,10 +382,10 @@ void hiopLinSolverSymSparseEVLOSER::update_matrix_values() const int blocksize = 512; int gridsize = (nnz_ + blocksize - 1) / blocksize; - mapArraysKernelEVLOSER<<>>(csr_vals, coo_vals, index_convert_CSR2Triplet_device_, nnz_); + evloser_map_arrays_kernel<<>>(csr_vals, coo_vals, index_convert_CSR2Triplet_device_, nnz_); gridsize = (n_ + blocksize - 1) / blocksize; - addToArrayKernelEVLOSER + evloser_add_to_array_kernel <<>>(csr_vals, coo_vals, index_convert_extra_Diag2CSR_device_, n_, coo_nnz); // If factorization was not successful, we need a copy of values on the host diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp index 29b2363..57787aa 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp @@ -62,7 +62,7 @@ #include /** Implements the sparse linear solver class using the EVLOSER interface - * to the embedded ReSolve backend. + * to the embedded EVLOSER backend. * * @ingroup LinearSolvers */ From d9f57cf4cb1453b2f610c530678b5bca2538418f Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:02 -0400 Subject: [PATCH 15/28] Add HIP EVLOSER sparse RAJA driver option --- src/Drivers/Sparse/CMakeLists.txt | 4 ++ src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp | 40 ++++++++++++++++--- 2 files changed, 39 insertions(+), 5 deletions(-) diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt index 0c1e2c2..707c86a 100644 --- a/src/Drivers/Sparse/CMakeLists.txt +++ b/src/Drivers/Sparse/CMakeLists.txt @@ -95,6 +95,10 @@ if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_CUDA AND HIOP_USE_RESOLVE) add_test(NAME NlpSparseRaja2_3 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-evloser_cuda_rf") endif() +if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_HIP AND HIOP_USE_RESOLVE) + add_test(NAME NlpSparseRaja2_4 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-evloser_hip_rf") +endif() + add_test(NAME NlpSparse3_1 COMMAND ${RUNCMD} "$" "500" "-selfcheck") if(HIOP_BUILD_SHARED AND NOT HIOP_USE_GPU ) add_test(NAME NlpSparseCinterface COMMAND ${RUNCMD} "$") diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp index 579837d..885f306 100644 --- a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp @@ -21,6 +21,7 @@ static bool parse_arguments(int argc, bool& use_resolve_cuda_glu, bool& use_resolve_cuda_rf, bool& use_evloser_cuda_rf, + bool& use_evloser_hip_rf, bool& use_ginkgo, bool& use_ginkgo_cuda, bool& use_ginkgo_hip) @@ -31,6 +32,7 @@ static bool parse_arguments(int argc, use_resolve_cuda_glu = false; use_resolve_cuda_rf = false; use_evloser_cuda_rf = false; + use_evloser_hip_rf = false; use_ginkgo = false; use_ginkgo_cuda = false; use_ginkgo_hip = false; @@ -51,6 +53,8 @@ static bool parse_arguments(int argc, use_resolve_cuda_rf = true; } else if(std::string(argv[4]) == "-evloser_cuda_rf") { use_evloser_cuda_rf = true; + } else if(std::string(argv[4]) == "-evloser_hip_rf") { + use_hip_cuda_rf = true; } else if(std::string(argv[4]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[4]) == "-ginkgo_cuda") { @@ -78,6 +82,8 @@ static bool parse_arguments(int argc, use_resolve_cuda_rf = true; } else if(std::string(argv[3]) == "-evloser_cuda_rf") { use_evloser_cuda_rf = true; + } else if(std::string(argv[3]) == "-evloser_hip_rf") { + use_hip_cuda_rf = true; } else if(std::string(argv[3]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[3]) == "-ginkgo_cuda") { @@ -105,6 +111,8 @@ static bool parse_arguments(int argc, use_resolve_cuda_rf = true; } else if(std::string(argv[2]) == "-evloser_cuda_rf") { use_evloser_cuda_rf = true; + } else if(std::string(argv[2]) == "-evloser_hip_rf") { + use_hip_cuda_rf = true; } else if(std::string(argv[2]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[2]) == "-ginkgo_cuda") { @@ -132,6 +140,8 @@ static bool parse_arguments(int argc, use_resolve_cuda_rf = true; } else if(std::string(argv[1]) == "-evloser_cuda_rf") { use_evloser_cuda_rf = true; + } else if(std::string(argv[1]) == "-evloser_hip_rf") { + use_hip_cuda_rf = true; } else if(std::string(argv[1]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[1]) == "-ginkgo_cuda") { @@ -170,26 +180,41 @@ static bool parse_arguments(int argc, } #endif +// EVLOSER HIP RF requires HIP support. +#ifndef HIOP_USE_HIP + if(use_evloser_hip_rf) { + printf("HiOp built without HIP support. "); + printf("Using default instead of EVLOSER ...\n"); + use_evloser_hip_rf = false; + } +#endif + // If ReSolve was selected, but inertia free approach was not, add inertia-free - if((use_resolve_cuda_glu || use_resolve_cuda_rf || use_evloser_cuda_rf) && !(inertia_free)) { + if((use_resolve_cuda_glu || use_resolve_cuda_rf || use_evloser_cuda_rf || use_evloser_hip_rf) && !(inertia_free)) { inertia_free = true; printf("LU solver from ReSolve library requires inertia free approach. "); printf("Enabling now ...\n"); } - if(use_resolve_cuda_glu && (use_resolve_cuda_rf || use_evloser_cuda_rf)) { + if(use_resolve_cuda_glu && (use_resolve_cuda_rf || use_evloser_cuda_rf || use_evloser_hip_rf)) { use_resolve_cuda_rf = false; use_evloser_cuda_rf = false; + use_evloser_hip_rf = false; printf("You can select either GLU or Rf refactorization, not both. "); printf("Using default GLU refactorization ...\n"); } - if(use_resolve_cuda_rf && use_evloser_cuda_rf) { + if(use_resolve_cuda_rf && (use_evloser_cuda_rf || use_evloser_hip_rf)) { use_evloser_cuda_rf = false; + use_evloser_hip_rf = false; printf("You can select either ReSolve or EVLOSER, not both. "); printf("Using ReSolve ...\n"); } + if(use_evloser_cuda_rf && use_evloser_hip_rf) { + use_evloser_hip_rf = false; + } + // If Ginkgo is not available, de-select it. #ifndef HIOP_USE_GINKGO if(use_ginkgo) { @@ -228,6 +253,9 @@ static void usage(const char* exeName) printf( " '-evloser_cuda_rf' : use EVLOSER linear solver with KLU factorization and cusolverRf refactorization " "[optional]\n"); + printf( + " '-evloser_hip_rf' : use EVLOSER linear solver with KLU factorization and hipsolverRf refactorization " + "[optional]\n"); printf(" '-ginkgo': use GINKGO linear solver [optional]\n"); } @@ -261,6 +289,7 @@ int main(int argc, char** argv) bool use_resolve_cuda_glu = false; bool use_resolve_cuda_rf = false; bool use_evloser_cuda_rf = false; + bool use_evloser_hip_rf = false; bool use_ginkgo = false; bool use_ginkgo_cuda = false; bool use_ginkgo_hip = false; @@ -272,6 +301,7 @@ int main(int argc, char** argv) use_resolve_cuda_glu, use_resolve_cuda_rf, use_evloser_cuda_rf, + use_evloser_hip_rf, use_ginkgo, use_ginkgo_cuda, use_ginkgo_hip)) { @@ -297,12 +327,12 @@ int main(int argc, char** argv) // only support cusolverLU right now, 2023.02.28 // lsq initialization of the duals fails for this example since the Jacobian is rank deficient // use zero initialization - if(use_evloser_cuda_rf) { + if(use_evloser_cuda_rf || use_evloser_hip_rf) { nlp.options->SetStringValue("linear_solver_sparse", "evloser"); } else { nlp.options->SetStringValue("linear_solver_sparse", "resolve"); } - if(use_resolve_cuda_rf || use_evloser_cuda_rf) { + if(use_resolve_cuda_rf || use_evloser_cuda_rf || use_evloser_hip_rf) { nlp.options->SetStringValue("resolve_refactorization", "rf"); nlp.options->SetIntegerValue("ir_inner_maxit", 20); nlp.options->SetIntegerValue("ir_outer_maxit", 0); From 79b50d3cead56be6fc8025ad89bf70221db423aa Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:02 -0400 Subject: [PATCH 16/28] Allow EVLOSER option in HIP builds --- src/Utils/hiopOptions.cpp | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/Utils/hiopOptions.cpp b/src/Utils/hiopOptions.cpp index 77cbdc7..8d5f7fe 100644 --- a/src/Utils/hiopOptions.cpp +++ b/src/Utils/hiopOptions.cpp @@ -1425,8 +1425,21 @@ void hiopOptionsNLP::ensure_consistence() } } +// EVLOSER requires either CUDA or HIP support. +#if !defined(HIOP_USE_CUDA) && !defined(HIOP_USE_HIP) + if(sol_sp == "evloser") { + if(is_user_defined("linear_solver_sparse")) { + log_printf(hovWarning, + "The option 'linear_solver_sparse=%s' is not valid without CUDA or HIP support enabled." + " Will use 'linear_solver_sparse=auto'.\n", + GetString("linear_solver_sparse").c_str()); + } + set_val("linear_solver_sparse", "auto"); + } +#endif // !defined(HIOP_USE_CUDA) && !defined(HIOP_USE_HIP) + #ifndef HIOP_USE_CUDA - if(sol_sp == "resolve" || sol_sp == "evloser" || sol_sp == "cusolver-chol") { + if(sol_sp == "resolve" || sol_sp == "cusolver-chol") { if(is_user_defined("linear_solver_sparse")) { log_printf(hovWarning, "The option 'linear_solver_sparse=%s' is not valid without CUDA support enabled." From 2fe20bde4aef9fe805a63d0f2348514a62c7d783 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:03 -0400 Subject: [PATCH 17/28] Use GPU backend wrappers in EVLOSER matrix storage --- src/LinAlg/EVLOSER/MatrixCsr.cpp | 38 +++++++-------- src/LinAlg/EVLOSER/MatrixCsr.hpp | 4 +- src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp | 34 +++++++++++++ src/LinAlg/EVLOSER/evloser_gpu_defs.hpp | 25 ++++++++++ src/LinAlg/EVLOSER/evloser_hipsolver_defs.hpp | 48 +++++++++++++++++++ 5 files changed, 128 insertions(+), 21 deletions(-) create mode 100644 src/LinAlg/EVLOSER/evloser_gpu_defs.hpp create mode 100644 src/LinAlg/EVLOSER/evloser_hipsolver_defs.hpp diff --git a/src/LinAlg/EVLOSER/MatrixCsr.cpp b/src/LinAlg/EVLOSER/MatrixCsr.cpp index ffb157c..6eb801f 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.cpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.cpp @@ -56,14 +56,14 @@ #include "hiop_blasdefs.hpp" #include "MatrixCsr.hpp" -#include "cusparse_v2.h" +#include "evloser_gpu_defs.hpp" #include #include #include #include #include -#define checkCudaErrors(val) evloserCheckCudaError((val), __FILE__, __LINE__) +#define checkGpuErrors(val) evloserCheckGpuError((val), __FILE__, __LINE__) namespace EVLOSER { @@ -96,15 +96,15 @@ void MatrixCsr::allocate_size(int n) } n_ = n; - checkCudaErrors(cudaMalloc(&irows_, (n_ + 1) * sizeof(int))); + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&irows_), (n_ + 1) * sizeof(int))); irows_host_ = new int[n_ + 1]{0}; } void MatrixCsr::allocate_nnz(int nnz) { if(jcols_ != nullptr || vals_ != nullptr || jcols_host_ != nullptr || vals_host_ != nullptr) { - checkCudaErrors(cudaFree(jcols_)); - checkCudaErrors(cudaFree(vals_)); + checkGpuErrors(evloserGpuFree(jcols_)); + checkGpuErrors(evloserGpuFree(vals_)); delete[] jcols_host_; delete[] vals_host_; @@ -120,17 +120,17 @@ void MatrixCsr::allocate_nnz(int nnz) return; } - checkCudaErrors(cudaMalloc(&jcols_, nnz_ * sizeof(int))); - checkCudaErrors(cudaMalloc(&vals_, nnz_ * sizeof(double))); + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&jcols_), nnz_ * sizeof(int))); + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&vals_), nnz_ * sizeof(double))); jcols_host_ = new int[nnz_]{0}; vals_host_ = new double[nnz_]{0}; } void MatrixCsr::clear_data() { - checkCudaErrors(cudaFree(irows_)); - checkCudaErrors(cudaFree(jcols_)); - checkCudaErrors(cudaFree(vals_)); + checkGpuErrors(evloserGpuFree(irows_)); + checkGpuErrors(evloserGpuFree(jcols_)); + checkGpuErrors(evloserGpuFree(vals_)); irows_ = nullptr; jcols_ = nullptr; @@ -153,11 +153,11 @@ void MatrixCsr::update_from_host_mirror() assert(has_device_storage()); assert(has_host_mirror()); - checkCudaErrors(cudaMemcpy(irows_, irows_host_, sizeof(int) * (n_ + 1), cudaMemcpyHostToDevice)); + checkGpuErrors(evloserGpuMemcpy(irows_, irows_host_, sizeof(int) * (n_ + 1), evloserMemcpyHostToDevice)); if(nnz_ > 0) { - checkCudaErrors(cudaMemcpy(jcols_, jcols_host_, sizeof(int) * nnz_, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(vals_, vals_host_, sizeof(double) * nnz_, cudaMemcpyHostToDevice)); + checkGpuErrors(evloserGpuMemcpy(jcols_, jcols_host_, sizeof(int) * nnz_, evloserMemcpyHostToDevice)); + checkGpuErrors(evloserGpuMemcpy(vals_, vals_host_, sizeof(double) * nnz_, evloserMemcpyHostToDevice)); } } @@ -166,11 +166,11 @@ void MatrixCsr::copy_to_host_mirror() assert(has_device_storage()); assert(has_host_mirror()); - checkCudaErrors(cudaMemcpy(irows_host_, irows_, sizeof(int) * (n_ + 1), cudaMemcpyDeviceToHost)); + checkGpuErrors(evloserGpuMemcpy(irows_host_, irows_, sizeof(int) * (n_ + 1), evloserMemcpyDeviceToHost)); if(nnz_ > 0) { - checkCudaErrors(cudaMemcpy(jcols_host_, jcols_, sizeof(int) * nnz_, cudaMemcpyDeviceToHost)); - checkCudaErrors(cudaMemcpy(vals_host_, vals_, sizeof(double) * nnz_, cudaMemcpyDeviceToHost)); + checkGpuErrors(evloserGpuMemcpy(jcols_host_, jcols_, sizeof(int) * nnz_, evloserMemcpyDeviceToHost)); + checkGpuErrors(evloserGpuMemcpy(vals_host_, vals_, sizeof(double) * nnz_, evloserMemcpyDeviceToHost)); } } @@ -232,13 +232,13 @@ bool MatrixCsr::validate_host_structure(const char* caller, bool silent_output) return true; } -// Error checking utility for CUDA +// Error checking utility for GPU backend // KS: might later become part of src/Utils, putting it here for now template -void MatrixCsr::evloserCheckCudaError(T result, const char* const file, int const line) +void MatrixCsr::evloserCheckGpuError(T result, const char* const file, int const line) { if(result) { - std::cout << "CUDA error at " << file << ":" << line << " error# " << result << "\n"; + std::cout << "GPU error at " << file << ":" << line << " error# " << result << "\n"; assert(false); } } diff --git a/src/LinAlg/EVLOSER/MatrixCsr.hpp b/src/LinAlg/EVLOSER/MatrixCsr.hpp index 5cdbe88..2465245 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.hpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.hpp @@ -96,7 +96,7 @@ class MatrixCsr double* vals_host_{nullptr}; /** - * @brief Check for CUDA errors. + * @brief Check for GPU backend errors. * * @tparam T - type of the result * @param result - result value @@ -104,7 +104,7 @@ class MatrixCsr * @param line - line at which the error occured */ template - void evloserCheckCudaError(T result, const char* const file, int const line); + void evloserCheckGpuError(T result, const char* const file, int const line); }; } // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp b/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp index 7d640cd..e48782e 100644 --- a/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp +++ b/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp @@ -60,6 +60,40 @@ #include #include #include "cusolverSp_LOWLEVEL_PREVIEW.h" +#include + +using evloserGpuError_t = cudaError_t; +using evloserGpuMemcpyKind_t = cudaMemcpyKind; + +static const evloserGpuError_t evloserGpuSuccess = cudaSuccess; +static const evloserGpuMemcpyKind_t evloserMemcpyHostToDevice = cudaMemcpyHostToDevice; +static const evloserGpuMemcpyKind_t evloserMemcpyDeviceToHost = cudaMemcpyDeviceToHost; +static const evloserGpuMemcpyKind_t evloserMemcpyDeviceToDevice = cudaMemcpyDeviceToDevice; + +inline evloserGpuError_t evloserGpuMalloc(void** ptr, size_t size) +{ + return cudaMalloc(ptr, size); +} + +inline evloserGpuError_t evloserGpuFree(void* ptr) +{ + return cudaFree(ptr); +} + +inline evloserGpuError_t evloserGpuMemcpy(void* dst, const void* src, size_t count, evloserGpuMemcpyKind_t kind) +{ + return cudaMemcpy(dst, src, count, kind); +} + +inline evloserGpuError_t evloserGpuDeviceSynchronize() +{ + return cudaDeviceSynchronize(); +} + +inline const char* evloserGpuGetErrorString(evloserGpuError_t status) +{ + return cudaGetErrorString(status); +} #include "cusolverRf.h" diff --git a/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp b/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp new file mode 100644 index 0000000..764935f --- /dev/null +++ b/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp @@ -0,0 +1,25 @@ +/** + * @file evloser_gpu_defs.hpp + * + * Selects CUDA or HIP GPU backend definitions for EVLOSER. + * + */ + +#ifndef EVLOSER_GPU_DEFS_H +#define EVLOSER_GPU_DEFS_H + +#if defined(HIOP_USE_CUDA) + +#include "evloser_cusolver_defs.hpp" + +#elif defined(HIOP_USE_HIP) + +#include "evloser_hipsolver_defs.hpp" + +#else + +#error "EVLOSER GPU backend requires either HIOP_USE_CUDA or HIOP_USE_HIP." + +#endif + +#endif // EVLOSER_GPU_DEFS_H diff --git a/src/LinAlg/EVLOSER/evloser_hipsolver_defs.hpp b/src/LinAlg/EVLOSER/evloser_hipsolver_defs.hpp new file mode 100644 index 0000000..a9e8cf6 --- /dev/null +++ b/src/LinAlg/EVLOSER/evloser_hipsolver_defs.hpp @@ -0,0 +1,48 @@ +/** + * @file evloser_hipsolver_defs.hpp + * + * Defines HIP GPU backend wrappers used by EVLOSER. + * + */ + +#ifndef EVLOSER_HIPSOLVER_DEFS_H +#define EVLOSER_HIPSOLVER_DEFS_H + +#include + +#include + +using evloserGpuError_t = hipError_t; +using evloserGpuMemcpyKind_t = hipMemcpyKind; + +static const evloserGpuError_t evloserGpuSuccess = hipSuccess; +static const evloserGpuMemcpyKind_t evloserMemcpyHostToDevice = hipMemcpyHostToDevice; +static const evloserGpuMemcpyKind_t evloserMemcpyDeviceToHost = hipMemcpyDeviceToHost; +static const evloserGpuMemcpyKind_t evloserMemcpyDeviceToDevice = hipMemcpyDeviceToDevice; + +inline evloserGpuError_t evloserGpuMalloc(void** ptr, size_t size) +{ + return hipMalloc(ptr, size); +} + +inline evloserGpuError_t evloserGpuFree(void* ptr) +{ + return hipFree(ptr); +} + +inline evloserGpuError_t evloserGpuMemcpy(void* dst, const void* src, size_t count, evloserGpuMemcpyKind_t kind) +{ + return hipMemcpy(dst, src, count, kind); +} + +inline evloserGpuError_t evloserGpuDeviceSynchronize() +{ + return hipDeviceSynchronize(); +} + +inline const char* evloserGpuGetErrorString(evloserGpuError_t status) +{ + return hipGetErrorString(status); +} + +#endif // EVLOSER_HIPSOLVER_DEFS_H From c8d981cee2cedda65170734ad95f812ca965432a Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 02:01:03 -0400 Subject: [PATCH 18/28] Add HIP EVLOSER RF backend support --- CMakeLists.txt | 6 +- src/Drivers/Sparse/CMakeLists.txt | 20 +- src/Drivers/Sparse/NlpSparseEx1Driver.cpp | 2 +- src/Drivers/Sparse/NlpSparseEx2Driver.cpp | 2 +- src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp | 16 +- src/LinAlg/CMakeLists.txt | 23 +- src/LinAlg/EVLOSER/CMakeLists.txt | 21 +- src/LinAlg/EVLOSER/IterativeRefinement.cpp | 3 + src/LinAlg/EVLOSER/IterativeRefinement.hpp | 72 +++++- src/LinAlg/EVLOSER/RefactorizationSolver.cpp | 162 ++++++++----- src/LinAlg/EVLOSER/RefactorizationSolver.hpp | 48 ++-- src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp | 119 ++++++++++ src/LinAlg/EVLOSER/evloser_gpu_defs.hpp | 2 +- src/LinAlg/EVLOSER/evloser_hipsolver_defs.hpp | 219 ++++++++++++++++++ src/LinAlg/hiopLinSolverSparseEVLOSER.cpp | 50 ++-- src/Optimization/hiopDualsUpdater.cpp | 4 + src/Optimization/hiopKKTLinSysSparse.cpp | 8 + 17 files changed, 642 insertions(+), 135 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 84e4f64..80b4997 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -97,7 +97,7 @@ cmake_dependent_option( HIOP_USE_MAGMA "Use Magma linear algebra" ON "HIOP_USE_GPU" OFF ) cmake_dependent_option( - HIOP_USE_RESOLVE "Build with cuSolver LU support" ON "HIOP_USE_CUDA" OFF + HIOP_USE_RESOLVE "Build with ReSolve/EVLOSER sparse solver support" ON "HIOP_USE_GPU" OFF ) add_library(hiop_tpl INTERFACE) @@ -286,6 +286,8 @@ endif(HIOP_USE_GPU) if(HIOP_USE_RAJA) # Look for CMake configuration file in RAJA installation + find_package(camp CONFIG REQUIRED) + find_package(RAJA CONFIG PATHS ${RAJA_DIR} ${RAJA_DIR}/share/raja/cmake REQUIRED) @@ -293,7 +295,7 @@ if(HIOP_USE_RAJA) find_package(umpire CONFIG PATHS ${umpire_DIR} ${umpire_DIR}/share/umpire/cmake REQUIRED) - target_link_libraries(hiop_tpl INTERFACE umpire RAJA) + target_link_libraries(hiop_tpl INTERFACE umpire RAJA camp) message(STATUS "Found RAJA pkg-config: ${RAJA_CONFIG}") message(STATUS "Found umpire pkg-config: ${umpire_CONFIG}") endif() diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt index 707c86a..436d705 100644 --- a/src/Drivers/Sparse/CMakeLists.txt +++ b/src/Drivers/Sparse/CMakeLists.txt @@ -17,15 +17,17 @@ target_link_libraries(NlpSparseEx3.exe HiOp::HiOp) add_executable(NlpSparseEx4.exe NlpSparseEx4.cpp NlpSparseEx4Driver.cpp) target_link_libraries(NlpSparseEx4.exe HiOp::HiOp) -if(HIOP_USE_RAJA) - if(HIOP_USE_GPU AND HIOP_USE_CUDA AND HIOP_USE_RESOLVE) - set_source_files_properties( - NlpSparseRajaEx2.cpp - NlpSparseRajaEx2Driver.cpp - PROPERTIES LANGUAGE CUDA - ) - - add_executable(NlpSparseRajaEx2.exe NlpSparseRajaEx2Driver.cpp NlpSparseRajaEx2.cpp) +if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_RESOLVE) + if(HIOP_USE_CUDA OR HIOP_USE_HIP) + if(HIOP_USE_CUDA) + set_source_files_properties( + NlpSparseRajaEx2.cpp + NlpSparseRajaEx2Driver.cpp + PROPERTIES LANGUAGE CUDA + ) + endif() + + add_executable(NlpSparseRajaEx2.exe NlpSparseRajaEx2Driver.cpp NlpSparseRajaEx2.cpp) target_link_libraries(NlpSparseRajaEx2.exe HiOp::HiOp) install(TARGETS NlpSparseRajaEx2.exe DESTINATION bin) list(APPEND hiopSparseEx_INTERFACE_HEADERS NlpSparseRajaEx2.hpp) diff --git a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp index 242f9dd..ef9a6a5 100644 --- a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp @@ -27,7 +27,7 @@ static bool parse_arguments(int argc, use_evloser = false; use_ginkgo = false; use_ginkgo_cuda = false; - use_ginkgo_cuda = false; + use_ginkgo_hip = false; force_fr = false; n = 3; scal = 1.0; diff --git a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp index 620a1c5..6a27733 100644 --- a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp @@ -29,7 +29,7 @@ static bool parse_arguments(int argc, use_evloser = false; use_ginkgo = false; use_ginkgo_cuda = false; - use_ginkgo_cuda = false; + use_ginkgo_hip = false; switch(argc) { case 1: // no arguments diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp index 885f306..591a0f3 100644 --- a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp @@ -54,7 +54,7 @@ static bool parse_arguments(int argc, } else if(std::string(argv[4]) == "-evloser_cuda_rf") { use_evloser_cuda_rf = true; } else if(std::string(argv[4]) == "-evloser_hip_rf") { - use_hip_cuda_rf = true; + use_evloser_hip_rf = true; } else if(std::string(argv[4]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[4]) == "-ginkgo_cuda") { @@ -83,7 +83,7 @@ static bool parse_arguments(int argc, } else if(std::string(argv[3]) == "-evloser_cuda_rf") { use_evloser_cuda_rf = true; } else if(std::string(argv[3]) == "-evloser_hip_rf") { - use_hip_cuda_rf = true; + use_evloser_hip_rf = true; } else if(std::string(argv[3]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[3]) == "-ginkgo_cuda") { @@ -112,7 +112,7 @@ static bool parse_arguments(int argc, } else if(std::string(argv[2]) == "-evloser_cuda_rf") { use_evloser_cuda_rf = true; } else if(std::string(argv[2]) == "-evloser_hip_rf") { - use_hip_cuda_rf = true; + use_evloser_hip_rf = true; } else if(std::string(argv[2]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[2]) == "-ginkgo_cuda") { @@ -141,7 +141,7 @@ static bool parse_arguments(int argc, } else if(std::string(argv[1]) == "-evloser_cuda_rf") { use_evloser_cuda_rf = true; } else if(std::string(argv[1]) == "-evloser_hip_rf") { - use_hip_cuda_rf = true; + use_evloser_hip_rf = true; } else if(std::string(argv[1]) == "-ginkgo") { use_ginkgo = true; } else if(std::string(argv[1]) == "-ginkgo_cuda") { @@ -332,11 +332,17 @@ int main(int argc, char** argv) } else { nlp.options->SetStringValue("linear_solver_sparse", "resolve"); } - if(use_resolve_cuda_rf || use_evloser_cuda_rf || use_evloser_hip_rf) { + if(use_resolve_cuda_rf || use_evloser_cuda_rf) { nlp.options->SetStringValue("resolve_refactorization", "rf"); nlp.options->SetIntegerValue("ir_inner_maxit", 20); nlp.options->SetIntegerValue("ir_outer_maxit", 0); } + + if(use_evloser_hip_rf) { + nlp.options->SetStringValue("resolve_refactorization", "rf"); + nlp.options->SetIntegerValue("ir_inner_maxit", 0); + nlp.options->SetIntegerValue("ir_outer_maxit", 0); + } nlp.options->SetStringValue("duals_init", "zero"); nlp.options->SetStringValue("mem_space", "device"); nlp.options->SetStringValue("fact_acceptor", "inertia_free"); diff --git a/src/LinAlg/CMakeLists.txt b/src/LinAlg/CMakeLists.txt index 482d0f1..3aacfc0 100644 --- a/src/LinAlg/CMakeLists.txt +++ b/src/LinAlg/CMakeLists.txt @@ -154,12 +154,18 @@ if(HIOP_SPARSE) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_PARDISO_SRC}) endif(HIOP_USE_PARDISO) if(HIOP_USE_RESOLVE) - add_subdirectory(ReSolve) + if(HIOP_USE_CUDA) + add_subdirectory(ReSolve) + list(APPEND hiopLinAlg_SRC ${hiopLinAlg_CUSOLVER_LU_SRC}) + set_source_files_properties(${hiopLinAlg_CUSOLVER_LU_SRC} PROPERTIES LANGUAGE CUDA) + endif(HIOP_USE_CUDA) + add_subdirectory(EVLOSER) - list(APPEND hiopLinAlg_SRC ${hiopLinAlg_CUSOLVER_LU_SRC}) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_EVLOSER_SRC}) - set_source_files_properties(${hiopLinAlg_CUSOLVER_LU_SRC} PROPERTIES LANGUAGE CUDA) - set_source_files_properties(${hiopLinAlg_EVLOSER_SRC} PROPERTIES LANGUAGE CUDA) + + if(HIOP_USE_CUDA) + set_source_files_properties(${hiopLinAlg_EVLOSER_SRC} PROPERTIES LANGUAGE CUDA) + endif(HIOP_USE_CUDA) endif(HIOP_USE_RESOLVE) if(HIOP_USE_CUDA) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_CUSOLVER_CHOL_SRC}) @@ -228,7 +234,12 @@ install( add_library(hiopLinAlg OBJECT ${hiopLinAlg_SRC}) if(HIOP_USE_RESOLVE) - target_link_libraries(hiop_tpl INTERFACE ReSolve EVLOSER) - install(TARGETS ReSolve EVLOSER EXPORT hiop-targets) + target_link_libraries(hiop_tpl INTERFACE EVLOSER) + install(TARGETS EVLOSER EXPORT hiop-targets) + + if(HIOP_USE_CUDA) + target_link_libraries(hiop_tpl INTERFACE ReSolve) + install(TARGETS ReSolve EXPORT hiop-targets) + endif(HIOP_USE_CUDA) endif() target_link_libraries(hiopLinAlg PRIVATE hiop_tpl) diff --git a/src/LinAlg/EVLOSER/CMakeLists.txt b/src/LinAlg/EVLOSER/CMakeLists.txt index b7fbf9c..8dee32c 100644 --- a/src/LinAlg/EVLOSER/CMakeLists.txt +++ b/src/LinAlg/EVLOSER/CMakeLists.txt @@ -3,14 +3,27 @@ set(EVLOSER_SRC RefactorizationSolver.cpp MatrixCsr.cpp - IterativeRefinement.cpp - KrylovSolverKernels.cu ) -set_source_files_properties(${EVLOSER_SRC} PROPERTIES LANGUAGE CUDA) + +if(HIOP_USE_CUDA) + list(APPEND EVLOSER_SRC + IterativeRefinement.cpp + KrylovSolverKernels.cu + ) + set_source_files_properties(${EVLOSER_SRC} PROPERTIES LANGUAGE CUDA) +endif() add_library(EVLOSER STATIC ${EVLOSER_SRC}) target_include_directories(EVLOSER INTERFACE $ $ ) -target_link_libraries(EVLOSER PRIVATE KLU hiop_cuda) +target_link_libraries(EVLOSER PRIVATE KLU) + +if(HIOP_USE_CUDA) + target_link_libraries(EVLOSER PRIVATE hiop_cuda) +endif() + +if(HIOP_USE_HIP) + target_link_libraries(EVLOSER PRIVATE hipsparse hipsolver hipblas) +endif() diff --git a/src/LinAlg/EVLOSER/IterativeRefinement.cpp b/src/LinAlg/EVLOSER/IterativeRefinement.cpp index 5e4caf3..109f0cd 100644 --- a/src/LinAlg/EVLOSER/IterativeRefinement.cpp +++ b/src/LinAlg/EVLOSER/IterativeRefinement.cpp @@ -55,6 +55,8 @@ #include "IterativeRefinement.hpp" +#if !defined(HIOP_USE_HIP) && !defined(HAVE_HIP) + #include "hiop_blasdefs.hpp" #include "KrylovSolverKernels.h" @@ -695,3 +697,4 @@ void IterativeRefinement::evloserCheckCudaError(T result, const char* const file } } // namespace EVLOSER +#endif // !defined(HIOP_USE_HIP) && !defined(HAVE_HIP) diff --git a/src/LinAlg/EVLOSER/IterativeRefinement.hpp b/src/LinAlg/EVLOSER/IterativeRefinement.hpp index f0d7bac..2bc9f7b 100644 --- a/src/LinAlg/EVLOSER/IterativeRefinement.hpp +++ b/src/LinAlg/EVLOSER/IterativeRefinement.hpp @@ -9,7 +9,7 @@ #pragma once #include "klu.h" -#include "evloser_cusolver_defs.hpp" +#include "evloser_gpu_defs.hpp" #include namespace EVLOSER @@ -19,6 +19,70 @@ constexpr double ZERO = 0.0; constexpr double EPSILON = 1.0e-18; constexpr double EPSMAC = 1.0e-16; +#if defined(HIOP_USE_HIP) || defined(HAVE_HIP) + +/** + * @brief No-op iterative refinement interface for HIP EVLOSER builds. + * + * EVLOSER iterative refinement currently depends on CUDA-only GLU/Krylov + * kernels. HIP builds keep this interface available so shared solver code can + * compile, but the HIP EVLOSER path disables iterative refinement. + */ +class IterativeRefinement +{ +public: + IterativeRefinement() = default; + IterativeRefinement(int restart, double tol, int maxit) + : restart_{restart}, + maxit_{maxit}, + tol_{tol} + {} + ~IterativeRefinement() = default; + + int setup(cusparseHandle_t, + cublasHandle_t, + evloserRfHandle_t, + int, + double*, + int*, + int*, + double*, + double*) + { + return -1; + } + + int getFinalNumberOfIterations() { return 0; } + double getFinalResidalNorm() { return 0.0; } + double getInitialResidalNorm() { return 0.0; } + double getBNorm() { return 0.0; } + + void fgmres(double*, double*) {} + + void set_tol(double tol) { tol_ = tol; } + + int setup_system_matrix(int, int, int*, int*, double*) { return -1; } + + int& maxit() { return maxit_; } + + double& tol() { return tol_; } + + std::string& orth_option() { return orth_option_; } + + int& restart() { return restart_; } + + int& conv_cond() { return conv_cond_; } + +private: + int restart_{0}; + int maxit_{0}; + double tol_{0.0}; + int conv_cond_{0}; + std::string orth_option_{"mgs"}; +}; + +#else + /** * @brief Iterative refinement class * @@ -31,7 +95,7 @@ class IterativeRefinement ~IterativeRefinement(); int setup(cusparseHandle_t cusparse_handle, cublasHandle_t cublas_handle, - cusolverRfHandle_t cusolverrf_handle, + evloserRfHandle_t cusolverrf_handle, int n, double* d_T, int* d_P, @@ -105,7 +169,7 @@ class IterativeRefinement // CUDA libraries handles - MUST BE SET AT INIT cusparseHandle_t cusparse_handle_{nullptr}; cublasHandle_t cublas_handle_{nullptr}; - cusolverRfHandle_t cusolverrf_handle_{nullptr}; + evloserRfHandle_t cusolverrf_handle_{nullptr}; cusolverSpHandle_t cusolver_handle_{nullptr}; // GPU data (?) @@ -173,4 +237,6 @@ class IterativeRefinement void evloserCheckCudaError(T result, const char* const file, int const line); }; +#endif // defined(HIOP_USE_HIP) || defined(HAVE_HIP) + } // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp index 8972062..d3e44d4 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -58,13 +58,13 @@ #include "RefactorizationSolver.hpp" #include "klu.h" -#include "cusparse_v2.h" +#include #include #include #include #include -#define checkCudaErrors(val) evloserCheckCudaError((val), __FILE__, __LINE__) +#define checkGpuErrors(val) evloserCheckGpuError((val), __FILE__, __LINE__) namespace EVLOSER { @@ -279,17 +279,19 @@ RefactorizationSolver::RefactorizationSolver(int n) hostx_ = new double[n_]; // Allocate solution and rhs vectors - checkCudaErrors(cudaMalloc(&devx_, n_ * sizeof(double))); - checkCudaErrors(cudaMalloc(&devr_, n_ * sizeof(double))); + checkGpuErrors(evloserGpuMalloc((void**)&devx_, n_ * sizeof(double))); + checkGpuErrors(evloserGpuMalloc((void**)&devr_, n_ * sizeof(double))); } RefactorizationSolver::~RefactorizationSolver() { - if(iterative_refinement_enabled_) delete ir_; + delete ir_; delete mat_A_csr_; // Delete workspaces and handles - cudaFree(d_work_); + if(d_work_ != nullptr) { + (void)evloserGpuFree(d_work_); + } cusparseDestroy(handle_); cusolverSpDestroy(handle_cusolver_); cublasDestroy(handle_cublas_); @@ -299,8 +301,12 @@ RefactorizationSolver::~RefactorizationSolver() delete[] hostx_; // Delete residual and solution vectors - cudaFree(devr_); - cudaFree(devx_); + if(devr_ != nullptr) { + (void)evloserGpuFree(devr_); + } + if(devx_ != nullptr) { + (void)evloserGpuFree(devx_); + } // Delete matrix descriptor used in cuSolverGLU setup if(cusolver_glu_enabled_) { @@ -309,9 +315,15 @@ RefactorizationSolver::~RefactorizationSolver() } if(cusolver_rf_enabled_) { - cudaFree(d_P_); - cudaFree(d_Q_); - cudaFree(d_T_); + if(d_P_ != nullptr) { + (void)evloserGpuFree(d_P_); + } + if(d_Q_ != nullptr) { + (void)evloserGpuFree(d_Q_); + } + if(d_T_ != nullptr) { + (void)evloserGpuFree(d_T_); + } } klu_free_symbolic(&Symbolic_, &Common_); @@ -355,7 +367,7 @@ void RefactorizationSolver::setup_iterative_refinement_matrix(int n, int nnz) // TODO: Can this function be merged with setup_iterative_refinement_matrix ? void RefactorizationSolver::configure_iterative_refinement(cusparseHandle_t cusparse_handle, cublasHandle_t cublas_handle, - cusolverRfHandle_t cusolverrf_handle, + evloserRfHandle_t cusolverrf_handle, int n, double* d_T, int* d_P, @@ -425,22 +437,22 @@ bool RefactorizationSolver::validate_klu_factorization(const char* caller) const return true; } -bool RefactorizationSolver::checkCusolverRfStatus(cusolverStatus_t status, const char* caller) const +bool RefactorizationSolver::checkEvloserRfStatus(evloserRfStatus_t status, const char* caller) const { - if(status == CUSOLVER_STATUS_SUCCESS) { + if(status == evloserRfSuccess) { return true; } if(!silent_output_) { - std::cout << "[EVLOSER] " << caller << " failed with cuSOLVER status " << status << "\n"; + std::cout << "[EVLOSER] " << caller << " failed with GPU RF status " << status << "\n"; } return false; } -int RefactorizationSolver::resetCusolverRfValues(const char* caller) +int RefactorizationSolver::resetEvloserRfValues(const char* caller) { - sp_status_ = cusolverRfResetValues(n_, + sp_status_ = evloserRfResetValues(n_, nnz_, mat_A_csr_->device_irows(), mat_A_csr_->device_jcols(), @@ -449,24 +461,24 @@ int RefactorizationSolver::resetCusolverRfValues(const char* caller) d_Q_, handle_rf_); - if(!checkCusolverRfStatus(sp_status_, caller)) { + if(!checkEvloserRfStatus(sp_status_, caller)) { return -1; } - checkCudaErrors(cudaDeviceSynchronize()); + checkGpuErrors(evloserGpuDeviceSynchronize()); return 0; } -int RefactorizationSolver::analyzeCusolverRf(const char* caller) +int RefactorizationSolver::analyzeEvloserRf(const char* caller) { - sp_status_ = cusolverRfAnalyze(handle_rf_); - return checkCusolverRfStatus(sp_status_, caller) ? 0 : -1; + sp_status_ = evloserRfAnalyze(handle_rf_); + return checkEvloserRfStatus(sp_status_, caller) ? 0 : -1; } -int RefactorizationSolver::refactorizeCusolverRf(const char* caller) +int RefactorizationSolver::refactorizeEvloserRf(const char* caller) { - sp_status_ = cusolverRfRefactor(handle_rf_); - return checkCusolverRfStatus(sp_status_, caller) ? 0 : -1; + sp_status_ = evloserRfRefactor(handle_rf_); + return checkEvloserRfStatus(sp_status_, caller) ? 0 : -1; } int RefactorizationSolver::setup_factorization() @@ -513,11 +525,16 @@ void RefactorizationSolver::setup_refactorization() } if(refact_ == "glu") { - initializeCusolverGLU(); - refactorizationSetupCusolverGLU(); + if(initializeCusolverGLU() != 0) { + return; + } + + if(refactorizationSetupCusolverGLU() != 0) { + return; + } } else if(refact_ == "rf") { if(initializeCusolverRf() != 0 || refactorizationSetupCusolverRf() != 0) { - assert(false && "cuSOLVER RF setup failed."); + assert(false && "EVLOSER RF setup failed."); return; } if(iterative_refinement_active()) { @@ -547,10 +564,10 @@ int RefactorizationSolver::refactorize() sp_status_ = cusolverSpDgluFactor(handle_cusolver_, info_M_, d_work_); } else { if(refact_ == "rf") { - if(resetCusolverRfValues("cuSOLVER RF reset values") != 0) { + if(resetEvloserRfValues("GPU RF reset values") != 0) { return -1; } - if(refactorizeCusolverRf("cuSOLVER RF refactorization") != 0) { + if(refactorizeEvloserRf("GPU RF refactorization") != 0) { return -1; } } @@ -563,10 +580,10 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string if(refact_ == "glu") { double* devx = nullptr; if(memspace == "device") { - checkCudaErrors(cudaMemcpy(devr_, dx, sizeof(double) * n_, cudaMemcpyDeviceToDevice)); + checkGpuErrors(evloserGpuMemcpy(devr_, dx, sizeof(double) * n_, evloserMemcpyDeviceToDevice)); devx = dx; } else { - checkCudaErrors(cudaMemcpy(devr_, dx, sizeof(double) * n_, cudaMemcpyHostToDevice)); + checkGpuErrors(evloserGpuMemcpy(devr_, dx, sizeof(double) * n_, evloserMemcpyHostToDevice)); devx = devx_; } sp_status_ = cusolverSpDgluSolve(handle_cusolver_, @@ -590,7 +607,7 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string if(memspace == "device") { // do nothing } else { - checkCudaErrors(cudaMemcpy(dx, devx_, sizeof(double) * n_, cudaMemcpyDeviceToHost)); + checkGpuErrors(evloserGpuMemcpy(dx, devx_, sizeof(double) * n_, evloserMemcpyDeviceToHost)); } return true; } @@ -600,7 +617,7 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string if(is_first_solve_) { double* hostx = nullptr; if(memspace == "device") { - checkCudaErrors(cudaMemcpy(hostx_, dx, sizeof(double) * n_, cudaMemcpyDeviceToHost)); + checkGpuErrors(evloserGpuMemcpy(hostx_, dx, sizeof(double) * n_, evloserMemcpyDeviceToHost)); hostx = hostx_; } else { hostx = dx; @@ -610,7 +627,7 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string klu_free_symbolic(&Symbolic_, &Common_); is_first_solve_ = false; if(memspace == "device") { - checkCudaErrors(cudaMemcpy(dx, hostx, sizeof(double) * n_, cudaMemcpyHostToDevice)); + checkGpuErrors(evloserGpuMemcpy(dx, hostx, sizeof(double) * n_, evloserMemcpyHostToDevice)); } else { // do nothing } @@ -620,15 +637,15 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string double* devx = nullptr; if(memspace == "device") { devx = dx; - checkCudaErrors(cudaMemcpy(devr_, dx, sizeof(double) * n_, cudaMemcpyDeviceToDevice)); + checkGpuErrors(evloserGpuMemcpy(devr_, dx, sizeof(double) * n_, evloserMemcpyDeviceToDevice)); } else { - checkCudaErrors(cudaMemcpy(devx_, dx, sizeof(double) * n_, cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(devr_, devx_, sizeof(double) * n_, cudaMemcpyDeviceToDevice)); + checkGpuErrors(evloserGpuMemcpy(devx_, dx, sizeof(double) * n_, evloserMemcpyHostToDevice)); + checkGpuErrors(evloserGpuMemcpy(devr_, devx_, sizeof(double) * n_, evloserMemcpyDeviceToDevice)); devx = devx_; } // Each next solve is performed on GPU - sp_status_ = cusolverRfSolve(handle_rf_, + sp_status_ = evloserRfSolve(handle_rf_, d_P_, d_Q_, 1, @@ -658,7 +675,7 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string if(memspace == "device") { // do nothing } else { - checkCudaErrors(cudaMemcpy(dx, devx_, sizeof(double) * n_, cudaMemcpyDeviceToHost)); + checkGpuErrors(evloserGpuMemcpy(dx, devx_, sizeof(double) * n_, evloserMemcpyDeviceToHost)); } return true; } @@ -738,6 +755,11 @@ int RefactorizationSolver::initializeKLU() int RefactorizationSolver::initializeCusolverGLU() { +#if defined(HIOP_USE_HIP) || defined(HAVE_HIP) + std::cerr << "EVLOSER GLU refactorization is not supported on HIP. Use RF instead.\n"; + return -1; +#endif + cusparseCreateMatDescr(&descr_M_); cusparseSetMatType(descr_M_, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr_M_, CUSPARSE_INDEX_BASE_ZERO); @@ -753,32 +775,39 @@ int RefactorizationSolver::initializeCusolverGLU() int RefactorizationSolver::initializeCusolverRf() { - if(!checkCusolverRfStatus(cusolverRfCreate(&handle_rf_), "cusolverRfCreate")) { + if(!checkEvloserRfStatus(evloserRfCreate(&handle_rf_), "evloserRfCreate")) { return -1; } - sp_status_ = cusolverRfSetAlgs(handle_rf_, CUSOLVERRF_FACTORIZATION_ALG2, CUSOLVERRF_TRIANGULAR_SOLVE_ALG2); - if(!checkCusolverRfStatus(sp_status_, "cusolverRfSetAlgs")) { +#if defined(HIOP_USE_HIP) || defined(HAVE_HIP) + /* + * hipSOLVER RF uses the default RF settings. Some CUDA RF tuning calls are + * not portable to HIP. + */ +#else + sp_status_ = evloserRfSetAlgs(handle_rf_, evloserRfFactorizationAlg2, evloserRfTriangularSolveAlg2); + if(!checkEvloserRfStatus(sp_status_, "evloserRfSetAlgs")) { return -1; } - sp_status_ = cusolverRfSetMatrixFormat(handle_rf_, CUSOLVERRF_MATRIX_FORMAT_CSR, CUSOLVERRF_UNIT_DIAGONAL_STORED_L); - if(!checkCusolverRfStatus(sp_status_, "cusolverRfSetMatrixFormat")) { + sp_status_ = evloserRfSetMatrixFormat(handle_rf_, evloserRfMatrixFormatCsr, evloserRfUnitDiagonalStoredL); + if(!checkEvloserRfStatus(sp_status_, "evloserRfSetMatrixFormat")) { return -1; } - sp_status_ = cusolverRfSetResetValuesFastMode(handle_rf_, CUSOLVERRF_RESET_VALUES_FAST_MODE_ON); - if(!checkCusolverRfStatus(sp_status_, "cusolverRfSetResetValuesFastMode")) { + sp_status_ = evloserRfSetResetValuesFastMode(handle_rf_, evloserRfResetValuesFastModeOn); + if(!checkEvloserRfStatus(sp_status_, "evloserRfSetResetValuesFastMode")) { return -1; } const double boost = 1e-12; const double zero = 1e-14; - sp_status_ = cusolverRfSetNumericProperties(handle_rf_, zero, boost); - if(!checkCusolverRfStatus(sp_status_, "cusolverRfSetNumericProperties")) { + sp_status_ = evloserRfSetNumericProperties(handle_rf_, zero, boost); + if(!checkEvloserRfStatus(sp_status_, "evloserRfSetNumericProperties")) { return -1; } +#endif cusolver_rf_enabled_ = true; return 0; @@ -788,6 +817,11 @@ int RefactorizationSolver::initializeCusolverRf() // poor while using refactorization. int RefactorizationSolver::refactorizationSetupCusolverGLU() { +#if defined(HIOP_USE_HIP) || defined(HAVE_HIP) + std::cerr << "EVLOSER GLU refactorization is not supported on HIP. Use RF instead.\n"; + return -1; +#endif + // for now this ONLY WORKS if proceeded by KLU. Might be worth decoupling // later @@ -856,7 +890,7 @@ int RefactorizationSolver::refactorizationSetupCusolverGLU() assert(CUSOLVER_STATUS_SUCCESS == sp_status_); buffer_size_ = size_M_; - checkCudaErrors(cudaMalloc((void**)&d_work_, buffer_size_)); + checkGpuErrors(evloserGpuMalloc((void**)&d_work_, buffer_size_)); sp_status_ = cusolverSpDgluAnalysis(handle_cusolver_, info_M_, d_work_); assert(CUSOLVER_STATUS_SUCCESS == sp_status_); @@ -901,14 +935,14 @@ int RefactorizationSolver::refactorizationSetupCusolverRf() return -1; } - checkCudaErrors(cudaMalloc(&d_P_, n_ * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_Q_, n_ * sizeof(int))); - checkCudaErrors(cudaMalloc(&d_T_, n_ * sizeof(double))); + checkGpuErrors(evloserGpuMalloc(&d_P_, n_ * sizeof(int))); + checkGpuErrors(evloserGpuMalloc(&d_Q_, n_ * sizeof(int))); + checkGpuErrors(evloserGpuMalloc(&d_T_, n_ * sizeof(double))); - checkCudaErrors(cudaMemcpy(d_P_, Numeric_->Pnum, n_ * sizeof(int), cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(d_Q_, Symbolic_->Q, n_ * sizeof(int), cudaMemcpyHostToDevice)); + checkGpuErrors(evloserGpuMemcpy(d_P_, Numeric_->Pnum, n_ * sizeof(int), evloserMemcpyHostToDevice)); + checkGpuErrors(evloserGpuMemcpy(d_Q_, Symbolic_->Q, n_ * sizeof(int), evloserMemcpyHostToDevice)); - sp_status_ = cusolverRfSetupHost(n_, + sp_status_ = evloserRfSetupHost(n_, nnz_, mat_A_csr_->host_irows(), mat_A_csr_->host_jcols(), @@ -924,24 +958,26 @@ int RefactorizationSolver::refactorizationSetupCusolverRf() Numeric_->Pnum, Symbolic_->Q, handle_rf_); - if(!checkCusolverRfStatus(sp_status_, "cusolverRfSetupHost")) { + if(!checkEvloserRfStatus(sp_status_, "evloserRfSetupHost")) { return -1; } - if(analyzeCusolverRf("cuSOLVER RF analysis") != 0) { + if(analyzeEvloserRf("GPU RF analysis") != 0) { return -1; } - return refactorizeCusolverRf("cuSOLVER RF initial refactorization"); + return refactorizeEvloserRf("GPU RF initial refactorization"); } -// Error checking utility for CUDA +// Error checking utility for GPU backend calls // KS: might later become part of src/Utils, putting it here for now template -void RefactorizationSolver::evloserCheckCudaError(T result, const char* const file, int const line) +void RefactorizationSolver::evloserCheckGpuError(T result, const char* const file, int const line) { - if(result) { - fprintf(stdout, "CUDA error at %s:%d, error# %d\n", file, line, result); + if(result != evloserGpuSuccess) { + std::cout << "GPU backend error at " << file << ":" << line + << ", error# " << static_cast(result) + << ": " << evloserGpuGetErrorString(result) << "\n"; assert(false); } } diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp index 334cbca..6b6c451 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp @@ -57,7 +57,7 @@ #pragma once #include "klu.h" -#include "evloser_cusolver_defs.hpp" +#include "evloser_gpu_defs.hpp" #include namespace EVLOSER @@ -67,7 +67,7 @@ class MatrixCsr; class IterativeRefinement; /** - * @brief Implements refactorization solvers using KLU and cuSOLVER libraries + * @brief Implements refactorization solvers using KLU and GPU sparse solver libraries * */ class RefactorizationSolver @@ -91,7 +91,7 @@ class RefactorizationSolver void setup_iterative_refinement_matrix(int n, int nnz); void configure_iterative_refinement(cusparseHandle_t cusparse_handle, cublasHandle_t cublas_handle, - cusolverRfHandle_t cusolverrf_handle, + evloserRfHandle_t cusolverrf_handle, int n, double* d_T, int* d_P, @@ -165,8 +165,8 @@ class RefactorizationSolver MatrixCsr* mat_A_csr_{nullptr}; ///< System matrix in nonsymmetric CSR format IterativeRefinement* ir_{nullptr}; ///< Iterative refinement class - bool cusolver_glu_enabled_{false}; ///< cusolverGLU on/off flag - bool cusolver_rf_enabled_{false}; ///< cusolverRf on/off flag + bool cusolver_glu_enabled_{false}; ///< GLU refactorization enabled flag + bool cusolver_rf_enabled_{false}; ///< Rf refactorization enabled flag bool iterative_refinement_enabled_{false}; ///< Iterative refinement on/off flag bool is_first_solve_{true}; ///< If it is first call to triangular solver @@ -177,7 +177,7 @@ class RefactorizationSolver std::string use_ir_; bool silent_output_{true}; - /** needed for cuSolver **/ + /** needed for GPU sparse solver **/ cusolverStatus_t sp_status_; cusparseHandle_t handle_ = 0; @@ -189,12 +189,12 @@ class RefactorizationSolver csrluInfoHost_t info_lu_ = nullptr; csrgluInfo_t info_M_ = nullptr; - cusolverRfHandle_t handle_rf_ = nullptr; - size_t buffer_size_; - size_t size_M_; - double* d_work_; + evloserRfHandle_t handle_rf_ = nullptr; + size_t buffer_size_{0}; + size_t size_M_{0}; + double* d_work_{nullptr}; int ite_refine_succ_ = 0; - double r_nrminf_; + double r_nrminf_{0.0}; // KLU stuff int klu_status_; @@ -212,7 +212,7 @@ class RefactorizationSolver double* devx_ = nullptr; double* devr_ = nullptr; - /* needed for cuSolverRf */ + /* needed for GPU Rf */ int* d_P_ = nullptr; int* d_Q_ = nullptr; // permutation matrices double* d_T_ = nullptr; @@ -244,28 +244,28 @@ class RefactorizationSolver int refactorizationSetupCusolverGLU(); int refactorizationSetupCusolverRf(); - /// Check and report a cuSOLVER RF status value. - bool checkCusolverRfStatus(cusolverStatus_t status, const char* caller) const; + /// Check and report a GPU RF status value. + bool checkEvloserRfStatus(evloserRfStatus_t status, const char* caller) const; - /// Reset cuSOLVER RF values using the current device CSR matrix. - int resetCusolverRfValues(const char* caller); + /// Reset GPU RF values using the current device CSR matrix. + int resetEvloserRfValues(const char* caller); - /// Run cuSOLVER RF analysis on the configured RF handle. - int analyzeCusolverRf(const char* caller); + /// Run GPU RF analysis on the configured RF handle. + int analyzeEvloserRf(const char* caller); - /// Run cuSOLVER RF numeric refactorization on the configured RF handle. - int refactorizeCusolverRf(const char* caller); + /// Run GPU RF numeric refactorization on the configured RF handle. + int refactorizeEvloserRf(const char* caller); /** - * @brief Check for CUDA errors. + * @brief Check for GPU backend errors. * * @tparam T - type of the result * @param result - result value - * @param file - file name where the error occured - * @param line - line at which the error occured + * @param file - file name where the error occurred + * @param line - line at which the error occurred */ template - void evloserCheckCudaError(T result, const char* const file, int const line); + void evloserCheckGpuError(T result, const char* const file, int const line); }; } // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp b/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp index e48782e..ad565b9 100644 --- a/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp +++ b/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp @@ -97,6 +97,125 @@ inline const char* evloserGpuGetErrorString(evloserGpuError_t status) #include "cusolverRf.h" +using evloserRfStatus_t = cusolverStatus_t; +using evloserRfHandle_t = cusolverRfHandle_t; +using evloserRfFactorization_t = cusolverRfFactorization_t; +using evloserRfTriangularSolve_t = cusolverRfTriangularSolve_t; +using evloserRfMatrixFormat_t = cusolverRfMatrixFormat_t; +using evloserRfUnitDiagonal_t = cusolverRfUnitDiagonal_t; +using evloserRfResetValuesFastMode_t = cusolverRfResetValuesFastMode_t; + +static const evloserRfStatus_t evloserRfSuccess = CUSOLVER_STATUS_SUCCESS; +static const evloserRfFactorization_t evloserRfFactorizationAlg2 = CUSOLVERRF_FACTORIZATION_ALG2; +static const evloserRfTriangularSolve_t evloserRfTriangularSolveAlg2 = CUSOLVERRF_TRIANGULAR_SOLVE_ALG2; +static const evloserRfMatrixFormat_t evloserRfMatrixFormatCsr = CUSOLVERRF_MATRIX_FORMAT_CSR; +static const evloserRfUnitDiagonal_t evloserRfUnitDiagonalStoredL = CUSOLVERRF_UNIT_DIAGONAL_STORED_L; +static const evloserRfResetValuesFastMode_t evloserRfResetValuesFastModeOn = CUSOLVERRF_RESET_VALUES_FAST_MODE_ON; + +inline evloserRfStatus_t evloserRfCreate(evloserRfHandle_t* handle) +{ + return cusolverRfCreate(handle); +} + +inline evloserRfStatus_t evloserRfDestroy(evloserRfHandle_t handle) +{ + return cusolverRfDestroy(handle); +} + +inline evloserRfStatus_t evloserRfSetAlgs(evloserRfHandle_t handle, + evloserRfFactorization_t fact_alg, + evloserRfTriangularSolve_t solve_alg) +{ + return cusolverRfSetAlgs(handle, fact_alg, solve_alg); +} + +inline evloserRfStatus_t evloserRfSetMatrixFormat(evloserRfHandle_t handle, + evloserRfMatrixFormat_t format, + evloserRfUnitDiagonal_t diag) +{ + return cusolverRfSetMatrixFormat(handle, format, diag); +} + +inline evloserRfStatus_t evloserRfSetResetValuesFastMode(evloserRfHandle_t handle, + evloserRfResetValuesFastMode_t fast_mode) +{ + return cusolverRfSetResetValuesFastMode(handle, fast_mode); +} + +inline evloserRfStatus_t evloserRfSetNumericProperties(evloserRfHandle_t handle, double zero, double boost) +{ + return cusolverRfSetNumericProperties(handle, zero, boost); +} + +inline evloserRfStatus_t evloserRfSetupHost(int n, + int nnzA, + int* csrRowPtrA, + int* csrColIndA, + double* csrValA, + int nnzL, + int* csrRowPtrL, + int* csrColIndL, + double* csrValL, + int nnzU, + int* csrRowPtrU, + int* csrColIndU, + double* csrValU, + int* P, + int* Q, + evloserRfHandle_t handle) +{ + return cusolverRfSetupHost(n, + nnzA, + csrRowPtrA, + csrColIndA, + csrValA, + nnzL, + csrRowPtrL, + csrColIndL, + csrValL, + nnzU, + csrRowPtrU, + csrColIndU, + csrValU, + P, + Q, + handle); +} + +inline evloserRfStatus_t evloserRfResetValues(int n, + int nnzA, + int* csrRowPtrA, + int* csrColIndA, + double* csrValA, + int* P, + int* Q, + evloserRfHandle_t handle) +{ + return cusolverRfResetValues(n, nnzA, csrRowPtrA, csrColIndA, csrValA, P, Q, handle); +} + +inline evloserRfStatus_t evloserRfAnalyze(evloserRfHandle_t handle) +{ + return cusolverRfAnalyze(handle); +} + +inline evloserRfStatus_t evloserRfRefactor(evloserRfHandle_t handle) +{ + return cusolverRfRefactor(handle); +} + +inline evloserRfStatus_t evloserRfSolve(evloserRfHandle_t handle, + int* P, + int* Q, + int nrhs, + double* Temp, + int ldt, + double* XF, + int ldxf) +{ + return cusolverRfSolve(handle, P, Q, nrhs, Temp, ldt, XF, ldxf); +} + extern "C" { /* * prototype not in public header file diff --git a/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp b/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp index 764935f..295b20a 100644 --- a/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp +++ b/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp @@ -12,7 +12,7 @@ #include "evloser_cusolver_defs.hpp" -#elif defined(HIOP_USE_HIP) +#elif defined(HIOP_USE_HIP) || defined(HAVE_HIP) #include "evloser_hipsolver_defs.hpp" diff --git a/src/LinAlg/EVLOSER/evloser_hipsolver_defs.hpp b/src/LinAlg/EVLOSER/evloser_hipsolver_defs.hpp index a9e8cf6..adb095f 100644 --- a/src/LinAlg/EVLOSER/evloser_hipsolver_defs.hpp +++ b/src/LinAlg/EVLOSER/evloser_hipsolver_defs.hpp @@ -11,6 +11,9 @@ #include #include +#include +#include +#include using evloserGpuError_t = hipError_t; using evloserGpuMemcpyKind_t = hipMemcpyKind; @@ -25,6 +28,12 @@ inline evloserGpuError_t evloserGpuMalloc(void** ptr, size_t size) return hipMalloc(ptr, size); } +template +inline evloserGpuError_t evloserGpuMalloc(T** ptr, size_t size) +{ + return hipMalloc(reinterpret_cast(ptr), size); +} + inline evloserGpuError_t evloserGpuFree(void* ptr) { return hipFree(ptr); @@ -45,4 +54,214 @@ inline const char* evloserGpuGetErrorString(evloserGpuError_t status) return hipGetErrorString(status); } +/* + * Compatibility aliases for EVLOSER code that still uses CUDA-style sparse + * solver handle names. The EVLOSER source keeps those names so the HIP path + * stays close to the original ReSolve implementation. + */ +using cusolverStatus_t = hipsolverStatus_t; +using cusparseHandle_t = hipsparseHandle_t; +using cusolverSpHandle_t = hipsolverSpHandle_t; +using cublasHandle_t = hipblasHandle_t; +using cusparseMatDescr_t = hipsparseMatDescr_t; + +#define CUSOLVER_STATUS_SUCCESS HIPSOLVER_STATUS_SUCCESS + +#define cusparseCreate hipsparseCreate +#define cusparseDestroy hipsparseDestroy +#define cusparseCreateMatDescr hipsparseCreateMatDescr +#define cusparseDestroyMatDescr hipsparseDestroyMatDescr +#define cusparseSetMatType hipsparseSetMatType +#define cusparseSetMatIndexBase hipsparseSetMatIndexBase + +#define CUSPARSE_MATRIX_TYPE_GENERAL HIPSPARSE_MATRIX_TYPE_GENERAL +#define CUSPARSE_INDEX_BASE_ZERO HIPSPARSE_INDEX_BASE_ZERO + +#define cusolverSpCreate hipsolverSpCreate +#define cusolverSpDestroy hipsolverSpDestroy + +#define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy + +using evloserRfStatus_t = hipsolverStatus_t; +using evloserRfHandle_t = hipsolverRfHandle_t; +using evloserRfFactorization_t = hipsolverRfFactorization_t; +using evloserRfTriangularSolve_t = hipsolverRfTriangularSolve_t; +using evloserRfMatrixFormat_t = hipsolverRfMatrixFormat_t; +using evloserRfUnitDiagonal_t = hipsolverRfUnitDiagonal_t; +using evloserRfResetValuesFastMode_t = hipsolverRfResetValuesFastMode_t; + +static const evloserRfStatus_t evloserRfSuccess = HIPSOLVER_STATUS_SUCCESS; +static const evloserRfFactorization_t evloserRfFactorizationAlg2 = HIPSOLVERRF_FACTORIZATION_ALG2; +static const evloserRfTriangularSolve_t evloserRfTriangularSolveAlg2 = HIPSOLVERRF_TRIANGULAR_SOLVE_ALG2; +static const evloserRfMatrixFormat_t evloserRfMatrixFormatCsr = HIPSOLVERRF_MATRIX_FORMAT_CSR; +static const evloserRfUnitDiagonal_t evloserRfUnitDiagonalStoredL = HIPSOLVERRF_UNIT_DIAGONAL_STORED_L; +static const evloserRfResetValuesFastMode_t evloserRfResetValuesFastModeOn = HIPSOLVERRF_RESET_VALUES_FAST_MODE_ON; + +inline evloserRfStatus_t evloserRfCreate(evloserRfHandle_t* handle) +{ + return hipsolverRfCreate(handle); +} + +inline evloserRfStatus_t evloserRfDestroy(evloserRfHandle_t handle) +{ + return hipsolverRfDestroy(handle); +} + +inline evloserRfStatus_t evloserRfSetAlgs(evloserRfHandle_t handle, + evloserRfFactorization_t fact_alg, + evloserRfTriangularSolve_t solve_alg) +{ + return hipsolverRfSetAlgs(handle, fact_alg, solve_alg); +} + +inline evloserRfStatus_t evloserRfSetMatrixFormat(evloserRfHandle_t handle, + evloserRfMatrixFormat_t format, + evloserRfUnitDiagonal_t diag) +{ + return hipsolverRfSetMatrixFormat(handle, format, diag); +} + +inline evloserRfStatus_t evloserRfSetResetValuesFastMode(evloserRfHandle_t handle, + evloserRfResetValuesFastMode_t fast_mode) +{ + return hipsolverRfSetResetValuesFastMode(handle, fast_mode); +} + +inline evloserRfStatus_t evloserRfSetNumericProperties(evloserRfHandle_t handle, double zero, double boost) +{ + return hipsolverRfSetNumericProperties(handle, zero, boost); +} + +inline evloserRfStatus_t evloserRfSetupHost(int n, + int nnzA, + int* csrRowPtrA, + int* csrColIndA, + double* csrValA, + int nnzL, + int* csrRowPtrL, + int* csrColIndL, + double* csrValL, + int nnzU, + int* csrRowPtrU, + int* csrColIndU, + double* csrValU, + int* P, + int* Q, + evloserRfHandle_t handle) +{ + return hipsolverRfSetupHost(n, + nnzA, + csrRowPtrA, + csrColIndA, + csrValA, + nnzL, + csrRowPtrL, + csrColIndL, + csrValL, + nnzU, + csrRowPtrU, + csrColIndU, + csrValU, + P, + Q, + handle); +} + +inline evloserRfStatus_t evloserRfResetValues(int n, + int nnzA, + int* csrRowPtrA, + int* csrColIndA, + double* csrValA, + int* P, + int* Q, + evloserRfHandle_t handle) +{ + return hipsolverRfResetValues(n, nnzA, csrRowPtrA, csrColIndA, csrValA, P, Q, handle); +} + +inline evloserRfStatus_t evloserRfAnalyze(evloserRfHandle_t handle) +{ + return hipsolverRfAnalyze(handle); +} + +inline evloserRfStatus_t evloserRfRefactor(evloserRfHandle_t handle) +{ + return hipsolverRfRefactor(handle); +} + +inline evloserRfStatus_t evloserRfSolve(evloserRfHandle_t handle, + int* P, + int* Q, + int nrhs, + double* Temp, + int ldt, + double* XF, + int ldxf) +{ + return hipsolverRfSolve(handle, P, Q, nrhs, Temp, ldt, XF, ldxf); +} + +/* + * cuSOLVER GLU is CUDA-only. These stubs allow HIP EVLOSER builds to compile + * code paths that are present in the shared implementation but not used by the + * HIP RF validation path. + */ +using csrluInfoHost_t = void*; +using csrgluInfo_t = void*; + +template +inline cusolverStatus_t cusolverSpCreateGluInfo(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDestroyGluInfo(Args...) +{ + return CUSOLVER_STATUS_SUCCESS; +} + +template +inline cusolverStatus_t cusolverSpDgluSetup(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDgluBufferSize(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDgluAnalysis(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDgluReset(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDgluFactor(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDgluSolve(Args...) +{ + return static_cast(1); +} + +template +inline cusolverStatus_t cusolverSpDnrminf(Args...) +{ + return static_cast(1); +} + #endif // EVLOSER_HIPSOLVER_DEFS_H diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp index a9214de..5380b6d 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp @@ -57,12 +57,19 @@ #include "EVLOSER/RefactorizationSolver.hpp" #include "EVLOSER/MatrixCsr.hpp" #include "EVLOSER/IterativeRefinement.hpp" +#include "EVLOSER/evloser_gpu_defs.hpp" #include "hiop_blasdefs.hpp" +#ifdef HIOP_USE_CUDA #include "cusparse_v2.h" +#endif + +#include +#include #include #include +#include #define checkCudaErrors(val) hiopCheckCudaError((val), __FILE__, __LINE__) @@ -175,6 +182,11 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const maxit_test = 50; } use_ir = "no"; +#if defined(HIOP_USE_HIP) || defined(HAVE_HIP) + // EVLOSER iterative refinement currently depends on CUDA-only kernels. + // Keep the HIP path on RF only until the IR path is ported. + solver_->disable_iterative_refinement(); +#else if(maxit_test > 0) { use_ir = "yes"; solver_->enable_iterative_refinement(); @@ -182,6 +194,7 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const } else { solver_->disable_iterative_refinement(); } +#endif if(use_ir == "yes") { if((refact == "rf")) { solver_->ir()->restart() = nlp_->options->GetInteger("ir_inner_restart"); @@ -264,8 +277,8 @@ hiopLinSolverSymSparseEVLOSER::~hiopLinSolverSymSparseEVLOSER() // Delete CSR <--> triplet mappings delete[] index_convert_CSR2Triplet_host_; delete[] index_convert_extra_Diag2CSR_host_; - checkCudaErrors(cudaFree(index_convert_CSR2Triplet_device_)); - checkCudaErrors(cudaFree(index_convert_extra_Diag2CSR_device_)); + checkCudaErrors(evloserGpuFree(index_convert_CSR2Triplet_device_)); + checkCudaErrors(evloserGpuFree(index_convert_extra_Diag2CSR_device_)); } int hiopLinSolverSymSparseEVLOSER::matrixChanged() @@ -333,11 +346,16 @@ void hiopLinSolverSymSparseEVLOSER::firstCall() // If the matrix is on device, copy it to the host mirror std::string mem_space = nlp_->options->GetString("mem_space"); if(mem_space == "device") { - checkCudaErrors(cudaMemcpy(M_host_->M(), M_->M(), sizeof(double) * M_->numberOfNonzeros(), cudaMemcpyDeviceToHost)); - checkCudaErrors( - cudaMemcpy(M_host_->i_row(), M_->i_row(), sizeof(index_type) * M_->numberOfNonzeros(), cudaMemcpyDeviceToHost)); checkCudaErrors( - cudaMemcpy(M_host_->j_col(), M_->j_col(), sizeof(index_type) * M_->numberOfNonzeros(), cudaMemcpyDeviceToHost)); + evloserGpuMemcpy(M_host_->M(), M_->M(), sizeof(double) * M_->numberOfNonzeros(), evloserMemcpyDeviceToHost)); + checkCudaErrors(evloserGpuMemcpy(M_host_->i_row(), + M_->i_row(), + sizeof(index_type) * M_->numberOfNonzeros(), + evloserMemcpyDeviceToHost)); + checkCudaErrors(evloserGpuMemcpy(M_host_->j_col(), + M_->j_col(), + sizeof(index_type) * M_->numberOfNonzeros(), + evloserMemcpyDeviceToHost)); } // Transfer triplet to CSR form @@ -390,10 +408,10 @@ void hiopLinSolverSymSparseEVLOSER::update_matrix_values() // If factorization was not successful, we need a copy of values on the host if(factorizationSetupSucc_ == 0) - checkCudaErrors(cudaMemcpy(solver_->mat_A_csr()->host_vals(), + checkCudaErrors(evloserGpuMemcpy(solver_->mat_A_csr()->host_vals(), solver_->mat_A_csr()->device_vals(), sizeof(double) * nnz_, - cudaMemcpyDeviceToHost)); + evloserMemcpyDeviceToHost)); } else { // KKT matrix is on the host @@ -406,10 +424,10 @@ void hiopLinSolverSymSparseEVLOSER::update_matrix_values() if(index_convert_extra_Diag2CSR_host_[i] != -1) vals[index_convert_extra_Diag2CSR_host_[i]] += M_->M()[M_->numberOfNonzeros() - n_ + i]; } - checkCudaErrors(cudaMemcpy(solver_->mat_A_csr()->device_vals(), + checkCudaErrors(evloserGpuMemcpy(solver_->mat_A_csr()->device_vals(), solver_->mat_A_csr()->host_vals(), sizeof(double) * nnz_, - cudaMemcpyHostToDevice)); + evloserMemcpyHostToDevice)); } } @@ -476,8 +494,8 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() index_convert_CSR2Triplet_host_ = new int[nnz_]; index_convert_extra_Diag2CSR_host_ = new int[n_]; - checkCudaErrors(cudaMalloc(&index_convert_CSR2Triplet_device_, nnz_ * sizeof(int))); - checkCudaErrors(cudaMalloc(&index_convert_extra_Diag2CSR_device_, n_ * sizeof(int))); + checkCudaErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_CSR2Triplet_device_), nnz_ * sizeof(int))); + checkCudaErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_extra_Diag2CSR_device_), n_ * sizeof(int))); int* nnz_each_row_tmp = new int[n_]{0}; int total_nnz_tmp{0}, nnz_tmp{0}, rowID_tmp, colID_tmp; @@ -537,14 +555,14 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() std::sort(col_idx + row_ptr[i], col_idx + row_ptr[i + 1]); } } - checkCudaErrors(cudaMemcpy(index_convert_CSR2Triplet_device_, + checkCudaErrors(evloserGpuMemcpy(index_convert_CSR2Triplet_device_, index_convert_CSR2Triplet_host_, nnz_ * sizeof(int), - cudaMemcpyHostToDevice)); - checkCudaErrors(cudaMemcpy(index_convert_extra_Diag2CSR_device_, + evloserMemcpyHostToDevice)); + checkCudaErrors(evloserGpuMemcpy(index_convert_extra_Diag2CSR_device_, index_convert_extra_Diag2CSR_host_, n_ * sizeof(int), - cudaMemcpyHostToDevice)); + evloserMemcpyHostToDevice)); delete[] nnz_each_row_tmp; } diff --git a/src/Optimization/hiopDualsUpdater.cpp b/src/Optimization/hiopDualsUpdater.cpp index 45d62e4..ed0cfa8 100644 --- a/src/Optimization/hiopDualsUpdater.cpp +++ b/src/Optimization/hiopDualsUpdater.cpp @@ -72,7 +72,9 @@ #include "hiopLinSolverSparsePARDISO.hpp" #endif #ifdef HIOP_USE_RESOLVE +#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) #include "hiopLinSolverSparseReSolve.hpp" +#endif #include "hiopLinSolverSparseEVLOSER.hpp" #endif #ifdef HIOP_USE_GINKGO @@ -443,7 +445,9 @@ bool hiopDualsLsqUpdateLinsysAugSparse::instantiate_linear_solver(const char* li // This is our first choice on the device. if(linear_solver == "resolve" || linear_solver == "auto") { ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: ReSolve "; +#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) lin_sys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); +#endif } if(linear_solver == "evloser") { ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: EVLOSER "; diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp index ce8f843..57ecf53 100644 --- a/src/Optimization/hiopKKTLinSysSparse.cpp +++ b/src/Optimization/hiopKKTLinSysSparse.cpp @@ -58,7 +58,9 @@ #include "hiopLinSolverSparsePARDISO.hpp" #endif #ifdef HIOP_USE_RESOLVE +#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) #include "hiopLinSolverSparseReSolve.hpp" +#endif #include "hiopLinSolverSparseEVLOSER.hpp" #endif #ifdef HIOP_USE_GINKGO @@ -364,7 +366,9 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXYcYd::determineAndCreateLi if((nullptr == linSys_ && linear_solver == "auto") || linear_solver == "resolve") { #if defined(HIOP_USE_RESOLVE) +#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) linSys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); +#endif linsol_actual = "ReSolve"; auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); if(fact_acceptor_ic) { @@ -761,7 +765,9 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL if(linear_solver == "resolve" || linear_solver == "auto") { #if defined(HIOP_USE_RESOLVE) actual_lin_solver = "ReSolve"; +#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) linSys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); +#endif auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); if(fact_acceptor_ic) { nlp_->log->printf(hovError, @@ -847,7 +853,9 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL if(linear_solver == "resolve" || linear_solver == "auto") { #if defined(HIOP_USE_RESOLVE) +#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) linSys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); +#endif nlp_->log->printf(hovScalars, "KKT_SPARSE_XDYcYd linsys: alloc ReSolve size %d (%d cons) (gpu)\n", n, neq + nineq); auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); if(fact_acceptor_ic) { From c26f56b7122276cc5416e2e6912dbf67a47de364 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 17 Jun 2026 14:43:48 -0400 Subject: [PATCH 19/28] Clean up EVLOSER sparse solver integration --- .github/workflows/spack_build.yml | 2 +- CMakeLists.txt | 2 ++ src/Drivers/Sparse/CMakeLists.txt | 5 ++++ src/Drivers/Sparse/NlpSparseEx1Driver.cpp | 2 ++ src/Drivers/Sparse/NlpSparseEx2Driver.cpp | 4 ++- src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp | 12 ++++++-- src/LinAlg/CMakeLists.txt | 5 ++++ src/LinAlg/EVLOSER/CMakeLists.txt | 9 ++++++ src/LinAlg/EVLOSER/KrylovSolverKernels.cu | 30 +++++++++---------- src/LinAlg/EVLOSER/KrylovSolverKernels.h | 17 +++++------ src/LinAlg/EVLOSER/RefactorizationSolver.cpp | 2 +- src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp | 6 ++++ src/LinAlg/EVLOSER/evloser_gpu_defs.hpp | 2 +- src/LinAlg/hiopLinSolverSparseEVLOSER.cpp | 28 ++++++++--------- src/LinAlg/hiopLinSolverSparseEVLOSER.hpp | 2 +- src/LinAlg/hiopLinSolverSparseReSolve.cpp | 12 +++++--- src/Optimization/hiopDualsUpdater.cpp | 4 +++ src/Optimization/hiopKKTLinSysSparse.cpp | 7 +++++ src/Utils/hiopOptions.cpp | 6 +++- 19 files changed, 107 insertions(+), 50 deletions(-) diff --git a/.github/workflows/spack_build.yml b/.github/workflows/spack_build.yml index c2554ff..30efb8a 100644 --- a/.github/workflows/spack_build.yml +++ b/.github/workflows/spack_build.yml @@ -163,7 +163,7 @@ jobs: run: spack -e . mirror set --oci-username ${{ env.USERNAME }} --oci-password "${{ secrets.GITHUB_TOKEN }}" local-buildcache - name: Trust keys - run: spack -e . buildcache keys --install --trust + run: printf "y\n" | spack -e . buildcache keys --install --trust --force - name: Find external packages run: spack -e . external find --all --exclude python --exclude curl --exclude openssl diff --git a/CMakeLists.txt b/CMakeLists.txt index 80b4997..546b66a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,6 +96,7 @@ cmake_dependent_option( cmake_dependent_option( HIOP_USE_MAGMA "Use Magma linear algebra" ON "HIOP_USE_GPU" OFF ) +# EVLOSER uses the existing ReSolve option path, so this must cover GPU builds instead of CUDA only. cmake_dependent_option( HIOP_USE_RESOLVE "Build with ReSolve/EVLOSER sparse solver support" ON "HIOP_USE_GPU" OFF ) @@ -286,6 +287,7 @@ endif(HIOP_USE_GPU) if(HIOP_USE_RAJA) # Look for CMake configuration file in RAJA installation + # The RAJA driver path needs camp available with the RAJA/Umpire target set. find_package(camp CONFIG REQUIRED) find_package(RAJA CONFIG diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt index 436d705..343d6e2 100644 --- a/src/Drivers/Sparse/CMakeLists.txt +++ b/src/Drivers/Sparse/CMakeLists.txt @@ -17,6 +17,7 @@ target_link_libraries(NlpSparseEx3.exe HiOp::HiOp) add_executable(NlpSparseEx4.exe NlpSparseEx4.cpp NlpSparseEx4Driver.cpp) target_link_libraries(NlpSparseEx4.exe HiOp::HiOp) +# This driver used to be CUDA/ReSolve-only; EVLOSER adds the HIP path here too. if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_RESOLVE) if(HIOP_USE_CUDA OR HIOP_USE_HIP) if(HIOP_USE_CUDA) @@ -58,6 +59,7 @@ add_test(NAME NlpSparse1_2 COMMAND ${RUNCMD} "$" " if(HIOP_USE_CUDA) add_test(NAME NlpSparse1_3 COMMAND ${RUNCMD} "$" "500" "-cusolver" "-selfcheck") endif(HIOP_USE_CUDA) +# EVLOSER uses the existing SparseEx1 driver and selects the EVLOSER solver option. if(HIOP_USE_CUDA AND HIOP_USE_RESOLVE) add_test(NAME NlpSparse1_EVLOSER COMMAND ${RUNCMD} "$" "500" "-evloser" "-selfcheck") endif() @@ -78,6 +80,7 @@ add_test(NAME NlpSparse2_2 COMMAND ${RUNCMD} "$" " if(HIOP_USE_CUDA) add_test(NAME NlpSparse2_3 COMMAND ${RUNCMD} "$" "500" "-cusolver" "-inertiafree" "-selfcheck") endif(HIOP_USE_CUDA) +# EVLOSER uses the existing SparseEx2 driver with the required inertia-free path. if(HIOP_USE_CUDA AND HIOP_USE_RESOLVE) add_test(NAME NlpSparse2_EVLOSER COMMAND ${RUNCMD} "$" "500" "-evloser" "-inertiafree" "-selfcheck") endif() @@ -94,9 +97,11 @@ endif(HIOP_USE_GINKGO) if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_CUDA AND HIOP_USE_RESOLVE) add_test(NAME NlpSparseRaja2_1 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-resolve_cuda_glu") add_test(NAME NlpSparseRaja2_2 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-resolve_cuda_rf") + # CUDA EVLOSER RF uses the same RAJA sparse driver as the ReSolve RF test. add_test(NAME NlpSparseRaja2_3 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-evloser_cuda_rf") endif() +# HIP EVLOSER RF needs a separate test guard because ReSolve remains CUDA-only. if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_HIP AND HIOP_USE_RESOLVE) add_test(NAME NlpSparseRaja2_4 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-evloser_hip_rf") endif() diff --git a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp index ef9a6a5..b7eac02 100644 --- a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp @@ -124,6 +124,7 @@ static bool parse_arguments(int argc, scal = 1.0; } + // EVLOSER follows the same sparse-LU driver path as cuSOLVER here. if((use_cusolver || use_evloser) && use_pardiso) { printf("Selected both, cuSOLVER and Pardiso. "); printf("You can select only one linear solver.\n\n"); @@ -242,6 +243,7 @@ int main(int argc, char** argv) if(use_pardiso) { nlp.options->SetStringValue("linear_solver_sparse", "pardiso"); } + // EVLOSER keeps the ReSolve RF settings below but selects the EVLOSER solver name. if(use_cusolver || use_evloser) { nlp.options->SetStringValue("duals_init", "zero"); nlp.options->SetStringValue("linsol_mode", "speculative"); diff --git a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp index 6a27733..e49805a 100644 --- a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp @@ -151,6 +151,7 @@ static bool parse_arguments(int argc, // Use cuSOLVER's LU factorization, if it was configured #ifdef HIOP_USE_RESOLVE + // EVLOSER uses the existing ReSolve-enabled sparse solver setup in this driver. if(use_cusolver || use_evloser) { use_resolve = true; } @@ -159,7 +160,7 @@ static bool parse_arguments(int argc, // If cuSOLVER was selected, but inertia free approach was not, add inertia-free if((use_cusolver || use_evloser) && !(inertia_free)) { inertia_free = true; - printf("LU solver from ReSolve library requires inertia free approach. "); + printf("Selected LU sparse solver requires inertia free approach. "); printf("Enabling now ...\n"); } @@ -260,6 +261,7 @@ int main(int argc, char** argv) if(use_resolve) { nlp.options->SetStringValue("duals_init", "zero"); nlp.options->SetStringValue("linsol_mode", "speculative"); + // EVLOSER keeps the ReSolve RF settings below but selects the EVLOSER solver name. if(use_evloser) { nlp.options->SetStringValue("linear_solver_sparse", "evloser"); } else { diff --git a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp index 591a0f3..c6952f3 100644 --- a/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseRajaEx2Driver.cpp @@ -190,12 +190,14 @@ static bool parse_arguments(int argc, #endif // If ReSolve was selected, but inertia free approach was not, add inertia-free + // EVLOSER RF has the same inertia-free requirement as the ReSolve sparse-LU path. if((use_resolve_cuda_glu || use_resolve_cuda_rf || use_evloser_cuda_rf || use_evloser_hip_rf) && !(inertia_free)) { inertia_free = true; printf("LU solver from ReSolve library requires inertia free approach. "); printf("Enabling now ...\n"); } + // GLU and RF still share one refactorization option, so keep this conflict check explicit. if(use_resolve_cuda_glu && (use_resolve_cuda_rf || use_evloser_cuda_rf || use_evloser_hip_rf)) { use_resolve_cuda_rf = false; use_evloser_cuda_rf = false; @@ -204,6 +206,7 @@ static bool parse_arguments(int argc, printf("Using default GLU refactorization ...\n"); } + // ReSolve RF and EVLOSER RF select different backend classes, so only one should be active. if(use_resolve_cuda_rf && (use_evloser_cuda_rf || use_evloser_hip_rf)) { use_evloser_cuda_rf = false; use_evloser_hip_rf = false; @@ -211,8 +214,11 @@ static bool parse_arguments(int argc, printf("Using ReSolve ...\n"); } + // EVLOSER has separate CUDA and HIP RF flags because the HIP path disables IR below. if(use_evloser_cuda_rf && use_evloser_hip_rf) { use_evloser_hip_rf = false; + printf("You can select either CUDA RF or HIP RF with EVLOSER, not both. "); + printf("Using CUDA RF ...\n"); } // If Ginkgo is not available, de-select it. @@ -245,10 +251,10 @@ static void usage(const char* exeName) " '-selfcheck': compares the optimal objective with a previously saved value for the " "problem specified by 'problem_size'. [optional]\n"); printf( - " '-use_resolve_cuda_glu': use ReSolve linear solver with KLU factorization and cusolverGLU refactorization " + " '-resolve_cuda_glu': use ReSolve linear solver with KLU factorization and cusolverGLU refactorization " "[optional]\n"); printf( - " '-use_resolve_cuda_rf' : use ReSolve linear solver with KLU factorization and cusolverRf refactorization " + " '-resolve_cuda_rf' : use ReSolve linear solver with KLU factorization and cusolverRf refactorization " "[optional]\n"); printf( " '-evloser_cuda_rf' : use EVLOSER linear solver with KLU factorization and cusolverRf refactorization " @@ -327,6 +333,7 @@ int main(int argc, char** argv) // only support cusolverLU right now, 2023.02.28 // lsq initialization of the duals fails for this example since the Jacobian is rank deficient // use zero initialization + // EVLOSER uses the same refactorization option string; the solver name selects the backend. if(use_evloser_cuda_rf || use_evloser_hip_rf) { nlp.options->SetStringValue("linear_solver_sparse", "evloser"); } else { @@ -338,6 +345,7 @@ int main(int argc, char** argv) nlp.options->SetIntegerValue("ir_outer_maxit", 0); } + // HIP EVLOSER RF currently runs without iterative refinement. if(use_evloser_hip_rf) { nlp.options->SetStringValue("resolve_refactorization", "rf"); nlp.options->SetIntegerValue("ir_inner_maxit", 0); diff --git a/src/LinAlg/CMakeLists.txt b/src/LinAlg/CMakeLists.txt index 3aacfc0..78c0795 100644 --- a/src/LinAlg/CMakeLists.txt +++ b/src/LinAlg/CMakeLists.txt @@ -102,6 +102,7 @@ set(hiopLinAlg_CUSOLVER_LU_SRC hiopLinSolverSparseReSolve.cpp ) +# EVLOSER wrapper source is separate from the ReSolve wrapper. If EVLOSER replaces ReSolve, merge this source path intentionally. set(hiopLinAlg_EVLOSER_SRC hiopLinSolverSparseEVLOSER.cpp ) @@ -154,12 +155,14 @@ if(HIOP_SPARSE) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_PARDISO_SRC}) endif(HIOP_USE_PARDISO) if(HIOP_USE_RESOLVE) + # ReSolve stays CUDA-only here; EVLOSER below covers the HIP-capable backend path. if(HIOP_USE_CUDA) add_subdirectory(ReSolve) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_CUSOLVER_LU_SRC}) set_source_files_properties(${hiopLinAlg_CUSOLVER_LU_SRC} PROPERTIES LANGUAGE CUDA) endif(HIOP_USE_CUDA) + # EVLOSER uses its own backend directory while sharing the existing sparse solver option path. add_subdirectory(EVLOSER) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_EVLOSER_SRC}) @@ -234,9 +237,11 @@ install( add_library(hiopLinAlg OBJECT ${hiopLinAlg_SRC}) if(HIOP_USE_RESOLVE) + # Link EVLOSER separately so HIP builds do not require the CUDA-only ReSolve target. target_link_libraries(hiop_tpl INTERFACE EVLOSER) install(TARGETS EVLOSER EXPORT hiop-targets) + # Keep ReSolve linked only for CUDA builds. if(HIOP_USE_CUDA) target_link_libraries(hiop_tpl INTERFACE ReSolve) install(TARGETS ReSolve EXPORT hiop-targets) diff --git a/src/LinAlg/EVLOSER/CMakeLists.txt b/src/LinAlg/EVLOSER/CMakeLists.txt index 8dee32c..0dca8ff 100644 --- a/src/LinAlg/EVLOSER/CMakeLists.txt +++ b/src/LinAlg/EVLOSER/CMakeLists.txt @@ -14,6 +14,15 @@ if(HIOP_USE_CUDA) endif() add_library(EVLOSER STATIC ${EVLOSER_SRC}) + +if(HIOP_USE_CUDA) + target_compile_definitions(EVLOSER PRIVATE HIOP_USE_CUDA) +endif() + +if(HIOP_USE_HIP) + target_compile_definitions(EVLOSER PRIVATE HIOP_USE_HIP) +endif() + target_include_directories(EVLOSER INTERFACE $ $ diff --git a/src/LinAlg/EVLOSER/KrylovSolverKernels.cu b/src/LinAlg/EVLOSER/KrylovSolverKernels.cu index 6ad5ee1..f6185c8 100644 --- a/src/LinAlg/EVLOSER/KrylovSolverKernels.cu +++ b/src/LinAlg/EVLOSER/KrylovSolverKernels.cu @@ -54,11 +54,11 @@ #define maxk 1024 #define Tv5 1024 //computes V^T[u1 u2] where v is n x k and u1 and u2 are nx1 -__global__ void evloser_MassIPTwoVec_kernel(const double* __restrict__ u1, - const double* __restrict__ u2, - const double* __restrict__ v, +__global__ void evloser_MassIPTwoVec_kernel(const double* __restrict__ u1, + const double* __restrict__ u2, + const double* __restrict__ v, double* result, - const int k, + const int k, const int N) { int t = threadIdx.x; @@ -168,10 +168,10 @@ __global__ void evloser_massAxpy3_kernel(int N, } } -__global__ void evloser_matrixInfNormPart1(const int n, - const int nnz, +__global__ void evloser_matrixInfNormPart1(const int n, + const int nnz, const int* a_ia, - const double* a_val, + const double* a_val, double* result) { // one thread per row, pass through rows @@ -191,11 +191,11 @@ __global__ void evloser_matrixInfNormPart1(const int n, } -void evloser_mass_inner_product_two_vectors(int n, - int i, - double* vec1, - double* vec2, - double* mvec, +void evloser_mass_inner_product_two_vectors(int n, + int i, + double* vec1, + double* vec2, + double* mvec, double* result) { evloser_MassIPTwoVec_kernel<<>>(vec1, vec2, mvec, result, i + 1, n); @@ -205,10 +205,10 @@ void evloser_mass_axpy(int n, int i, double* x, double* y, double* alpha) evloser_massAxpy3_kernel<<<(n + 384 - 1) / 384, 384>>>(n, i + 1, x, y, alpha); } -void evloser_matrix_row_sums(int n, - int nnz, +void evloser_matrix_row_sums(int n, + int nnz, int* a_ia, - double* a_val, + double* a_val, double* result) { evloser_matrixInfNormPart1<<<1000,1024>>>(n, nnz, a_ia, a_val, result); diff --git a/src/LinAlg/EVLOSER/KrylovSolverKernels.h b/src/LinAlg/EVLOSER/KrylovSolverKernels.h index 2f46000..e5eff1b 100644 --- a/src/LinAlg/EVLOSER/KrylovSolverKernels.h +++ b/src/LinAlg/EVLOSER/KrylovSolverKernels.h @@ -53,18 +53,17 @@ */ -void evloser_mass_inner_product_two_vectors(int n, - int i, - double* vec1, - double* vec2, - double* mvec, +void evloser_mass_inner_product_two_vectors(int n, + int i, + double* vec1, + double* vec2, + double* mvec, double* result); void evloser_mass_axpy(int n, int i, double* x, double* y, double* alpha); //needed for matrix inf nrm -void evloser_matrix_row_sums(int n, - int nnz, +void evloser_matrix_row_sums(int n, + int nnz, int* a_ia, - double* a_val, + double* a_val, double* result); - diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp index d3e44d4..9104b84 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -175,7 +175,7 @@ bool extract_klu_factors(klu_numeric* numeric, if(ok == 0) { if(!silent_output) { - std::cout << "[EVLOSER] klu_extract failed while preparing cuSOLVER RF setup\n"; + std::cout << "[EVLOSER] klu_extract failed while preparing GPU RF setup\n"; } return false; } diff --git a/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp b/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp index ad565b9..dae7039 100644 --- a/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp +++ b/src/LinAlg/EVLOSER/evloser_cusolver_defs.hpp @@ -75,6 +75,12 @@ inline evloserGpuError_t evloserGpuMalloc(void** ptr, size_t size) return cudaMalloc(ptr, size); } +template +inline evloserGpuError_t evloserGpuMalloc(T** ptr, size_t size) +{ + return cudaMalloc(reinterpret_cast(ptr), size); +} + inline evloserGpuError_t evloserGpuFree(void* ptr) { return cudaFree(ptr); diff --git a/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp b/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp index 295b20a..3904856 100644 --- a/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp +++ b/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp @@ -8,7 +8,7 @@ #ifndef EVLOSER_GPU_DEFS_H #define EVLOSER_GPU_DEFS_H -#if defined(HIOP_USE_CUDA) +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) #include "evloser_cusolver_defs.hpp" diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp index 5380b6d..b8c0d5a 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp @@ -61,7 +61,7 @@ #include "hiop_blasdefs.hpp" -#ifdef HIOP_USE_CUDA +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) #include "cusparse_v2.h" #endif @@ -71,7 +71,7 @@ #include #include -#define checkCudaErrors(val) hiopCheckCudaError((val), __FILE__, __LINE__) +#define checkGpuErrors(val) hiopCheckGpuError((val), __FILE__, __LINE__) /** * @brief Map elements of one array to the other @@ -277,8 +277,8 @@ hiopLinSolverSymSparseEVLOSER::~hiopLinSolverSymSparseEVLOSER() // Delete CSR <--> triplet mappings delete[] index_convert_CSR2Triplet_host_; delete[] index_convert_extra_Diag2CSR_host_; - checkCudaErrors(evloserGpuFree(index_convert_CSR2Triplet_device_)); - checkCudaErrors(evloserGpuFree(index_convert_extra_Diag2CSR_device_)); + checkGpuErrors(evloserGpuFree(index_convert_CSR2Triplet_device_)); + checkGpuErrors(evloserGpuFree(index_convert_extra_Diag2CSR_device_)); } int hiopLinSolverSymSparseEVLOSER::matrixChanged() @@ -346,13 +346,13 @@ void hiopLinSolverSymSparseEVLOSER::firstCall() // If the matrix is on device, copy it to the host mirror std::string mem_space = nlp_->options->GetString("mem_space"); if(mem_space == "device") { - checkCudaErrors( + checkGpuErrors( evloserGpuMemcpy(M_host_->M(), M_->M(), sizeof(double) * M_->numberOfNonzeros(), evloserMemcpyDeviceToHost)); - checkCudaErrors(evloserGpuMemcpy(M_host_->i_row(), + checkGpuErrors(evloserGpuMemcpy(M_host_->i_row(), M_->i_row(), sizeof(index_type) * M_->numberOfNonzeros(), evloserMemcpyDeviceToHost)); - checkCudaErrors(evloserGpuMemcpy(M_host_->j_col(), + checkGpuErrors(evloserGpuMemcpy(M_host_->j_col(), M_->j_col(), sizeof(index_type) * M_->numberOfNonzeros(), evloserMemcpyDeviceToHost)); @@ -408,7 +408,7 @@ void hiopLinSolverSymSparseEVLOSER::update_matrix_values() // If factorization was not successful, we need a copy of values on the host if(factorizationSetupSucc_ == 0) - checkCudaErrors(evloserGpuMemcpy(solver_->mat_A_csr()->host_vals(), + checkGpuErrors(evloserGpuMemcpy(solver_->mat_A_csr()->host_vals(), solver_->mat_A_csr()->device_vals(), sizeof(double) * nnz_, evloserMemcpyDeviceToHost)); @@ -424,7 +424,7 @@ void hiopLinSolverSymSparseEVLOSER::update_matrix_values() if(index_convert_extra_Diag2CSR_host_[i] != -1) vals[index_convert_extra_Diag2CSR_host_[i]] += M_->M()[M_->numberOfNonzeros() - n_ + i]; } - checkCudaErrors(evloserGpuMemcpy(solver_->mat_A_csr()->device_vals(), + checkGpuErrors(evloserGpuMemcpy(solver_->mat_A_csr()->device_vals(), solver_->mat_A_csr()->host_vals(), sizeof(double) * nnz_, evloserMemcpyHostToDevice)); @@ -494,8 +494,8 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() index_convert_CSR2Triplet_host_ = new int[nnz_]; index_convert_extra_Diag2CSR_host_ = new int[n_]; - checkCudaErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_CSR2Triplet_device_), nnz_ * sizeof(int))); - checkCudaErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_extra_Diag2CSR_device_), n_ * sizeof(int))); + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_CSR2Triplet_device_), nnz_ * sizeof(int))); + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_extra_Diag2CSR_device_), n_ * sizeof(int))); int* nnz_each_row_tmp = new int[n_]{0}; int total_nnz_tmp{0}, nnz_tmp{0}, rowID_tmp, colID_tmp; @@ -555,11 +555,11 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() std::sort(col_idx + row_ptr[i], col_idx + row_ptr[i + 1]); } } - checkCudaErrors(evloserGpuMemcpy(index_convert_CSR2Triplet_device_, + checkGpuErrors(evloserGpuMemcpy(index_convert_CSR2Triplet_device_, index_convert_CSR2Triplet_host_, nnz_ * sizeof(int), evloserMemcpyHostToDevice)); - checkCudaErrors(evloserGpuMemcpy(index_convert_extra_Diag2CSR_device_, + checkGpuErrors(evloserGpuMemcpy(index_convert_extra_Diag2CSR_device_, index_convert_extra_Diag2CSR_host_, n_ * sizeof(int), evloserMemcpyHostToDevice)); @@ -569,7 +569,7 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() // Error checking utility for CUDA // KS: might later become part of src/Utils, putting it here for now template -void hiopLinSolverSymSparseEVLOSER::hiopCheckCudaError(T result, const char* const file, int const line) +void hiopLinSolverSymSparseEVLOSER::hiopCheckGpuError(T result, const char* const file, int const line) { if(result) { nlp_->log->printf(hovError, "CUDA error at %s:%d, error# %d\n", file, line, result); diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp index 57787aa..fa1bbc5 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp @@ -153,7 +153,7 @@ class hiopLinSolverSymSparseEVLOSER : public hiopLinSolverSymSparse void set_csr_indices_values(); template - void hiopCheckCudaError(T result, const char* const file, int const line); + void hiopCheckGpuError(T result, const char* const file, int const line); }; } // namespace hiop diff --git a/src/LinAlg/hiopLinSolverSparseReSolve.cpp b/src/LinAlg/hiopLinSolverSparseReSolve.cpp index 312c9a0..a41d41f 100644 --- a/src/LinAlg/hiopLinSolverSparseReSolve.cpp +++ b/src/LinAlg/hiopLinSolverSparseReSolve.cpp @@ -54,12 +54,16 @@ */ #include "hiopLinSolverSparseReSolve.hpp" -#include -#include -#include +// Use the ReSolve path here because EVLOSER has headers with the same names. +// If EVLOSER replaces ReSolve, update this backend path instead of doing only +// a find-and-replace. +#include "ReSolve/IterativeRefinement.hpp" +#include "ReSolve/RefactorizationSolver.hpp" +#include "ReSolve/MatrixCsr.hpp" #include "hiop_blasdefs.hpp" -#include "KrylovSolverKernels.h" +// Use the ReSolve path here because EVLOSER has a kernel header with this name too. +#include "ReSolve/KrylovSolverKernels.h" #include "cusparse_v2.h" #include diff --git a/src/Optimization/hiopDualsUpdater.cpp b/src/Optimization/hiopDualsUpdater.cpp index ed0cfa8..5d52e0b 100644 --- a/src/Optimization/hiopDualsUpdater.cpp +++ b/src/Optimization/hiopDualsUpdater.cpp @@ -72,6 +72,7 @@ #include "hiopLinSolverSparsePARDISO.hpp" #endif #ifdef HIOP_USE_RESOLVE +// ReSolve is still CUDA-only; EVLOSER below covers the HIP-capable sparse solver path. #if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) #include "hiopLinSolverSparseReSolve.hpp" #endif @@ -431,6 +432,7 @@ bool hiopDualsLsqUpdateLinsysAugSparse::instantiate_linear_solver(const char* li #ifdef HIOP_USE_RESOLVE if(compute_mode == "gpu") { + // EVLOSER is valid here because it uses the same dual-init sparse solver path as ReSolve. assert((linear_solver == "resolve" || linear_solver == "evloser" || linear_solver == "auto") && "the value for duals_init_linear_solver_sparse is invalid and should have been corrected during " "options processing"); @@ -445,10 +447,12 @@ bool hiopDualsLsqUpdateLinsysAugSparse::instantiate_linear_solver(const char* li // This is our first choice on the device. if(linear_solver == "resolve" || linear_solver == "auto") { ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: ReSolve "; + // Only build the ReSolve solver object when CUDA is enabled. #if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) lin_sys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); #endif } + // EVLOSER has its own solver object but uses this same dual-init allocation point. if(linear_solver == "evloser") { ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: EVLOSER "; lin_sys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp index 57ecf53..4a4a550 100644 --- a/src/Optimization/hiopKKTLinSysSparse.cpp +++ b/src/Optimization/hiopKKTLinSysSparse.cpp @@ -58,6 +58,7 @@ #include "hiopLinSolverSparsePARDISO.hpp" #endif #ifdef HIOP_USE_RESOLVE +// ReSolve is still CUDA-only; EVLOSER below covers the HIP-capable sparse solver path. #if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) #include "hiopLinSolverSparseReSolve.hpp" #endif @@ -366,6 +367,7 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXYcYd::determineAndCreateLi if((nullptr == linSys_ && linear_solver == "auto") || linear_solver == "resolve") { #if defined(HIOP_USE_RESOLVE) + // Only build the ReSolve solver object when CUDA is enabled. #if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) linSys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); #endif @@ -381,6 +383,7 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXYcYd::determineAndCreateLi #endif } + // EVLOSER has its own solver object but uses this same sparse KKT selection point. if(nullptr == linSys_ && linear_solver == "evloser") { #if defined(HIOP_USE_RESOLVE) linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); @@ -765,6 +768,7 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL if(linear_solver == "resolve" || linear_solver == "auto") { #if defined(HIOP_USE_RESOLVE) actual_lin_solver = "ReSolve"; + // Only build the ReSolve solver object when CUDA is enabled. #if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) linSys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); #endif @@ -779,6 +783,7 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL #endif } // end resolve + // EVLOSER has its own solver object but uses this same sparse KKT selection point. if(nullptr == linSys_ && linear_solver == "evloser") { #if defined(HIOP_USE_RESOLVE) actual_lin_solver = "EVLOSER"; @@ -853,6 +858,7 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL if(linear_solver == "resolve" || linear_solver == "auto") { #if defined(HIOP_USE_RESOLVE) + // Only build the ReSolve solver object when CUDA is enabled. #if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) linSys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); #endif @@ -868,6 +874,7 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL #endif } // end resolve + // EVLOSER has its own solver object but uses this same sparse KKT selection point. if(nullptr == linSys_ && linear_solver == "evloser") { #if defined(HIOP_USE_RESOLVE) linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); diff --git a/src/Utils/hiopOptions.cpp b/src/Utils/hiopOptions.cpp index 8d5f7fe..5432d90 100644 --- a/src/Utils/hiopOptions.cpp +++ b/src/Utils/hiopOptions.cpp @@ -922,6 +922,7 @@ void hiopOptionsNLP::register_options() // - 'gpu' compute mode: work in progress { + // EVLOSER is a separate sparse solver option while the old ReSolve option stays available. vector range{"auto", "ma57", "pardiso", "strumpack", "resolve", "evloser", "ginkgo", "cusolver-chol"}; register_str_option("linear_solver_sparse", @@ -936,6 +937,7 @@ void hiopOptionsNLP::register_options() // - when GPU mode is on, STRUMPACK is chosen by 'auto' if available // - choosing option ma57 or pardiso with GPU being on, it results in no device being used in the linear solve! { + // EVLOSER is also valid for dual initialization through the same sparse solver path. vector range{"auto", "ma57", "pardiso", "resolve", "evloser", "strumpack", "ginkgo"}; register_str_option("duals_init_linear_solver_sparse", @@ -1402,6 +1404,7 @@ void hiopOptionsNLP::ensure_consistence() auto kkt_linsys = GetString("KKTLinsys"); auto sol_sp = GetString("linear_solver_sparse"); if(kkt_linsys == "full") { + // Full sparse KKT accepts EVLOSER through the same sparse solver selection path as ReSolve. if(sol_sp != "resolve" && sol_sp != "evloser" && sol_sp != "pardiso" && sol_sp != "strumpack" && sol_sp != "auto") { if(is_user_defined("linear_solver_sparse")) { log_printf(hovWarning, @@ -1425,7 +1428,7 @@ void hiopOptionsNLP::ensure_consistence() } } -// EVLOSER requires either CUDA or HIP support. +// EVLOSER can use CUDA or HIP, unlike the CUDA-only ReSolve path below. #if !defined(HIOP_USE_CUDA) && !defined(HIOP_USE_HIP) if(sol_sp == "evloser") { if(is_user_defined("linear_solver_sparse")) { @@ -1572,6 +1575,7 @@ void hiopOptionsNLP::ensure_consistence() } set_val("fact_acceptor", "inertia_free"); } + // EVLOSER follows the same inertia-free fact_acceptor rule as the other sparse direct solvers. } else if(GetString("linear_solver_sparse") == "strumpack" || GetString("linear_solver_sparse") == "resolve" || GetString("linear_solver_sparse") == "evloser") { if(GetString("fact_acceptor") == "inertia_correction") { From 2fa4e832e8cb9df74b5f2a21c51163a73f9c09ff Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Tue, 23 Jun 2026 23:39:22 -0700 Subject: [PATCH 20/28] Enable CPU-only EVLOSER builds --- CMakeLists.txt | 24 +++- src/Drivers/Sparse/CMakeLists.txt | 13 ++- src/Interface/hiop_defs.hpp.in | 1 + src/LinAlg/CMakeLists.txt | 23 ++-- src/LinAlg/EVLOSER/CMakeLists.txt | 1 - src/LinAlg/EVLOSER/MatrixCsr.cpp | 65 ++++++++++- src/LinAlg/EVLOSER/MatrixCsr.hpp | 15 +++ src/LinAlg/EVLOSER/RefactorizationSolver.cpp | 85 ++++++++++++-- src/LinAlg/EVLOSER/RefactorizationSolver.hpp | 29 ++++- src/LinAlg/EVLOSER/evloser_cpu_defs.hpp | 26 +++++ src/LinAlg/EVLOSER/evloser_gpu_defs.hpp | 4 +- src/LinAlg/hiopLinSolverSparseEVLOSER.cpp | 114 ++++++++++++++----- src/LinAlg/hiopLinSolverSparseEVLOSER.hpp | 9 +- src/Optimization/hiopDualsUpdater.cpp | 25 ++-- src/Optimization/hiopKKTLinSysSparse.cpp | 10 +- 15 files changed, 354 insertions(+), 90 deletions(-) create mode 100644 src/LinAlg/EVLOSER/evloser_cpu_defs.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 546b66a..830b644 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -96,9 +96,15 @@ cmake_dependent_option( cmake_dependent_option( HIOP_USE_MAGMA "Use Magma linear algebra" ON "HIOP_USE_GPU" OFF ) -# EVLOSER uses the existing ReSolve option path, so this must cover GPU builds instead of CUDA only. + cmake_dependent_option( - HIOP_USE_RESOLVE "Build with ReSolve/EVLOSER sparse solver support" ON "HIOP_USE_GPU" OFF + HIOP_USE_RESOLVE "Build with ReSolve sparse solver support" ON "HIOP_USE_GPU" OFF +) + +option( + HIOP_USE_EVLOSER + "Enable the experimental EVLOSER sparse solver" + OFF ) add_library(hiop_tpl INTERFACE) @@ -361,12 +367,17 @@ if(HIOP_SPARSE) endif() endif(HIOP_USE_STRUMPACK) - if (HIOP_USE_RESOLVE) + if (HIOP_USE_RESOLVE OR HIOP_USE_EVLOSER) set(HIOP_KLU_DIR CACHE PATH "Path to KLU directory") include(FindKLU) if(NOT KLU_LIBRARY) message(STATUS "Cannot find KLU, disabling cuSOLVER LU module ...") - set(HIOP_USE_RESOLVE OFF CACHE BOOL "Build without cuSOLVER LU module." FORCE) + if(HIOP_USE_RESOLVE) + set(HIOP_USE_RESOLVE OFF CACHE BOOL "Build without cuSOLVER LU module." FORCE) + endif() + if(HIOP_USE_EVLOSER) + set(HIOP_USE_EVLOSER OFF CACHE BOOL "Build without EVLOSER." FORCE) + endif() else() target_link_libraries(hiop_tpl INTERFACE KLU) endif() @@ -398,14 +409,15 @@ if(HIOP_SPARSE) endif() endif(HIOP_USE_GINKGO) - if(NOT HIOP_USE_COINHSL AND NOT HIOP_USE_STRUMPACK AND NOT HIOP_USE_PARDISO AND NOT HIOP_USE_GINKGO) + if(NOT HIOP_USE_COINHSL AND NOT HIOP_USE_STRUMPACK AND NOT HIOP_USE_PARDISO AND NOT HIOP_USE_GINKGO AND NOT HIOP_USE_EVLOSER) set(HIOP_SPARSE OFF CACHE BOOL "Build without sparse linear algebra" FORCE) message(STATUS "Cannot find COINHSL, STRUMPACK, PARDISO nor GINKGO for sparse linear algebra, and the option HIOP_SPARSE will be disabled") -endif(NOT HIOP_USE_COINHSL AND NOT HIOP_USE_STRUMPACK AND NOT HIOP_USE_PARDISO AND NOT HIOP_USE_GINKGO) +endif() else(HIOP_SPARSE) set(HIOP_USE_COINHSL OFF CACHE BOOL "Build without COINHSL" FORCE) set(HIOP_USE_STRUMPACK OFF CACHE BOOL "Build without STRUMPACK" FORCE) set(HIOP_USE_RESOLVE OFF CACHE BOOL "Build without cuSOLVER LU module" FORCE) + set(HIOP_USE_EVLOSER OFF CACHE BOOL "Build without EVLOSER" FORCE) set(HIOP_USE_PARDISO OFF CACHE BOOL "Build without PARDISO" FORCE) set(HIOP_USE_GINKGO OFF CACHE BOOL "Build without GINKGO" FORCE) endif(HIOP_SPARSE) diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt index 343d6e2..2918063 100644 --- a/src/Drivers/Sparse/CMakeLists.txt +++ b/src/Drivers/Sparse/CMakeLists.txt @@ -18,7 +18,7 @@ add_executable(NlpSparseEx4.exe NlpSparseEx4.cpp NlpSparseEx4Driver.cpp) target_link_libraries(NlpSparseEx4.exe HiOp::HiOp) # This driver used to be CUDA/ReSolve-only; EVLOSER adds the HIP path here too. -if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_RESOLVE) +if(HIOP_USE_RAJA AND HIOP_USE_GPU AND (HIOP_USE_RESOLVE OR HIOP_USE_EVLOSER)) if(HIOP_USE_CUDA OR HIOP_USE_HIP) if(HIOP_USE_CUDA) set_source_files_properties( @@ -60,7 +60,7 @@ if(HIOP_USE_CUDA) add_test(NAME NlpSparse1_3 COMMAND ${RUNCMD} "$" "500" "-cusolver" "-selfcheck") endif(HIOP_USE_CUDA) # EVLOSER uses the existing SparseEx1 driver and selects the EVLOSER solver option. -if(HIOP_USE_CUDA AND HIOP_USE_RESOLVE) +if(HIOP_USE_CUDA AND HIOP_USE_EVLOSER) add_test(NAME NlpSparse1_EVLOSER COMMAND ${RUNCMD} "$" "500" "-evloser" "-selfcheck") endif() if(HIOP_USE_PARDISO) @@ -81,7 +81,7 @@ if(HIOP_USE_CUDA) add_test(NAME NlpSparse2_3 COMMAND ${RUNCMD} "$" "500" "-cusolver" "-inertiafree" "-selfcheck") endif(HIOP_USE_CUDA) # EVLOSER uses the existing SparseEx2 driver with the required inertia-free path. -if(HIOP_USE_CUDA AND HIOP_USE_RESOLVE) +if(HIOP_USE_CUDA AND HIOP_USE_EVLOSER) add_test(NAME NlpSparse2_EVLOSER COMMAND ${RUNCMD} "$" "500" "-evloser" "-inertiafree" "-selfcheck") endif() if(HIOP_USE_GINKGO) @@ -97,12 +97,15 @@ endif(HIOP_USE_GINKGO) if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_CUDA AND HIOP_USE_RESOLVE) add_test(NAME NlpSparseRaja2_1 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-resolve_cuda_glu") add_test(NAME NlpSparseRaja2_2 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-resolve_cuda_rf") - # CUDA EVLOSER RF uses the same RAJA sparse driver as the ReSolve RF test. +endif() + +# CUDA EVLOSER RF uses the same RAJA sparse driver as the ReSolve RF test. +if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_CUDA AND HIOP_USE_EVLOSER) add_test(NAME NlpSparseRaja2_3 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-evloser_cuda_rf") endif() # HIP EVLOSER RF needs a separate test guard because ReSolve remains CUDA-only. -if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_HIP AND HIOP_USE_RESOLVE) +if(HIOP_USE_RAJA AND HIOP_USE_GPU AND HIOP_USE_HIP AND HIOP_USE_EVLOSER) add_test(NAME NlpSparseRaja2_4 COMMAND ${RUNCMD} "$" "500" "-inertiafree" "-selfcheck" "-evloser_hip_rf") endif() diff --git a/src/Interface/hiop_defs.hpp.in b/src/Interface/hiop_defs.hpp.in index 68570c8..ec47a2c 100644 --- a/src/Interface/hiop_defs.hpp.in +++ b/src/Interface/hiop_defs.hpp.in @@ -11,6 +11,7 @@ #cmakedefine HIOP_USE_STRUMPACK #cmakedefine HIOP_USE_PARDISO #cmakedefine HIOP_USE_RESOLVE +#cmakedefine HIOP_USE_EVLOSER #cmakedefine HIOP_USE_GINKGO #cmakedefine HIOP_USE_AXOM #define HIOP_VERSION "@PROJECT_VERSION@" diff --git a/src/LinAlg/CMakeLists.txt b/src/LinAlg/CMakeLists.txt index 78c0795..6d5b164 100644 --- a/src/LinAlg/CMakeLists.txt +++ b/src/LinAlg/CMakeLists.txt @@ -155,21 +155,21 @@ if(HIOP_SPARSE) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_PARDISO_SRC}) endif(HIOP_USE_PARDISO) if(HIOP_USE_RESOLVE) - # ReSolve stays CUDA-only here; EVLOSER below covers the HIP-capable backend path. if(HIOP_USE_CUDA) add_subdirectory(ReSolve) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_CUSOLVER_LU_SRC}) set_source_files_properties(${hiopLinAlg_CUSOLVER_LU_SRC} PROPERTIES LANGUAGE CUDA) - endif(HIOP_USE_CUDA) + endif() + endif() - # EVLOSER uses its own backend directory while sharing the existing sparse solver option path. + if(HIOP_USE_EVLOSER) add_subdirectory(EVLOSER) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_EVLOSER_SRC}) - if(HIOP_USE_CUDA) set_source_files_properties(${hiopLinAlg_EVLOSER_SRC} PROPERTIES LANGUAGE CUDA) - endif(HIOP_USE_CUDA) - endif(HIOP_USE_RESOLVE) + endif() + endif() + if(HIOP_USE_CUDA) list(APPEND hiopLinAlg_SRC ${hiopLinAlg_CUSOLVER_CHOL_SRC}) set_source_files_properties(${hiopLinAlg_CUSOLVER_CHOL_SRC} PROPERTIES LANGUAGE CUDA) @@ -236,15 +236,14 @@ install( ) add_library(hiopLinAlg OBJECT ${hiopLinAlg_SRC}) -if(HIOP_USE_RESOLVE) +if(HIOP_USE_EVLOSER) # Link EVLOSER separately so HIP builds do not require the CUDA-only ReSolve target. target_link_libraries(hiop_tpl INTERFACE EVLOSER) install(TARGETS EVLOSER EXPORT hiop-targets) +endif() - # Keep ReSolve linked only for CUDA builds. - if(HIOP_USE_CUDA) - target_link_libraries(hiop_tpl INTERFACE ReSolve) - install(TARGETS ReSolve EXPORT hiop-targets) - endif(HIOP_USE_CUDA) +if(HIOP_USE_RESOLVE AND HIOP_USE_CUDA) + target_link_libraries(hiop_tpl INTERFACE ReSolve) + install(TARGETS ReSolve EXPORT hiop-targets) endif() target_link_libraries(hiopLinAlg PRIVATE hiop_tpl) diff --git a/src/LinAlg/EVLOSER/CMakeLists.txt b/src/LinAlg/EVLOSER/CMakeLists.txt index 0dca8ff..c26a756 100644 --- a/src/LinAlg/EVLOSER/CMakeLists.txt +++ b/src/LinAlg/EVLOSER/CMakeLists.txt @@ -10,7 +10,6 @@ if(HIOP_USE_CUDA) IterativeRefinement.cpp KrylovSolverKernels.cu ) - set_source_files_properties(${EVLOSER_SRC} PROPERTIES LANGUAGE CUDA) endif() add_library(EVLOSER STATIC ${EVLOSER_SRC}) diff --git a/src/LinAlg/EVLOSER/MatrixCsr.cpp b/src/LinAlg/EVLOSER/MatrixCsr.cpp index 6eb801f..ba67342 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.cpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.cpp @@ -63,7 +63,10 @@ #include #include +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) #define checkGpuErrors(val) evloserCheckGpuError((val), __FILE__, __LINE__) +#endif namespace EVLOSER { @@ -75,12 +78,15 @@ MatrixCsr::~MatrixCsr() clear_data(); } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) bool MatrixCsr::has_device_storage() const { const bool size_allocated = (n_ == 0) || (irows_ != nullptr); const bool nnz_allocated = (nnz_ == 0) || (jcols_ != nullptr && vals_ != nullptr); return size_allocated && nnz_allocated; } +#endif bool MatrixCsr::has_host_mirror() const { @@ -91,43 +97,85 @@ bool MatrixCsr::has_host_mirror() const void MatrixCsr::allocate_size(int n) { - if(irows_ != nullptr || irows_host_ != nullptr) { + bool storage_allocated = irows_host_ != nullptr; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + + storage_allocated = storage_allocated || irows_ != nullptr; + +#endif + + if(storage_allocated) { clear_data(); } n_ = n; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&irows_), (n_ + 1) * sizeof(int))); + +#endif + irows_host_ = new int[n_ + 1]{0}; } void MatrixCsr::allocate_nnz(int nnz) { - if(jcols_ != nullptr || vals_ != nullptr || jcols_host_ != nullptr || vals_host_ != nullptr) { + bool storage_allocated = jcols_host_ != nullptr || vals_host_ != nullptr; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + + storage_allocated = storage_allocated || jcols_ != nullptr || vals_ != nullptr; + +#endif + + if(storage_allocated) { +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + checkGpuErrors(evloserGpuFree(jcols_)); checkGpuErrors(evloserGpuFree(vals_)); - delete[] jcols_host_; - delete[] vals_host_; jcols_ = nullptr; vals_ = nullptr; + +#endif + + delete[] jcols_host_; + delete[] vals_host_; + jcols_host_ = nullptr; vals_host_ = nullptr; nnz_ = 0; } nnz_ = nnz; + if(nnz_ == 0) { return; } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&jcols_), nnz_ * sizeof(int))); checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&vals_), nnz_ * sizeof(double))); + +#endif + jcols_host_ = new int[nnz_]{0}; vals_host_ = new double[nnz_]{0}; } void MatrixCsr::clear_data() { +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + checkGpuErrors(evloserGpuFree(irows_)); checkGpuErrors(evloserGpuFree(jcols_)); checkGpuErrors(evloserGpuFree(vals_)); @@ -136,6 +184,8 @@ void MatrixCsr::clear_data() jcols_ = nullptr; vals_ = nullptr; +#endif + delete[] irows_host_; delete[] jcols_host_; delete[] vals_host_; @@ -148,6 +198,8 @@ void MatrixCsr::clear_data() nnz_ = 0; } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) void MatrixCsr::update_from_host_mirror() { assert(has_device_storage()); @@ -173,6 +225,7 @@ void MatrixCsr::copy_to_host_mirror() checkGpuErrors(evloserGpuMemcpy(vals_host_, vals_, sizeof(double) * nnz_, evloserMemcpyDeviceToHost)); } } +#endif bool MatrixCsr::validate_host_structure(const char* caller, bool silent_output) const { @@ -232,6 +285,8 @@ bool MatrixCsr::validate_host_structure(const char* caller, bool silent_output) return true; } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) // Error checking utility for GPU backend // KS: might later become part of src/Utils, putting it here for now template @@ -242,5 +297,5 @@ void MatrixCsr::evloserCheckGpuError(T result, const char* const file, int const assert(false); } } - +#endif } // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/MatrixCsr.hpp b/src/LinAlg/EVLOSER/MatrixCsr.hpp index 2465245..882894b 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.hpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.hpp @@ -24,8 +24,11 @@ class MatrixCsr /// Return the number of stored nonzeros. int nnz() const { return nnz_; } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) /// Return true when the required device CSR arrays have been allocated. bool has_device_storage() const; +#endif /// Return true when the required host CSR mirror arrays have been allocated. bool has_host_mirror() const; @@ -41,6 +44,8 @@ class MatrixCsr */ bool validate_host_structure(const char* caller, bool silent_output) const; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) /// Return device row-pointer storage. int* device_irows() { return irows_; } @@ -58,6 +63,7 @@ class MatrixCsr /// Return const device value storage. const double* device_vals() const { return vals_; } +#endif /// Return host row-pointer mirror storage. int* host_irows() { return irows_host_; } @@ -77,24 +83,32 @@ class MatrixCsr /// Return const host value mirror storage. const double* host_vals() const { return vals_host_; } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) /// Copy host-side CSR arrays into device storage. void update_from_host_mirror(); /// Copy device CSR arrays into the host mirror. void copy_to_host_mirror(); +#endif private: int n_{0}; int nnz_{0}; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) int* irows_{nullptr}; int* jcols_{nullptr}; double* vals_{nullptr}; +#endif int* irows_host_{nullptr}; int* jcols_host_{nullptr}; double* vals_host_{nullptr}; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) /** * @brief Check for GPU backend errors. * @@ -105,6 +119,7 @@ class MatrixCsr */ template void evloserCheckGpuError(T result, const char* const file, int const line); +#endif }; } // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp index 9104b84..784285b 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -54,9 +54,13 @@ */ #include "MatrixCsr.hpp" -#include "IterativeRefinement.hpp" #include "RefactorizationSolver.hpp" +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +#include "IterativeRefinement.hpp" +#endif + #include "klu.h" #include #include @@ -64,11 +68,15 @@ #include #include +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) #define checkGpuErrors(val) evloserCheckGpuError((val), __FILE__, __LINE__) +#endif namespace EVLOSER { - +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) namespace { @@ -259,12 +267,13 @@ bool validate_host_csr_factor(const char* name, int n, int nnz, const HostCsrFac } } // namespace - +#endif RefactorizationSolver::RefactorizationSolver(int n) : n_(n) { mat_A_csr_ = new MatrixCsr(); - +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) // handles cusparseCreate(&handle_); cusolverSpCreate(&handle_cusolver_); @@ -274,20 +283,28 @@ RefactorizationSolver::RefactorizationSolver(int n) cusparseCreateMatDescr(&descr_A_); cusparseSetMatType(descr_A_, CUSPARSE_MATRIX_TYPE_GENERAL); cusparseSetMatIndexBase(descr_A_, CUSPARSE_INDEX_BASE_ZERO); +#endif // Allocate host mirror for the solution vector hostx_ = new double[n_]; - +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) // Allocate solution and rhs vectors checkGpuErrors(evloserGpuMalloc((void**)&devx_, n_ * sizeof(double))); checkGpuErrors(evloserGpuMalloc((void**)&devr_, n_ * sizeof(double))); +#endif } RefactorizationSolver::~RefactorizationSolver() { +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) delete ir_; +#endif delete mat_A_csr_; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) // Delete workspaces and handles if(d_work_ != nullptr) { (void)evloserGpuFree(d_work_); @@ -296,10 +313,13 @@ RefactorizationSolver::~RefactorizationSolver() cusolverSpDestroy(handle_cusolver_); cublasDestroy(handle_cublas_); cusparseDestroyMatDescr(descr_A_); +#endif // Delete host mirror for the solution vector delete[] hostx_; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) // Delete residual and solution vectors if(devr_ != nullptr) { (void)evloserGpuFree(devr_); @@ -325,13 +345,15 @@ RefactorizationSolver::~RefactorizationSolver() (void)evloserGpuFree(d_T_); } } - +#endif klu_free_symbolic(&Symbolic_, &Common_); klu_free_numeric(&Numeric_, &Common_); delete[] mia_; delete[] mja_; } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) void RefactorizationSolver::enable_iterative_refinement() { if(ir_ == nullptr) { @@ -381,6 +403,7 @@ void RefactorizationSolver::configure_iterative_refinement(cusparseHandle_t cusp ir_->setup(cusparse_handle, cublas_handle, cusolverrf_handle, n, d_T, d_P, d_Q, devx, devr); } +#endif bool RefactorizationSolver::validate_system_matrix(const char* caller) const { @@ -437,6 +460,8 @@ bool RefactorizationSolver::validate_klu_factorization(const char* caller) const return true; } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) bool RefactorizationSolver::checkEvloserRfStatus(evloserRfStatus_t status, const char* caller) const { if(status == evloserRfSuccess) { @@ -480,6 +505,7 @@ int RefactorizationSolver::refactorizeEvloserRf(const char* caller) sp_status_ = evloserRfRefactor(handle_rf_); return checkEvloserRfStatus(sp_status_, caller) ? 0 : -1; } +#endif int RefactorizationSolver::setup_factorization() { @@ -524,6 +550,8 @@ void RefactorizationSolver::setup_refactorization() return; } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) if(refact_ == "glu") { if(initializeCusolverGLU() != 0) { return; @@ -543,6 +571,13 @@ void RefactorizationSolver::setup_refactorization() } else { // for future - assert(0 && "Only glu and rf refactorizations available.\n"); } +#else + + if(!silent_output_) { + std::cout << "[EVLOSER] GPU refactorization is unavailable in this build.\n"; + } + +#endif } int RefactorizationSolver::refactorize() @@ -551,6 +586,8 @@ int RefactorizationSolver::refactorize() return -1; } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) if(refact_ == "glu") { sp_status_ = cusolverSpDgluReset(handle_cusolver_, n_, @@ -573,10 +610,21 @@ int RefactorizationSolver::refactorize() } } return 0; +#else + + if(!silent_output_) { + std::cout << "[EVLOSER] GPU refactorization is unavailable in this build.\n"; + } + + return -1; + +#endif } bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string memspace) { +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) if(refact_ == "glu") { double* devx = nullptr; if(memspace == "device") { @@ -622,7 +670,7 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string } else { hostx = dx; } - int ok = klu_solve(Symbolic_, Numeric_, n_, 1, hostx, &Common_); // replace dx with hostx + (void)klu_solve(Symbolic_, Numeric_, n_, 1, hostx, &Common_); // replace dx with hostx klu_free_numeric(&Numeric_, &Common_); klu_free_symbolic(&Symbolic_, &Common_); is_first_solve_ = false; @@ -684,8 +732,24 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string std::cout << "Unknown refactorization " << refact_ << ", exiting\n"; } return false; +#else + + (void)dx; + (void)tol; + (void)memspace; + + if(!silent_output_) { + std::cout << "[EVLOSER] GPU triangular solve is unavailable in this build.\n"; + } + + return false; + +#endif } + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) // helper private function needed for format conversion int RefactorizationSolver::createM(const int n, const int /* nnzL */, @@ -737,6 +801,7 @@ int RefactorizationSolver::createM(const int n, } return 0; } +#endif int RefactorizationSolver::initializeKLU() { @@ -753,6 +818,8 @@ int RefactorizationSolver::initializeKLU() return 0; } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) int RefactorizationSolver::initializeCusolverGLU() { #if defined(HIOP_USE_HIP) || defined(HAVE_HIP) @@ -846,7 +913,7 @@ int RefactorizationSolver::refactorizationSetupCusolverGLU() double* Ux = new double[nnzU]; - int ok = klu_extract(Numeric_, + (void)klu_extract(Numeric_, Symbolic_, Lp, Li, @@ -981,5 +1048,5 @@ void RefactorizationSolver::evloserCheckGpuError(T result, const char* const fil assert(false); } } - +#endif } // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp index 6b6c451..9b9a4f8 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp @@ -67,7 +67,7 @@ class MatrixCsr; class IterativeRefinement; /** - * @brief Implements refactorization solvers using KLU and GPU sparse solver libraries + * @brief Implements refactorization solvers using KLU and optional GPU sparse solver libraries * */ class RefactorizationSolver @@ -78,6 +78,9 @@ class RefactorizationSolver RefactorizationSolver(int n); ~RefactorizationSolver(); +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + /// Enable allocation and use of iterative refinement. void enable_iterative_refinement(); @@ -98,6 +101,7 @@ class RefactorizationSolver int* d_Q, double* devx, double* devr); +#endif /** * @brief Set the number of nonzeros in system matrix. @@ -106,12 +110,14 @@ class RefactorizationSolver */ void set_nnz(int nnz) { nnz_ = nnz; } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) IterativeRefinement* ir() { return ir_; } + double* devr() { return devr_; } +#endif MatrixCsr* mat_A_csr() { return mat_A_csr_; } - double* devr() { return devr_; } - int& ordering() { return ordering_; } std::string& fact() { return fact_; } @@ -165,9 +171,12 @@ class RefactorizationSolver MatrixCsr* mat_A_csr_{nullptr}; ///< System matrix in nonsymmetric CSR format IterativeRefinement* ir_{nullptr}; ///< Iterative refinement class +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) bool cusolver_glu_enabled_{false}; ///< GLU refactorization enabled flag bool cusolver_rf_enabled_{false}; ///< Rf refactorization enabled flag bool iterative_refinement_enabled_{false}; ///< Iterative refinement on/off flag +#endif bool is_first_solve_{true}; ///< If it is first call to triangular solver // Options @@ -177,6 +186,8 @@ class RefactorizationSolver std::string use_ir_; bool silent_output_{true}; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) /** needed for GPU sparse solver **/ cusolverStatus_t sp_status_; @@ -195,6 +206,7 @@ class RefactorizationSolver double* d_work_{nullptr}; int ite_refine_succ_ = 0; double r_nrminf_{0.0}; +#endif // KLU stuff int klu_status_; @@ -208,6 +220,8 @@ class RefactorizationSolver /* CPU data */ double* hostx_ = nullptr; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) /* for GPU data */ double* devx_ = nullptr; double* devr_ = nullptr; @@ -216,7 +230,10 @@ class RefactorizationSolver int* d_P_ = nullptr; int* d_Q_ = nullptr; // permutation matrices double* d_T_ = nullptr; +#endif +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) /** * @brief Function that computes M = (L-I) + U * @@ -230,6 +247,7 @@ class RefactorizationSolver * @return int */ int createM(const int n, const int nnzL, const int* Lp, const int* Li, const int nnzU, const int* Up, const int* Ui); +#endif /// Validate the current CSR system matrix before solver setup or refactorization. bool validate_system_matrix(const char* caller) const; @@ -238,6 +256,10 @@ class RefactorizationSolver bool validate_klu_factorization(const char* caller) const; int initializeKLU(); + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + int initializeCusolverGLU(); int initializeCusolverRf(); @@ -266,6 +288,7 @@ class RefactorizationSolver */ template void evloserCheckGpuError(T result, const char* const file, int const line); +#endif }; } // namespace EVLOSER diff --git a/src/LinAlg/EVLOSER/evloser_cpu_defs.hpp b/src/LinAlg/EVLOSER/evloser_cpu_defs.hpp new file mode 100644 index 0000000..a6f8be8 --- /dev/null +++ b/src/LinAlg/EVLOSER/evloser_cpu_defs.hpp @@ -0,0 +1,26 @@ +#ifndef EVLOSER_CPU_DEFS_HPP +#define EVLOSER_CPU_DEFS_HPP + +#if !defined(HIOP_USE_CUDA) && !defined(HAVE_CUDA) && \ + !defined(HIOP_USE_HIP) && !defined(HAVE_HIP) + +using evloserGpuError_t = int; +using evloserGpuMemcpyKind_t = int; + +using cusolverStatus_t = int; +using cusparseHandle_t = void*; +using cusolverSpHandle_t = void*; +using cublasHandle_t = void*; +using cusparseMatDescr_t = void*; +using csrluInfoHost_t = void*; +using csrgluInfo_t = void*; + +using evloserRfStatus_t = int; +using evloserRfHandle_t = void*; + +static constexpr evloserGpuError_t evloserGpuSuccess = 0; +static constexpr evloserRfStatus_t evloserRfSuccess = 0; + +#endif + +#endif // EVLOSER_CPU_DEFS_HPP \ No newline at end of file diff --git a/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp b/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp index 3904856..15eaa64 100644 --- a/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp +++ b/src/LinAlg/EVLOSER/evloser_gpu_defs.hpp @@ -1,7 +1,7 @@ /** * @file evloser_gpu_defs.hpp * - * Selects CUDA or HIP GPU backend definitions for EVLOSER. + * Selects CUDA, HIP or CPU backend definitions for EVLOSER. * */ @@ -18,7 +18,7 @@ #else -#error "EVLOSER GPU backend requires either HIOP_USE_CUDA or HIOP_USE_HIP." +#include "evloser_cpu_defs.hpp" #endif diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp index b8c0d5a..8048eb8 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp @@ -56,8 +56,11 @@ #include "hiopLinSolverSparseEVLOSER.hpp" #include "EVLOSER/RefactorizationSolver.hpp" #include "EVLOSER/MatrixCsr.hpp" -#include "EVLOSER/IterativeRefinement.hpp" #include "EVLOSER/evloser_gpu_defs.hpp" +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +#include "EVLOSER/IterativeRefinement.hpp" +#endif #include "hiop_blasdefs.hpp" @@ -71,8 +74,12 @@ #include #include +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) #define checkGpuErrors(val) hiopCheckGpuError((val), __FILE__, __LINE__) +#endif +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) /** * @brief Map elements of one array to the other * @@ -110,17 +117,23 @@ __global__ void evloser_add_to_array_kernel(T* dst, const T* src, const I* mapid } } +#endif + namespace hiop { hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const int& nnz, hiopNlpFormulation* nlp) : hiopLinSolverSymSparse(n, nnz, nlp), + solver_{nullptr}, + m_{n}, + n_{n}, + nnz_{0}, index_convert_CSR2Triplet_host_{nullptr}, index_convert_extra_Diag2CSR_host_{nullptr}, +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) index_convert_CSR2Triplet_device_{nullptr}, index_convert_extra_Diag2CSR_device_{nullptr}, - m_{n}, - n_{n}, - nnz_{0}, +#endif factorizationSetupSucc_{0}, is_first_call_{true} { @@ -172,7 +185,10 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const nlp_->log->printf(hovSummary, "Refactorization: %s\n", solver_->refact().c_str()); // by default, dont use iterative refinement - std::string use_ir; + std::string use_ir{"no"}; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) int maxit_test = nlp_->options->GetInteger("ir_inner_maxit"); if((maxit_test < 0) || (maxit_test > 1000)) { @@ -181,7 +197,6 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const maxit_test); maxit_test = 50; } - use_ir = "no"; #if defined(HIOP_USE_HIP) || defined(HAVE_HIP) // EVLOSER iterative refinement currently depends on CUDA-only kernels. // Keep the HIP path on RF only until the IR path is ported. @@ -261,6 +276,8 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const solver_->disable_iterative_refinement(); } } +#endif + solver_->use_ir() = use_ir; nlp_->log->printf(hovSummary, "Use IR: %s\n", solver_->use_ir().c_str()); } // constructor @@ -277,8 +294,14 @@ hiopLinSolverSymSparseEVLOSER::~hiopLinSolverSymSparseEVLOSER() // Delete CSR <--> triplet mappings delete[] index_convert_CSR2Triplet_host_; delete[] index_convert_extra_Diag2CSR_host_; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + checkGpuErrors(evloserGpuFree(index_convert_CSR2Triplet_device_)); checkGpuErrors(evloserGpuFree(index_convert_extra_Diag2CSR_device_)); + +#endif } int hiopLinSolverSymSparseEVLOSER::matrixChanged() @@ -345,6 +368,8 @@ void hiopLinSolverSymSparseEVLOSER::firstCall() // If the matrix is on device, copy it to the host mirror std::string mem_space = nlp_->options->GetString("mem_space"); +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) if(mem_space == "device") { checkGpuErrors( evloserGpuMemcpy(M_host_->M(), M_->M(), sizeof(double) * M_->numberOfNonzeros(), evloserMemcpyDeviceToHost)); @@ -357,6 +382,12 @@ void hiopLinSolverSymSparseEVLOSER::firstCall() sizeof(index_type) * M_->numberOfNonzeros(), evloserMemcpyDeviceToHost)); } +#else + if(mem_space == "device") { + nlp_->log->printf(hovError, "Device memory is unavailable in this EVLOSER build.\n"); + return; + } +#endif // Transfer triplet to CSR form @@ -371,12 +402,15 @@ void hiopLinSolverSymSparseEVLOSER::firstCall() // Set column indices and matrix values. set_csr_indices_values(); +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) // Copy matrix to device solver_->mat_A_csr()->update_from_host_mirror(); if(solver_->use_ir() == "yes") { solver_->setup_iterative_refinement_matrix(n_, nnz_); } +#endif /* * initialize matrix factorization */ @@ -393,6 +427,8 @@ void hiopLinSolverSymSparseEVLOSER::firstCall() void hiopLinSolverSymSparseEVLOSER::update_matrix_values() { std::string mem_space = nlp_->options->GetString("mem_space"); + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) if(mem_space == "device") { double* csr_vals = solver_->mat_A_csr()->device_vals(); double* coo_vals = M_->M(); @@ -412,23 +448,41 @@ void hiopLinSolverSymSparseEVLOSER::update_matrix_values() solver_->mat_A_csr()->device_vals(), sizeof(double) * nnz_, evloserMemcpyDeviceToHost)); + return; + } +#endif - } else { - // KKT matrix is on the host - double* vals = solver_->mat_A_csr()->host_vals(); - // update matrix - for(int k = 0; k < nnz_; k++) { - vals[k] = M_->M()[index_convert_CSR2Triplet_host_[k]]; - } - for(int i = 0; i < n_; i++) { - if(index_convert_extra_Diag2CSR_host_[i] != -1) - vals[index_convert_extra_Diag2CSR_host_[i]] += M_->M()[M_->numberOfNonzeros() - n_ + i]; - } - checkGpuErrors(evloserGpuMemcpy(solver_->mat_A_csr()->device_vals(), - solver_->mat_A_csr()->host_vals(), - sizeof(double) * nnz_, - evloserMemcpyHostToDevice)); + hiopMatrixSparse* matrix_source = M_; + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(mem_space == "device") { + checkGpuErrors( + evloserGpuMemcpy(M_host_->M(), M_->M(), sizeof(double) * M_->numberOfNonzeros(), evloserMemcpyDeviceToHost)); + matrix_source = M_host_; } +#else + if(mem_space == "device") { + nlp_->log->printf(hovError, "Device memory is unavailable in this EVLOSER build.\n"); + return; + } +#endif + + // KKT matrix is on the host + double* vals = solver_->mat_A_csr()->host_vals(); + // update matrix + for(int k = 0; k < nnz_; k++) { + vals[k] = matrix_source->M()[index_convert_CSR2Triplet_host_[k]]; + } + for(int i = 0; i < n_; i++) { + if(index_convert_extra_Diag2CSR_host_[i] != -1) + vals[index_convert_extra_Diag2CSR_host_[i]] += matrix_source->M()[matrix_source->numberOfNonzeros() - n_ + i]; + } + +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) + solver_->mat_A_csr()->update_from_host_mirror(); +#endif } /// @pre Data is either on the host or the host mirror is synced with the device @@ -494,11 +548,14 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() index_convert_CSR2Triplet_host_ = new int[nnz_]; index_convert_extra_Diag2CSR_host_ = new int[n_]; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_CSR2Triplet_device_), nnz_ * sizeof(int))); checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_extra_Diag2CSR_device_), n_ * sizeof(int))); +#endif int* nnz_each_row_tmp = new int[n_]{0}; - int total_nnz_tmp{0}, nnz_tmp{0}, rowID_tmp, colID_tmp; + int nnz_tmp{0}, rowID_tmp, colID_tmp; for(int k = 0; k < n_; k++) { index_convert_extra_Diag2CSR_host_[k] = -1; @@ -517,7 +574,6 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() index_convert_extra_Diag2CSR_host_[rowID_tmp] = nnz_tmp; nnz_each_row_tmp[rowID_tmp]++; - total_nnz_tmp++; } else { nnz_tmp = nnz_each_row_tmp[rowID_tmp] + row_ptr[rowID_tmp]; col_idx[nnz_tmp] = colID_tmp; @@ -531,7 +587,6 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() nnz_each_row_tmp[rowID_tmp]++; nnz_each_row_tmp[colID_tmp]++; - total_nnz_tmp += 2; } } // correct the missing dia_gonal term @@ -542,7 +597,6 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() col_idx[nnz_tmp] = i; vals[nnz_tmp] = M_host->M()[M_host->numberOfNonzeros() - n_ + i]; index_convert_CSR2Triplet_host_[nnz_tmp] = M_host->numberOfNonzeros() - n_ + i; - total_nnz_tmp += 1; std::vector ind_temp(row_ptr[i + 1] - row_ptr[i]); std::iota(ind_temp.begin(), ind_temp.end(), 0); @@ -555,6 +609,8 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() std::sort(col_idx + row_ptr[i], col_idx + row_ptr[i + 1]); } } +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) checkGpuErrors(evloserGpuMemcpy(index_convert_CSR2Triplet_device_, index_convert_CSR2Triplet_host_, nnz_ * sizeof(int), @@ -563,18 +619,22 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() index_convert_extra_Diag2CSR_host_, n_ * sizeof(int), evloserMemcpyHostToDevice)); +#endif delete[] nnz_each_row_tmp; } -// Error checking utility for CUDA +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) +// Error checking utility for GPU backends // KS: might later become part of src/Utils, putting it here for now template void hiopLinSolverSymSparseEVLOSER::hiopCheckGpuError(T result, const char* const file, int const line) { if(result) { - nlp_->log->printf(hovError, "CUDA error at %s:%d, error# %d\n", file, line, result); + nlp_->log->printf(hovError, "GPU error at %s:%d, error# %d\n", file, line, result); assert(false); } } +#endif } // namespace hiop diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp index fa1bbc5..d53ba60 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp @@ -119,9 +119,12 @@ class hiopLinSolverSymSparseEVLOSER : public hiopLinSolverSymSparse int* index_convert_CSR2Triplet_host_; int* index_convert_extra_Diag2CSR_host_; +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) // Mapping on the device int* index_convert_CSR2Triplet_device_; int* index_convert_extra_Diag2CSR_device_; +#endif // Algorithm control flags int factorizationSetupSucc_; @@ -129,9 +132,6 @@ class hiopLinSolverSymSparseEVLOSER : public hiopLinSolverSymSparse hiopMatrixSparse* M_host_{nullptr}; ///< Host mirror for the KKT matrix - /* private function: creates a cuSolver data structure from KLU data - * structures. */ - /** called the very first time a matrix is factored. Perform KLU * factorization, allocate all aux variables * @@ -152,8 +152,11 @@ class hiopLinSolverSymSparseEVLOSER : public hiopLinSolverSymSparse /** Function to compute column indices and matrix values arrays */ void set_csr_indices_values(); +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ + defined(HIOP_USE_HIP) || defined(HAVE_HIP) template void hiopCheckGpuError(T result, const char* const file, int const line); +#endif }; } // namespace hiop diff --git a/src/Optimization/hiopDualsUpdater.cpp b/src/Optimization/hiopDualsUpdater.cpp index 5d52e0b..7dc1748 100644 --- a/src/Optimization/hiopDualsUpdater.cpp +++ b/src/Optimization/hiopDualsUpdater.cpp @@ -71,11 +71,11 @@ #ifdef HIOP_USE_PARDISO #include "hiopLinSolverSparsePARDISO.hpp" #endif -#ifdef HIOP_USE_RESOLVE -// ReSolve is still CUDA-only; EVLOSER below covers the HIP-capable sparse solver path. #if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) #include "hiopLinSolverSparseReSolve.hpp" #endif + +#ifdef HIOP_USE_EVLOSER #include "hiopLinSolverSparseEVLOSER.hpp" #endif #ifdef HIOP_USE_GINKGO @@ -430,34 +430,35 @@ bool hiopDualsLsqUpdateLinsysAugSparse::instantiate_linear_solver(const char* li // Under gpu compute_mode, which is work in progress, the initialization should be done only using // GPU sparse linear solvers. -#ifdef HIOP_USE_RESOLVE +#if defined(HIOP_USE_RESOLVE) || defined(HIOP_USE_EVLOSER) if(compute_mode == "gpu") { - // EVLOSER is valid here because it uses the same dual-init sparse solver path as ReSolve. assert((linear_solver == "resolve" || linear_solver == "evloser" || linear_solver == "auto") && "the value for duals_init_linear_solver_sparse is invalid and should have been corrected during " "options processing"); } if(fact_acceptor == "inertia_correction") { nlp_->log->printf(hovError, - "LSQ linear solver with ReSolve does not support inertia correction. " + "LSQ linear solver with ReSolve or EVLOSER does not support inertia correction. " "Please set option 'fact_acceptor' to 'inertia_free'.\n"); assert(false); return false; } - // This is our first choice on the device. + +#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) + // ReSolve remains the first choice when it is available. if(linear_solver == "resolve" || linear_solver == "auto") { ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: ReSolve "; - // Only build the ReSolve solver object when CUDA is enabled. -#if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) lin_sys_ = new hiopLinSolverSymSparseReSolve(n, nnz, nlp_); -#endif } - // EVLOSER has its own solver object but uses this same dual-init allocation point. - if(linear_solver == "evloser") { +#endif + +#ifdef HIOP_USE_EVLOSER + if(nullptr == lin_sys_ && (linear_solver == "evloser" || linear_solver == "auto")) { ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: EVLOSER "; lin_sys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); } -#else // of #ifdef HIOP_USE_RESOLVE +#endif +#else // no ReSolve or EVLOSER support // under compute mode gpu, at this point we don't have a sparse linear solver if(compute_mode == "gpu") { if(linear_solver == "auto") { diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp index 4a4a550..c44760e 100644 --- a/src/Optimization/hiopKKTLinSysSparse.cpp +++ b/src/Optimization/hiopKKTLinSysSparse.cpp @@ -57,11 +57,11 @@ #ifdef HIOP_USE_PARDISO #include "hiopLinSolverSparsePARDISO.hpp" #endif -#ifdef HIOP_USE_RESOLVE -// ReSolve is still CUDA-only; EVLOSER below covers the HIP-capable sparse solver path. #if defined(HIOP_USE_RESOLVE) && defined(HIOP_USE_CUDA) #include "hiopLinSolverSparseReSolve.hpp" #endif + +#ifdef HIOP_USE_EVLOSER #include "hiopLinSolverSparseEVLOSER.hpp" #endif #ifdef HIOP_USE_GINKGO @@ -385,7 +385,7 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXYcYd::determineAndCreateLi // EVLOSER has its own solver object but uses this same sparse KKT selection point. if(nullptr == linSys_ && linear_solver == "evloser") { -#if defined(HIOP_USE_RESOLVE) +#if defined(HIOP_USE_EVLOSER) linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); linsol_actual = "EVLOSER"; auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); @@ -785,7 +785,7 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL // EVLOSER has its own solver object but uses this same sparse KKT selection point. if(nullptr == linSys_ && linear_solver == "evloser") { -#if defined(HIOP_USE_RESOLVE) +#if defined(HIOP_USE_EVLOSER) actual_lin_solver = "EVLOSER"; linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); @@ -876,7 +876,7 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL // EVLOSER has its own solver object but uses this same sparse KKT selection point. if(nullptr == linSys_ && linear_solver == "evloser") { -#if defined(HIOP_USE_RESOLVE) +#if defined(HIOP_USE_EVLOSER) linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); nlp_->log->printf(hovScalars, "KKT_SPARSE_XDYcYd linsys: alloc EVLOSER size %d (%d cons) (gpu)\n", n, neq + nineq); auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); From 59cb53282998984e479b138bd7e3f4d1baf02b76 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 24 Jun 2026 12:08:17 -0700 Subject: [PATCH 21/28] Add EVLOSER execution mode selection --- src/LinAlg/EVLOSER/MatrixCsr.cpp | 62 +-- src/LinAlg/EVLOSER/MatrixCsr.hpp | 5 +- src/LinAlg/EVLOSER/RefactorizationSolver.cpp | 206 +++++----- src/LinAlg/EVLOSER/RefactorizationSolver.hpp | 7 +- src/LinAlg/EVLOSER/evloser_execution_mode.hpp | 16 + src/LinAlg/hiopLinSolverSparseEVLOSER.cpp | 368 +++++++++--------- src/LinAlg/hiopLinSolverSparseEVLOSER.hpp | 2 + 7 files changed, 372 insertions(+), 294 deletions(-) create mode 100644 src/LinAlg/EVLOSER/evloser_execution_mode.hpp diff --git a/src/LinAlg/EVLOSER/MatrixCsr.cpp b/src/LinAlg/EVLOSER/MatrixCsr.cpp index ba67342..7b55ce7 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.cpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.cpp @@ -71,7 +71,7 @@ namespace EVLOSER { -MatrixCsr::MatrixCsr() {} +MatrixCsr::MatrixCsr(ExecutionMode execution_mode) : execution_mode_(execution_mode) {} MatrixCsr::~MatrixCsr() { @@ -82,6 +82,10 @@ MatrixCsr::~MatrixCsr() defined(HIOP_USE_HIP) || defined(HAVE_HIP) bool MatrixCsr::has_device_storage() const { + if(execution_mode_ == ExecutionMode::CPU) { + return false; + } + const bool size_allocated = (n_ == 0) || (irows_ != nullptr); const bool nnz_allocated = (nnz_ == 0) || (jcols_ != nullptr && vals_ != nullptr); return size_allocated && nnz_allocated; @@ -101,9 +105,9 @@ void MatrixCsr::allocate_size(int n) #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - - storage_allocated = storage_allocated || irows_ != nullptr; - + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + storage_allocated = storage_allocated || irows_ != nullptr; + } #endif if(storage_allocated) { @@ -114,9 +118,9 @@ void MatrixCsr::allocate_size(int n) #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - - checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&irows_), (n_ + 1) * sizeof(int))); - + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&irows_), (n_ + 1) * sizeof(int))); + } #endif irows_host_ = new int[n_ + 1]{0}; @@ -128,21 +132,21 @@ void MatrixCsr::allocate_nnz(int nnz) #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - - storage_allocated = storage_allocated || jcols_ != nullptr || vals_ != nullptr; - + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + storage_allocated = storage_allocated || jcols_ != nullptr || vals_ != nullptr; + } #endif if(storage_allocated) { #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { checkGpuErrors(evloserGpuFree(jcols_)); checkGpuErrors(evloserGpuFree(vals_)); jcols_ = nullptr; vals_ = nullptr; - + } #endif delete[] jcols_host_; @@ -161,10 +165,10 @@ void MatrixCsr::allocate_nnz(int nnz) #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - - checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&jcols_), nnz_ * sizeof(int))); - checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&vals_), nnz_ * sizeof(double))); - + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&jcols_), nnz_ * sizeof(int))); + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&vals_), nnz_ * sizeof(double))); + } #endif jcols_host_ = new int[nnz_]{0}; @@ -175,15 +179,15 @@ void MatrixCsr::clear_data() { #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + checkGpuErrors(evloserGpuFree(irows_)); + checkGpuErrors(evloserGpuFree(jcols_)); + checkGpuErrors(evloserGpuFree(vals_)); - checkGpuErrors(evloserGpuFree(irows_)); - checkGpuErrors(evloserGpuFree(jcols_)); - checkGpuErrors(evloserGpuFree(vals_)); - - irows_ = nullptr; - jcols_ = nullptr; - vals_ = nullptr; - + irows_ = nullptr; + jcols_ = nullptr; + vals_ = nullptr; + } #endif delete[] irows_host_; @@ -202,6 +206,11 @@ void MatrixCsr::clear_data() defined(HIOP_USE_HIP) || defined(HAVE_HIP) void MatrixCsr::update_from_host_mirror() { + if(execution_mode_ == ExecutionMode::CPU) { + assert(false && "Cannot update device storage in CPU execution mode."); + return; + } + assert(has_device_storage()); assert(has_host_mirror()); @@ -215,6 +224,11 @@ void MatrixCsr::update_from_host_mirror() void MatrixCsr::copy_to_host_mirror() { + if(execution_mode_ == ExecutionMode::CPU) { + assert(false && "Cannot copy from device storage in CPU execution mode."); + return; + } + assert(has_device_storage()); assert(has_host_mirror()); diff --git a/src/LinAlg/EVLOSER/MatrixCsr.hpp b/src/LinAlg/EVLOSER/MatrixCsr.hpp index 882894b..7465354 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.hpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.hpp @@ -1,12 +1,14 @@ #pragma once +#include "evloser_execution_mode.hpp" + namespace EVLOSER { class MatrixCsr { public: - MatrixCsr(); + explicit MatrixCsr(ExecutionMode execution_mode); ~MatrixCsr(); /// Allocate device and host row-pointer storage for an n-by-n CSR matrix. @@ -93,6 +95,7 @@ class MatrixCsr #endif private: + const ExecutionMode execution_mode_; ///< Selected CPU, CUDA, or HIP execution path int n_{0}; int nnz_{0}; diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp index 784285b..ab4c9fd 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -268,30 +268,35 @@ bool validate_host_csr_factor(const char* name, int n, int nnz, const HostCsrFac } // namespace #endif -RefactorizationSolver::RefactorizationSolver(int n) - : n_(n) +RefactorizationSolver::RefactorizationSolver(int n, ExecutionMode execution_mode) + : n_(n), + execution_mode_(execution_mode) { - mat_A_csr_ = new MatrixCsr(); + mat_A_csr_ = new MatrixCsr(execution_mode_); #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - // handles - cusparseCreate(&handle_); - cusolverSpCreate(&handle_cusolver_); - cublasCreate(&handle_cublas_); - - // descriptors - cusparseCreateMatDescr(&descr_A_); - cusparseSetMatType(descr_A_, CUSPARSE_MATRIX_TYPE_GENERAL); - cusparseSetMatIndexBase(descr_A_, CUSPARSE_INDEX_BASE_ZERO); + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + // handles + cusparseCreate(&handle_); + cusolverSpCreate(&handle_cusolver_); + cublasCreate(&handle_cublas_); + + // descriptors + cusparseCreateMatDescr(&descr_A_); + cusparseSetMatType(descr_A_, CUSPARSE_MATRIX_TYPE_GENERAL); + cusparseSetMatIndexBase(descr_A_, CUSPARSE_INDEX_BASE_ZERO); + } #endif // Allocate host mirror for the solution vector hostx_ = new double[n_]; #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - // Allocate solution and rhs vectors - checkGpuErrors(evloserGpuMalloc((void**)&devx_, n_ * sizeof(double))); - checkGpuErrors(evloserGpuMalloc((void**)&devr_, n_ * sizeof(double))); + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + // Allocate solution and rhs vectors + checkGpuErrors(evloserGpuMalloc((void**)&devx_, n_ * sizeof(double))); + checkGpuErrors(evloserGpuMalloc((void**)&devr_, n_ * sizeof(double))); + } #endif } @@ -305,14 +310,16 @@ RefactorizationSolver::~RefactorizationSolver() #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - // Delete workspaces and handles - if(d_work_ != nullptr) { - (void)evloserGpuFree(d_work_); - } - cusparseDestroy(handle_); - cusolverSpDestroy(handle_cusolver_); - cublasDestroy(handle_cublas_); - cusparseDestroyMatDescr(descr_A_); + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + // Delete workspaces and handles + if(d_work_ != nullptr) { + (void)evloserGpuFree(d_work_); + } + cusparseDestroy(handle_); + cusolverSpDestroy(handle_cusolver_); + cublasDestroy(handle_cublas_); + cusparseDestroyMatDescr(descr_A_); + } #endif // Delete host mirror for the solution vector @@ -320,29 +327,31 @@ RefactorizationSolver::~RefactorizationSolver() #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - // Delete residual and solution vectors - if(devr_ != nullptr) { - (void)evloserGpuFree(devr_); - } - if(devx_ != nullptr) { - (void)evloserGpuFree(devx_); - } - - // Delete matrix descriptor used in cuSolverGLU setup - if(cusolver_glu_enabled_) { - cusparseDestroyMatDescr(descr_M_); - cusolverSpDestroyGluInfo(info_M_); - } - - if(cusolver_rf_enabled_) { - if(d_P_ != nullptr) { - (void)evloserGpuFree(d_P_); + if(execution_mode_ == ExecutionMode::CUDA || execution_mode_ == ExecutionMode::HIP) { + // Delete residual and solution vectors + if(devr_ != nullptr) { + (void)evloserGpuFree(devr_); } - if(d_Q_ != nullptr) { - (void)evloserGpuFree(d_Q_); + if(devx_ != nullptr) { + (void)evloserGpuFree(devx_); } - if(d_T_ != nullptr) { - (void)evloserGpuFree(d_T_); + + // Delete matrix descriptor used in cuSolverGLU setup + if(cusolver_glu_enabled_) { + cusparseDestroyMatDescr(descr_M_); + cusolverSpDestroyGluInfo(info_M_); + } + + if(cusolver_rf_enabled_) { + if(d_P_ != nullptr) { + (void)evloserGpuFree(d_P_); + } + if(d_Q_ != nullptr) { + (void)evloserGpuFree(d_Q_); + } + if(d_T_ != nullptr) { + (void)evloserGpuFree(d_T_); + } } } #endif @@ -356,6 +365,13 @@ RefactorizationSolver::~RefactorizationSolver() defined(HIOP_USE_HIP) || defined(HAVE_HIP) void RefactorizationSolver::enable_iterative_refinement() { + if(execution_mode_ == ExecutionMode::CPU) { + if(!silent_output_) { + std::cout << "[EVLOSER] Iterative refinement is unavailable in CPU execution mode.\n"; + } + return; + } + if(ir_ == nullptr) { ir_ = new IterativeRefinement(); } @@ -373,7 +389,7 @@ void RefactorizationSolver::disable_iterative_refinement() bool RefactorizationSolver::iterative_refinement_active() const { - return iterative_refinement_enabled_ && ir_ != nullptr && use_ir_ == "yes"; + return execution_mode_ != ExecutionMode::CPU && iterative_refinement_enabled_ && ir_ != nullptr && use_ir_ == "yes"; } // TODO: Refactor to only pass mat_A_csr_ to setup_system_matrix; n and nnz can be read from mat_A_csr_ @@ -550,9 +566,23 @@ void RefactorizationSolver::setup_refactorization() return; } + if(execution_mode_ == ExecutionMode::CPU) { + if(!silent_output_) { + std::cout << "[EVLOSER] CPU refactorization setup is not available yet.\n"; + } + return; + } + #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) if(refact_ == "glu") { + if(execution_mode_ != ExecutionMode::CUDA) { + if(!silent_output_) { + std::cout << "[EVLOSER] GLU refactorization requires CUDA execution mode.\n"; + } + return; + } + if(initializeCusolverGLU() != 0) { return; } @@ -586,9 +616,23 @@ int RefactorizationSolver::refactorize() return -1; } + if(execution_mode_ == ExecutionMode::CPU) { + if(!silent_output_) { + std::cout << "[EVLOSER] CPU refactorization is not available yet.\n"; + } + return -1; + } + #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) if(refact_ == "glu") { + if(execution_mode_ != ExecutionMode::CUDA) { + if(!silent_output_) { + std::cout << "[EVLOSER] GLU refactorization requires CUDA execution mode.\n"; + } + return -1; + } + sp_status_ = cusolverSpDgluReset(handle_cusolver_, n_, /* A is original matrix */ @@ -621,19 +665,26 @@ int RefactorizationSolver::refactorize() #endif } -bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string memspace) +bool RefactorizationSolver::triangular_solve(double* dx, double tol) { + if(execution_mode_ == ExecutionMode::CPU) { + if(!silent_output_) { + std::cout << "[EVLOSER] CPU triangular solve is not available yet.\n"; + } + return false; + } + #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) if(refact_ == "glu") { - double* devx = nullptr; - if(memspace == "device") { - checkGpuErrors(evloserGpuMemcpy(devr_, dx, sizeof(double) * n_, evloserMemcpyDeviceToDevice)); - devx = dx; - } else { - checkGpuErrors(evloserGpuMemcpy(devr_, dx, sizeof(double) * n_, evloserMemcpyHostToDevice)); - devx = devx_; + if(execution_mode_ != ExecutionMode::CUDA) { + if(!silent_output_) { + std::cout << "[EVLOSER] GLU solve requires CUDA execution mode.\n"; + } + return false; } + + checkGpuErrors(evloserGpuMemcpy(devr_, dx, sizeof(double) * n_, evloserMemcpyDeviceToDevice)); sp_status_ = cusolverSpDgluSolve(handle_cusolver_, n_, /* A is original matrix */ @@ -643,54 +694,33 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string mat_A_csr_->device_irows(), mat_A_csr_->device_jcols(), devr_, /* right hand side */ - devx, /* left hand side, local pointer */ + dx, /* left hand side */ &ite_refine_succ_, &r_nrminf_, info_M_, d_work_); - if(sp_status_ != 0 && !silent_output_) { - std::cout << "GLU solve failed with status: " << sp_status_ << "\n"; + if(sp_status_ != 0) { + if(!silent_output_) { + std::cout << "GLU solve failed with status: " << sp_status_ << "\n"; + } return false; } - if(memspace == "device") { - // do nothing - } else { - checkGpuErrors(evloserGpuMemcpy(dx, devx_, sizeof(double) * n_, evloserMemcpyDeviceToHost)); - } return true; } if(refact_ == "rf") { // First solve is performed on CPU if(is_first_solve_) { - double* hostx = nullptr; - if(memspace == "device") { - checkGpuErrors(evloserGpuMemcpy(hostx_, dx, sizeof(double) * n_, evloserMemcpyDeviceToHost)); - hostx = hostx_; - } else { - hostx = dx; - } - (void)klu_solve(Symbolic_, Numeric_, n_, 1, hostx, &Common_); // replace dx with hostx + checkGpuErrors(evloserGpuMemcpy(hostx_, dx, sizeof(double) * n_, evloserMemcpyDeviceToHost)); + (void)klu_solve(Symbolic_, Numeric_, n_, 1, hostx_, &Common_); klu_free_numeric(&Numeric_, &Common_); klu_free_symbolic(&Symbolic_, &Common_); is_first_solve_ = false; - if(memspace == "device") { - checkGpuErrors(evloserGpuMemcpy(dx, hostx, sizeof(double) * n_, evloserMemcpyHostToDevice)); - } else { - // do nothing - } + checkGpuErrors(evloserGpuMemcpy(dx, hostx_, sizeof(double) * n_, evloserMemcpyHostToDevice)); return true; } - double* devx = nullptr; - if(memspace == "device") { - devx = dx; - checkGpuErrors(evloserGpuMemcpy(devr_, dx, sizeof(double) * n_, evloserMemcpyDeviceToDevice)); - } else { - checkGpuErrors(evloserGpuMemcpy(devx_, dx, sizeof(double) * n_, evloserMemcpyHostToDevice)); - checkGpuErrors(evloserGpuMemcpy(devr_, devx_, sizeof(double) * n_, evloserMemcpyDeviceToDevice)); - devx = devx_; - } + checkGpuErrors(evloserGpuMemcpy(devr_, dx, sizeof(double) * n_, evloserMemcpyDeviceToDevice)); // Each next solve is performed on GPU sp_status_ = evloserRfSolve(handle_rf_, @@ -699,7 +729,7 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string 1, d_T_, n_, - devx, // replace devx_ with local pointer devx + dx, n_); if(sp_status_ != 0) { if(!silent_output_) std::cout << "Rf solve failed with status: " << sp_status_ << "\n"; @@ -710,7 +740,7 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string // Set tolerance based on barrier parameter mu ir_->set_tol(tol); - ir_->fgmres(devx, devr_); // replace devx_ with local pointer devx + ir_->fgmres(dx, devr_); if(!silent_output_ && (ir_->getFinalResidalNorm() > tol * ir_->getBNorm())) { std::cout << "[Warning] Iterative refinement did not converge!\n"; std::cout << "\t Iterative refinement tolerance " << tol << "\n"; @@ -720,11 +750,6 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string << "\t number of iterations: " << ir_->getFinalNumberOfIterations() << "\n"; } } - if(memspace == "device") { - // do nothing - } else { - checkGpuErrors(evloserGpuMemcpy(dx, devx_, sizeof(double) * n_, evloserMemcpyDeviceToHost)); - } return true; } @@ -736,7 +761,6 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol, std::string (void)dx; (void)tol; - (void)memspace; if(!silent_output_) { std::cout << "[EVLOSER] GPU triangular solve is unavailable in this build.\n"; diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp index 9b9a4f8..d3300dd 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp @@ -57,6 +57,7 @@ #pragma once #include "klu.h" +#include "evloser_execution_mode.hpp" #include "evloser_gpu_defs.hpp" #include @@ -75,7 +76,7 @@ class RefactorizationSolver public: // constructor // RefactorizationSolver(); - RefactorizationSolver(int n); + RefactorizationSolver(int n, ExecutionMode execution_mode); ~RefactorizationSolver(); #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ @@ -162,12 +163,14 @@ class RefactorizationSolver * @param tol * @return bool */ - bool triangular_solve(double* dx, double tol, std::string memspace); + bool triangular_solve(double* dx, double tol); private: int n_{0}; ///< Size of the linear system int nnz_{0}; ///< Number of nonzeros in the system's matrix + const ExecutionMode execution_mode_; ///< Selected CPU, CUDA, or HIP execution path + MatrixCsr* mat_A_csr_{nullptr}; ///< System matrix in nonsymmetric CSR format IterativeRefinement* ir_{nullptr}; ///< Iterative refinement class diff --git a/src/LinAlg/EVLOSER/evloser_execution_mode.hpp b/src/LinAlg/EVLOSER/evloser_execution_mode.hpp new file mode 100644 index 0000000..fddf86c --- /dev/null +++ b/src/LinAlg/EVLOSER/evloser_execution_mode.hpp @@ -0,0 +1,16 @@ +#ifndef EVLOSER_EXECUTION_MODE_HPP +#define EVLOSER_EXECUTION_MODE_HPP + +namespace EVLOSER +{ + +enum class ExecutionMode +{ + CPU, + CUDA, + HIP +}; + +} // namespace EVLOSER + +#endif diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp index 8048eb8..8994133 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.cpp @@ -69,6 +69,7 @@ #endif #include +#include #include #include #include @@ -97,7 +98,9 @@ __global__ void evloser_map_arrays_kernel(T* dst, const T* src, const I* mapidx, dst[tid] = src[mapidx[tid]]; } } +#endif +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) /** * @brief Map elements of one array to the other * @@ -116,13 +119,42 @@ __global__ void evloser_add_to_array_kernel(T* dst, const T* src, const I* mapid if(mapidx[tid] != -1) dst[mapidx[tid]] += src[nnz - n + tid]; } } - #endif namespace hiop { +namespace +{ + +EVLOSER::ExecutionMode select_evloser_execution_mode(hiopNlpFormulation* nlp) +{ + const std::string mem_space = nlp->options->GetString("mem_space"); + + if(mem_space == "host" || mem_space == "default") { + return EVLOSER::ExecutionMode::CPU; + } + + if(mem_space == "device") { +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) + return EVLOSER::ExecutionMode::CUDA; +#elif defined(HIOP_USE_HIP) || defined(HAVE_HIP) + return EVLOSER::ExecutionMode::HIP; +#else + nlp->log->printf(hovError, + "EVLOSER device execution was requested, but HiOp was not built with CUDA or HIP support.\n"); + std::abort(); +#endif + } + + nlp->log->printf(hovError, "Memory space %s is incompatible with EVLOSER.\n", mem_space.c_str()); + std::abort(); +} + +} // namespace + hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const int& nnz, hiopNlpFormulation* nlp) : hiopLinSolverSymSparse(n, nnz, nlp), + execution_mode_{select_evloser_execution_mode(nlp)}, solver_{nullptr}, m_{n}, n_{n}, @@ -138,10 +170,10 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const is_first_call_{true} { // Create embedded EVLOSER refactorization solver - solver_ = new EVLOSER::RefactorizationSolver(n); + solver_ = new EVLOSER::RefactorizationSolver(n, execution_mode_); - // If memory space is device, allocate host mirror for HiOp's KKT matrix in triplet format - if(nlp_->options->GetString("mem_space") == "device") { + // Device execution requires a host mirror for HiOp's KKT matrix. + if(execution_mode_ != EVLOSER::ExecutionMode::CPU) { M_host_ = LinearAlgebraFactory::create_matrix_sparse("default", n, n, nnz); } @@ -186,95 +218,97 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const // by default, dont use iterative refinement std::string use_ir{"no"}; - #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - int maxit_test = nlp_->options->GetInteger("ir_inner_maxit"); + if(execution_mode_ != EVLOSER::ExecutionMode::CPU) { + int maxit_test = nlp_->options->GetInteger("ir_inner_maxit"); - if((maxit_test < 0) || (maxit_test > 1000)) { - nlp_->log->printf(hovWarning, - "Wrong maxit value: %d. Use int maxit value between 0 and 1000. Setting default (50) ...\n", - maxit_test); - maxit_test = 50; - } + + if((maxit_test < 0) || (maxit_test > 1000)) { + nlp_->log->printf(hovWarning, + "Wrong maxit value: %d. Use int maxit value between 0 and 1000. Setting default (50) ...\n", + maxit_test); + maxit_test = 50; + } #if defined(HIOP_USE_HIP) || defined(HAVE_HIP) - // EVLOSER iterative refinement currently depends on CUDA-only kernels. - // Keep the HIP path on RF only until the IR path is ported. - solver_->disable_iterative_refinement(); -#else - if(maxit_test > 0) { - use_ir = "yes"; - solver_->enable_iterative_refinement(); - solver_->ir()->maxit() = maxit_test; - } else { + // EVLOSER iterative refinement currently depends on CUDA-only kernels. + // Keep the HIP path on RF only until the IR path is ported. solver_->disable_iterative_refinement(); - } -#endif - if(use_ir == "yes") { - if((refact == "rf")) { - solver_->ir()->restart() = nlp_->options->GetInteger("ir_inner_restart"); - - if((solver_->ir()->restart() < 0) || (solver_->ir()->restart() > 100)) { - nlp_->log->printf(hovWarning, - "Wrong restart value: %d. Use int restart value between 1 and 100. Setting default (20) ...\n", - solver_->ir()->restart()); - solver_->ir()->restart() = 20; - } - - solver_->ir()->tol() = nlp_->options->GetNumeric("ir_inner_tol"); - if((solver_->ir()->tol() < 0) || (solver_->ir()->tol() > 1)) { - nlp_->log->printf(hovWarning, - "Wrong tol value: %e. Use double tol value between 0 and 1. Setting default (1e-12) ...\n", - solver_->ir()->tol()); - solver_->ir()->tol() = 1e-12; - } - solver_->ir()->orth_option() = nlp_->options->GetString("ir_inner_gs_scheme"); - - /* 0) "Standard" GMRES and FGMRES (Saad and Schultz, 1986, Saad, 1992) use Modified Gram-Schmidt ("mgs") to keep the - * Krylov vectors orthogonal. Modified Gram-Schmidt requires k synchronization (due to inner products) in iteration k - * and this becomes a scaling bottleneck for GPU-accelerated implementation and it becomes even more pronouced for - * MPI+GPU-acceleration. Modified Gram-Schidt can be replaced by a different scheme. - * - * 1) One can use Classical Gram-Schmidt ("cgs") which is numerically unstable or reorthogonalized Classical - * Gram-Schmidt ("cgs2"), which is numerically stable and requires 3 synchrnozations and each iteration. - * Reorthogonalized Classical Gram-Schmidt makes two passes of Classical Gram-Schmidt. And two passes are enough to get - * vectors orthogonal to machine precision (Bjorck 1967). - * - * 2) An alternative is a low-sych version (Swirydowicz and Thomas, 2020), which reformulates Modified Gram-Schmidt to - * be a (very small) triangular solve. It requires extra storage for the matrix used in triangular solve (kxk at - * iteration k), but only two sycnhronizations are needed per iteration. The inner producats are performed in bulk, - * which quarantees better GPU utilization. The second synchronization comes from normalizing the vector and can be - * eliminated if the norm is postponed to the next iteration, but also makes code more complicated. This is why we use - * two-synch method ("mgs_two_synch") - * - * 3) A recently submitted paper by Stephen Thomas (Thomas 202*) takes the triangular solve idea further and uses a - * different approximation for the inverse of a triangular matrix. It requires two (very small) triangular solves and - * two sychroniztions (if the norm is NOT delayed). It also guarantees that the vectors are orthogonal to the machine - * epsilon, as in cgs2. Since Stephen's paper is named "post modern GMRES", we call this Gram-Schmidt scheme "mgs_pm". - */ - if(solver_->ir()->orth_option() != "mgs" && solver_->ir()->orth_option() != "cgs2" && - solver_->ir()->orth_option() != "mgs_two_synch" && solver_->ir()->orth_option() != "mgs_pm") { - nlp_->log->printf( - hovWarning, - "mgs option : %s is wrong. Use 'mgs', 'cgs2', 'mgs_two_synch' or 'mgs_pm'. Switching to default (mgs) ...\n", - use_ir.c_str()); - solver_->ir()->orth_option() = "mgs"; - } - - solver_->ir()->conv_cond() = nlp_->options->GetInteger("ir_inner_conv_cond"); - - if((solver_->ir()->conv_cond() < 0) || (solver_->ir()->conv_cond() > 2)) { - nlp_->log->printf(hovWarning, - "Wrong IR convergence condition: %d. Use int value: 0, 1 or 2. Setting default (0) ...\n", - solver_->ir()->conv_cond()); - solver_->ir()->conv_cond() = 0; - } - + #else + if(maxit_test > 0) { + use_ir = "yes"; + solver_->enable_iterative_refinement(); + solver_->ir()->maxit() = maxit_test; } else { - nlp_->log->printf(hovWarning, "Currently, inner iterative refinement works ONLY with cuSolverRf ... \n"); - use_ir = "no"; solver_->disable_iterative_refinement(); } + #endif + if(use_ir == "yes") { + if((refact == "rf")) { + solver_->ir()->restart() = nlp_->options->GetInteger("ir_inner_restart"); + + if((solver_->ir()->restart() < 0) || (solver_->ir()->restart() > 100)) { + nlp_->log->printf(hovWarning, + "Wrong restart value: %d. Use int restart value between 1 and 100. Setting default (20) ...\n", + solver_->ir()->restart()); + solver_->ir()->restart() = 20; + } + + solver_->ir()->tol() = nlp_->options->GetNumeric("ir_inner_tol"); + if((solver_->ir()->tol() < 0) || (solver_->ir()->tol() > 1)) { + nlp_->log->printf(hovWarning, + "Wrong tol value: %e. Use double tol value between 0 and 1. Setting default (1e-12) ...\n", + solver_->ir()->tol()); + solver_->ir()->tol() = 1e-12; + } + solver_->ir()->orth_option() = nlp_->options->GetString("ir_inner_gs_scheme"); + + /* 0) "Standard" GMRES and FGMRES (Saad and Schultz, 1986, Saad, 1992) use Modified Gram-Schmidt ("mgs") to keep the + * Krylov vectors orthogonal. Modified Gram-Schmidt requires k synchronization (due to inner products) in iteration k + * and this becomes a scaling bottleneck for GPU-accelerated implementation and it becomes even more pronouced for + * MPI+GPU-acceleration. Modified Gram-Schidt can be replaced by a different scheme. + * + * 1) One can use Classical Gram-Schmidt ("cgs") which is numerically unstable or reorthogonalized Classical + * Gram-Schmidt ("cgs2"), which is numerically stable and requires 3 synchrnozations and each iteration. + * Reorthogonalized Classical Gram-Schmidt makes two passes of Classical Gram-Schmidt. And two passes are enough to get + * vectors orthogonal to machine precision (Bjorck 1967). + * + * 2) An alternative is a low-sych version (Swirydowicz and Thomas, 2020), which reformulates Modified Gram-Schmidt to + * be a (very small) triangular solve. It requires extra storage for the matrix used in triangular solve (kxk at + * iteration k), but only two sycnhronizations are needed per iteration. The inner producats are performed in bulk, + * which quarantees better GPU utilization. The second synchronization comes from normalizing the vector and can be + * eliminated if the norm is postponed to the next iteration, but also makes code more complicated. This is why we use + * two-synch method ("mgs_two_synch") + * + * 3) A recently submitted paper by Stephen Thomas (Thomas 202*) takes the triangular solve idea further and uses a + * different approximation for the inverse of a triangular matrix. It requires two (very small) triangular solves and + * two sychroniztions (if the norm is NOT delayed). It also guarantees that the vectors are orthogonal to the machine + * epsilon, as in cgs2. Since Stephen's paper is named "post modern GMRES", we call this Gram-Schmidt scheme "mgs_pm". + */ + if(solver_->ir()->orth_option() != "mgs" && solver_->ir()->orth_option() != "cgs2" && + solver_->ir()->orth_option() != "mgs_two_synch" && solver_->ir()->orth_option() != "mgs_pm") { + nlp_->log->printf( + hovWarning, + "mgs option : %s is wrong. Use 'mgs', 'cgs2', 'mgs_two_synch' or 'mgs_pm'. Switching to default (mgs) ...\n", + use_ir.c_str()); + solver_->ir()->orth_option() = "mgs"; + } + + solver_->ir()->conv_cond() = nlp_->options->GetInteger("ir_inner_conv_cond"); + + if((solver_->ir()->conv_cond() < 0) || (solver_->ir()->conv_cond() > 2)) { + nlp_->log->printf(hovWarning, + "Wrong IR convergence condition: %d. Use int value: 0, 1 or 2. Setting default (0) ...\n", + solver_->ir()->conv_cond()); + solver_->ir()->conv_cond() = 0; + } + + } else { + nlp_->log->printf(hovWarning, "Currently, inner iterative refinement works ONLY with cuSolverRf ... \n"); + use_ir = "no"; + solver_->disable_iterative_refinement(); + } + } } #endif @@ -285,22 +319,20 @@ hiopLinSolverSymSparseEVLOSER::hiopLinSolverSymSparseEVLOSER(const int& n, const hiopLinSolverSymSparseEVLOSER::~hiopLinSolverSymSparseEVLOSER() { delete solver_; - - // If memory space is device, delete allocated host mirrors - if(nlp_->options->GetString("mem_space") == "device") { - delete M_host_; - } + delete M_host_; + M_host_ = nullptr; // Delete CSR <--> triplet mappings delete[] index_convert_CSR2Triplet_host_; delete[] index_convert_extra_Diag2CSR_host_; - #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - - checkGpuErrors(evloserGpuFree(index_convert_CSR2Triplet_device_)); - checkGpuErrors(evloserGpuFree(index_convert_extra_Diag2CSR_device_)); - + if(index_convert_CSR2Triplet_device_ != nullptr) { + checkGpuErrors(evloserGpuFree(index_convert_CSR2Triplet_device_)); + } + if(index_convert_extra_Diag2CSR_device_ != nullptr) { + checkGpuErrors(evloserGpuFree(index_convert_extra_Diag2CSR_device_)); + } #endif } @@ -347,18 +379,15 @@ bool hiopLinSolverSymSparseEVLOSER::solve(hiopVector& x) // Set IR tolerance double ir_tol = nlp_->options->GetNumeric("ir_inner_tol"); - - std::string mem_space = nlp_->options->GetString("mem_space"); double* dx = x.local_data(); - - bool retval = solver_->triangular_solve(dx, ir_tol, mem_space); + bool retval = solver_->triangular_solve(dx, ir_tol); if(!retval) { nlp_->log->printf(hovError, // catastrophic failure "EVLOSER triangular solve failed\n"); } nlp_->runStats.linsolv.tmTriuSolves.stop(); - return true; + return retval; } void hiopLinSolverSymSparseEVLOSER::firstCall() @@ -366,11 +395,10 @@ void hiopLinSolverSymSparseEVLOSER::firstCall() assert(n_ == M_->n() && M_->n() == M_->m()); assert(n_ > 0); - // If the matrix is on device, copy it to the host mirror - std::string mem_space = nlp_->options->GetString("mem_space"); + // Device execution requires a host copy for the initial KLU factorization. #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - if(mem_space == "device") { + if(execution_mode_ != EVLOSER::ExecutionMode::CPU) { checkGpuErrors( evloserGpuMemcpy(M_host_->M(), M_->M(), sizeof(double) * M_->numberOfNonzeros(), evloserMemcpyDeviceToHost)); checkGpuErrors(evloserGpuMemcpy(M_host_->i_row(), @@ -382,13 +410,7 @@ void hiopLinSolverSymSparseEVLOSER::firstCall() sizeof(index_type) * M_->numberOfNonzeros(), evloserMemcpyDeviceToHost)); } -#else - if(mem_space == "device") { - nlp_->log->printf(hovError, "Device memory is unavailable in this EVLOSER build.\n"); - return; - } #endif - // Transfer triplet to CSR form // Allocate row pointers and compute number of nonzeros. @@ -404,11 +426,13 @@ void hiopLinSolverSymSparseEVLOSER::firstCall() #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - // Copy matrix to device - solver_->mat_A_csr()->update_from_host_mirror(); + if(execution_mode_ != EVLOSER::ExecutionMode::CPU) { + // Copy matrix to device + solver_->mat_A_csr()->update_from_host_mirror(); - if(solver_->use_ir() == "yes") { - solver_->setup_iterative_refinement_matrix(n_, nnz_); + if(solver_->use_ir() == "yes") { + solver_->setup_iterative_refinement_matrix(n_, nnz_); + } } #endif /* @@ -426,62 +450,63 @@ void hiopLinSolverSymSparseEVLOSER::firstCall() /// M_->numberOfNonzeros() is number of zeros in symmetric triplet matrix void hiopLinSolverSymSparseEVLOSER::update_matrix_values() { - std::string mem_space = nlp_->options->GetString("mem_space"); - #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) - if(mem_space == "device") { + if(execution_mode_ == EVLOSER::ExecutionMode::CUDA) { double* csr_vals = solver_->mat_A_csr()->device_vals(); double* coo_vals = M_->M(); int coo_nnz = M_->numberOfNonzeros(); const int blocksize = 512; int gridsize = (nnz_ + blocksize - 1) / blocksize; - evloser_map_arrays_kernel<<>>(csr_vals, coo_vals, index_convert_CSR2Triplet_device_, nnz_); + evloser_map_arrays_kernel + <<>>(csr_vals, coo_vals, index_convert_CSR2Triplet_device_, nnz_); gridsize = (n_ + blocksize - 1) / blocksize; evloser_add_to_array_kernel <<>>(csr_vals, coo_vals, index_convert_extra_Diag2CSR_device_, n_, coo_nnz); // If factorization was not successful, we need a copy of values on the host - if(factorizationSetupSucc_ == 0) + if(factorizationSetupSucc_ == 0) { checkGpuErrors(evloserGpuMemcpy(solver_->mat_A_csr()->host_vals(), solver_->mat_A_csr()->device_vals(), sizeof(double) * nnz_, evloserMemcpyDeviceToHost)); + + } + return; } #endif - hiopMatrixSparse* matrix_source = M_; - + hiopMatrixSparse* matrix = M_; #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - if(mem_space == "device") { - checkGpuErrors( - evloserGpuMemcpy(M_host_->M(), M_->M(), sizeof(double) * M_->numberOfNonzeros(), evloserMemcpyDeviceToHost)); - matrix_source = M_host_; - } -#else - if(mem_space == "device") { - nlp_->log->printf(hovError, "Device memory is unavailable in this EVLOSER build.\n"); - return; + if(execution_mode_ != EVLOSER::ExecutionMode::CPU) { + checkGpuErrors(evloserGpuMemcpy(M_host_->M(), + M_->M(), + sizeof(double) * M_->numberOfNonzeros(), + evloserMemcpyDeviceToHost)); + matrix = M_host_; } #endif - - // KKT matrix is on the host double* vals = solver_->mat_A_csr()->host_vals(); - // update matrix - for(int k = 0; k < nnz_; k++) { - vals[k] = matrix_source->M()[index_convert_CSR2Triplet_host_[k]]; + + for(int k = 0; k < nnz_; ++k) { + vals[k] = matrix->M()[index_convert_CSR2Triplet_host_[k]]; } - for(int i = 0; i < n_; i++) { - if(index_convert_extra_Diag2CSR_host_[i] != -1) - vals[index_convert_extra_Diag2CSR_host_[i]] += matrix_source->M()[matrix_source->numberOfNonzeros() - n_ + i]; + + for(int i = 0; i < n_; ++i) { + if(index_convert_extra_Diag2CSR_host_[i] != -1) { + vals[index_convert_extra_Diag2CSR_host_[i]] += + matrix->M()[matrix->numberOfNonzeros() - n_ + i]; + } } #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) - solver_->mat_A_csr()->update_from_host_mirror(); + if(execution_mode_ != EVLOSER::ExecutionMode::CPU) { + solver_->mat_A_csr()->update_from_host_mirror(); + } #endif } @@ -493,16 +518,10 @@ void hiopLinSolverSymSparseEVLOSER::compute_nnz() // int* row_ptr = solver_->mat_A_csr()->host_irows(); - // If the data is on device, fetch it from the host mirror - hiopMatrixSparse* M_host = nullptr; - std::string mem_space = nlp_->options->GetString("mem_space"); - if(mem_space == "host" || mem_space == "default") { - M_host = M_; - } else if(mem_space == "device") { - M_host = M_host_; - } else { - nlp_->log->printf(hovError, "Memory space %s incompatible with EVLOSER.\n", mem_space.c_str()); - } + hiopMatrixSparse* M_host = + execution_mode_ == EVLOSER::ExecutionMode::CPU ? M_ : M_host_; + + assert(M_host != nullptr); // off-diagonal part row_ptr[0] = 0; @@ -528,16 +547,10 @@ void hiopLinSolverSymSparseEVLOSER::compute_nnz() /// @pre Data is either on the host or the host mirror is synced with the device void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() { - // If the data is on device, fetch it from the host mirror - hiopMatrixSparse* M_host = nullptr; - std::string mem_space = nlp_->options->GetString("mem_space"); - if(mem_space == "host" || mem_space == "default") { - M_host = M_; - } else if(mem_space == "device") { - M_host = M_host_; - } else { - nlp_->log->printf(hovError, "Memory space %s incompatible with EVLOSER.\n", mem_space.c_str()); - } + hiopMatrixSparse* M_host = + execution_mode_ == EVLOSER::ExecutionMode::CPU ? M_ : M_host_; + + assert(M_host != nullptr); // // set correct col index and value @@ -548,14 +561,14 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() index_convert_CSR2Triplet_host_ = new int[nnz_]; index_convert_extra_Diag2CSR_host_ = new int[n_]; -#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ - defined(HIOP_USE_HIP) || defined(HAVE_HIP) - checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_CSR2Triplet_device_), nnz_ * sizeof(int))); - checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_extra_Diag2CSR_device_), n_ * sizeof(int))); +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) + if(execution_mode_ == EVLOSER::ExecutionMode::CUDA) { + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_CSR2Triplet_device_), nnz_ * sizeof(int))); + checkGpuErrors(evloserGpuMalloc(reinterpret_cast(&index_convert_extra_Diag2CSR_device_), n_ * sizeof(int))); + } #endif - int* nnz_each_row_tmp = new int[n_]{0}; - int nnz_tmp{0}, rowID_tmp, colID_tmp; + int total_nnz_tmp{0}, nnz_tmp{0}, rowID_tmp, colID_tmp; for(int k = 0; k < n_; k++) { index_convert_extra_Diag2CSR_host_[k] = -1; @@ -574,6 +587,7 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() index_convert_extra_Diag2CSR_host_[rowID_tmp] = nnz_tmp; nnz_each_row_tmp[rowID_tmp]++; + total_nnz_tmp++; } else { nnz_tmp = nnz_each_row_tmp[rowID_tmp] + row_ptr[rowID_tmp]; col_idx[nnz_tmp] = colID_tmp; @@ -587,6 +601,7 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() nnz_each_row_tmp[rowID_tmp]++; nnz_each_row_tmp[colID_tmp]++; + total_nnz_tmp += 2; } } // correct the missing dia_gonal term @@ -597,6 +612,7 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() col_idx[nnz_tmp] = i; vals[nnz_tmp] = M_host->M()[M_host->numberOfNonzeros() - n_ + i]; index_convert_CSR2Triplet_host_[nnz_tmp] = M_host->numberOfNonzeros() - n_ + i; + total_nnz_tmp += 1; std::vector ind_temp(row_ptr[i + 1] - row_ptr[i]); std::iota(ind_temp.begin(), ind_temp.end(), 0); @@ -609,23 +625,24 @@ void hiopLinSolverSymSparseEVLOSER::set_csr_indices_values() std::sort(col_idx + row_ptr[i], col_idx + row_ptr[i + 1]); } } -#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ - defined(HIOP_USE_HIP) || defined(HAVE_HIP) - checkGpuErrors(evloserGpuMemcpy(index_convert_CSR2Triplet_device_, - index_convert_CSR2Triplet_host_, - nnz_ * sizeof(int), - evloserMemcpyHostToDevice)); - checkGpuErrors(evloserGpuMemcpy(index_convert_extra_Diag2CSR_device_, - index_convert_extra_Diag2CSR_host_, - n_ * sizeof(int), - evloserMemcpyHostToDevice)); +#if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) + if(execution_mode_ == EVLOSER::ExecutionMode::CUDA) { + checkGpuErrors(evloserGpuMemcpy(index_convert_CSR2Triplet_device_, + index_convert_CSR2Triplet_host_, + nnz_ * sizeof(int), + evloserMemcpyHostToDevice)); + checkGpuErrors(evloserGpuMemcpy(index_convert_extra_Diag2CSR_device_, + index_convert_extra_Diag2CSR_host_, + n_ * sizeof(int), + evloserMemcpyHostToDevice)); + } #endif delete[] nnz_each_row_tmp; } #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) -// Error checking utility for GPU backends +// Error checking utility for CUDA // KS: might later become part of src/Utils, putting it here for now template void hiopLinSolverSymSparseEVLOSER::hiopCheckGpuError(T result, const char* const file, int const line) @@ -636,5 +653,4 @@ void hiopLinSolverSymSparseEVLOSER::hiopCheckGpuError(T result, const char* cons } } #endif - } // namespace hiop diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp index d53ba60..c31300a 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp @@ -59,6 +59,7 @@ #include "hiopLinSolver.hpp" #include "hiopMatrixSparseTriplet.hpp" +#include "evloser_execution_mode.hpp" #include /** Implements the sparse linear solver class using the EVLOSER interface @@ -109,6 +110,7 @@ class hiopLinSolverSymSparseEVLOSER : public hiopLinSolverSymSparse } protected: + const EVLOSER::ExecutionMode execution_mode_; ///< Selected EVLOSER execution path EVLOSER::RefactorizationSolver* solver_; int m_; ///< number of rows of the whole matrix From b554dc773023ef05a86dc68ff5ec723d5c8e8211 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 24 Jun 2026 13:35:26 -0700 Subject: [PATCH 22/28] Add KLU CPU execution to EVLOSER --- CMakeLists.txt | 7 + src/LinAlg/EVLOSER/MatrixCsr.cpp | 35 +++ src/LinAlg/EVLOSER/MatrixCsr.hpp | 9 + src/LinAlg/EVLOSER/RefactorizationSolver.cpp | 226 +++++++++++++--- src/LinAlg/EVLOSER/RefactorizationSolver.hpp | 16 +- tests/CMakeLists.txt | 15 ++ tests/LinAlg/testEVLOSERKLUCPU.cpp | 259 +++++++++++++++++++ 7 files changed, 520 insertions(+), 47 deletions(-) create mode 100644 tests/LinAlg/testEVLOSERKLUCPU.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 830b644..e71b373 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -544,6 +544,13 @@ if (HIOP_WITH_MAKETEST) add_test(NAME SparseMatrixTest COMMAND ${RUNCMD} "$") add_test(NAME SymmetricSparseMatrixTest COMMAND ${RUNCMD} "$") + if(HIOP_USE_EVLOSER AND NOT HIOP_USE_GPU) + add_test( + NAME EVLOSERKLUCPU + COMMAND ${RUNCMD} "$" + ) + endif() + # Test drivers in the form of user applications add_subdirectory(src/Drivers) endif(HIOP_WITH_MAKETEST) diff --git a/src/LinAlg/EVLOSER/MatrixCsr.cpp b/src/LinAlg/EVLOSER/MatrixCsr.cpp index 7b55ce7..77c45f7 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.cpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.cpp @@ -62,6 +62,7 @@ #include #include #include +#include #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) @@ -299,6 +300,40 @@ bool MatrixCsr::validate_host_structure(const char* caller, bool silent_output) return true; } +bool MatrixCsr::validate_host_values(const char* caller, bool silent_output) const +{ + const char* caller_name = caller == nullptr ? "unknown caller" : caller; + + auto report = [&](const std::string& message) { + if(!silent_output) { + std::cout << "[EVLOSER] Invalid CSR matrix values in " << caller_name << ": " << message << "\n"; + } + return false; + }; + + if(nnz_ < 0) { + return report("number of nonzeros is negative"); + } + + if(nnz_ == 0) { + return true; + } + + if(vals_host_ == nullptr) { + return report("host value array is null"); + } + + for(int k = 0; k < nnz_; ++k) { + if(!std::isfinite(vals_host_[k])) { + std::ostringstream message; + message << "matrix value at entry " << k << " is not finite"; + return report(message.str()); + } + } + + return true; +} + #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) // Error checking utility for GPU backend diff --git a/src/LinAlg/EVLOSER/MatrixCsr.hpp b/src/LinAlg/EVLOSER/MatrixCsr.hpp index 7465354..12e3c6d 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.hpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.hpp @@ -46,6 +46,15 @@ class MatrixCsr */ bool validate_host_structure(const char* caller, bool silent_output) const; + /** + * @brief Validate that all host-side CSR values are finite. + * + * @param caller Name of the caller used in diagnostic messages. + * @param silent_output Suppress diagnostic output when true. + * @return true if every stored host value is finite. + */ + bool validate_host_values(const char* caller, bool silent_output) const; + #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) /// Return device row-pointer storage. diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp index ab4c9fd..d3ec77b 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -63,6 +63,7 @@ #include "klu.h" #include +#include #include #include #include @@ -355,8 +356,13 @@ RefactorizationSolver::~RefactorizationSolver() } } #endif - klu_free_symbolic(&Symbolic_, &Common_); - klu_free_numeric(&Numeric_, &Common_); + if(Numeric_ != nullptr) { + klu_free_numeric(&Numeric_, &Common_); + } + + if(Symbolic_ != nullptr) { + klu_free_symbolic(&Symbolic_, &Common_); + } delete[] mia_; delete[] mja_; } @@ -430,7 +436,8 @@ bool RefactorizationSolver::validate_system_matrix(const char* caller) const return false; } - return mat_A_csr_->validate_host_structure(caller, silent_output_); + return mat_A_csr_->validate_host_structure(caller, silent_output_) && + mat_A_csr_->validate_host_values(caller, silent_output_); } bool RefactorizationSolver::validate_klu_factorization(const char* caller) const @@ -476,6 +483,30 @@ bool RefactorizationSolver::validate_klu_factorization(const char* caller) const return true; } +bool RefactorizationSolver::validate_solution(const double* solution, const char* caller) const +{ + const char* caller_name = caller == nullptr ? "unknown caller" : caller; + + if(solution == nullptr) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid vector in " << caller_name << ": pointer is null\n"; + } + return false; + } + + for(int i = 0; i < n_; ++i) { + if(!std::isfinite(solution[i])) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid vector in " << caller_name + << ": entry " << i << " is not finite\n"; + } + return false; + } + } + + return true; +} + #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) bool RefactorizationSolver::checkEvloserRfStatus(evloserRfStatus_t status, const char* caller) const @@ -525,39 +556,78 @@ int RefactorizationSolver::refactorizeEvloserRf(const char* caller) int RefactorizationSolver::setup_factorization() { - if(!validate_system_matrix("KLU analysis")) { + if(fact_ != "klu") { + assert(false && "Only KLU is available for the first factorization."); return -1; } - int* row_ptr = mat_A_csr_->host_irows(); - int* col_idx = mat_A_csr_->host_jcols(); + if(!validate_system_matrix("KLU analysis")) { + return -1; + } - if(fact_ == "klu") { - /* initialize KLU setup parameters, dont factorize yet */ - initializeKLU(); + // A new matrix structure invalidates both existing KLU states. + if(Numeric_ != nullptr) { + klu_free_numeric(&Numeric_, &Common_); + } - /*perform KLU but only the symbolic analysis (important) */ + if(Symbolic_ != nullptr) { klu_free_symbolic(&Symbolic_, &Common_); - klu_free_numeric(&Numeric_, &Common_); - Symbolic_ = klu_analyze(n_, row_ptr, col_idx, &Common_); + } - if(Symbolic_ == nullptr) { - return -1; + if(initializeKLU() != 0) { + return -1; + } + + Symbolic_ = klu_analyze(n_, + mat_A_csr_->host_irows(), + mat_A_csr_->host_jcols(), + &Common_); + + if(Symbolic_ == nullptr || Common_.status != KLU_OK) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU symbolic analysis failed with status " + << Common_.status << "\n"; } - } else { // for future - assert(0 && "Only KLU is available for the first factorization.\n"); + return -1; } + return 0; } int RefactorizationSolver::factorize() { + if(!validate_system_matrix("KLU factorization")) { + return -1; + } + + if(Symbolic_ == nullptr || Symbolic_->n != n_) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU factorization requires valid symbolic analysis.\n"; + } + return -1; + } + + // A fresh factorization replaces only the numeric state. + if(Numeric_ != nullptr) { + klu_free_numeric(&Numeric_, &Common_); + } + Numeric_ = klu_factor(mat_A_csr_->host_irows(), mat_A_csr_->host_jcols(), mat_A_csr_->host_vals(), Symbolic_, &Common_); - return (Numeric_ == nullptr) ? -1 : 0; + + if(Numeric_ == nullptr || Common_.status != KLU_OK) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU numeric factorization failed with status " + << Common_.status << "\n"; + } + return -1; + } + + is_first_solve_ = true; + return validate_klu_factorization("KLU factorization") ? 0 : -1; } void RefactorizationSolver::setup_refactorization() @@ -567,9 +637,7 @@ void RefactorizationSolver::setup_refactorization() } if(execution_mode_ == ExecutionMode::CPU) { - if(!silent_output_) { - std::cout << "[EVLOSER] CPU refactorization setup is not available yet.\n"; - } + // KLU numeric state is already available from factorize(). return; } @@ -601,11 +669,6 @@ void RefactorizationSolver::setup_refactorization() } else { // for future - assert(0 && "Only glu and rf refactorizations available.\n"); } -#else - - if(!silent_output_) { - std::cout << "[EVLOSER] GPU refactorization is unavailable in this build.\n"; - } #endif } @@ -617,10 +680,26 @@ int RefactorizationSolver::refactorize() } if(execution_mode_ == ExecutionMode::CPU) { - if(!silent_output_) { - std::cout << "[EVLOSER] CPU refactorization is not available yet.\n"; + if(!validate_klu_factorization("KLU refactorization")) { + return -1; } - return -1; + + const int ok = klu_refactor(mat_A_csr_->host_irows(), + mat_A_csr_->host_jcols(), + mat_A_csr_->host_vals(), + Symbolic_, + Numeric_, + &Common_); + + if(ok == 0 || Common_.status != KLU_OK) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU refactorization failed with status " + << Common_.status << "\n"; + } + return -1; + } + + return 0; } #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ @@ -654,26 +733,53 @@ int RefactorizationSolver::refactorize() } } return 0; -#else +#endif if(!silent_output_) { - std::cout << "[EVLOSER] GPU refactorization is unavailable in this build.\n"; + std::cout << "[EVLOSER] Selected refactorization backend is unavailable in this build.\n"; } return -1; - -#endif } bool RefactorizationSolver::triangular_solve(double* dx, double tol) { - if(execution_mode_ == ExecutionMode::CPU) { + if(dx == nullptr) { if(!silent_output_) { - std::cout << "[EVLOSER] CPU triangular solve is not available yet.\n"; + std::cout << "[EVLOSER] Solve received a null right-hand side.\n"; } return false; } + if(execution_mode_ == ExecutionMode::CPU) { + (void)tol; + + if(!validate_klu_factorization("KLU solve")) { + return false; + } + + if(!validate_solution(dx, "KLU right-hand side")) { + return false; + } + + const int ok = klu_solve(Symbolic_, + Numeric_, + n_, + 1, + dx, + &Common_); + + if(ok == 0 || Common_.status != KLU_OK) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU solve failed with status " + << Common_.status << "\n"; + } + return false; + } + + return validate_solution(dx, "KLU solve"); + } + #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) if(refact_ == "glu") { @@ -711,12 +817,36 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol) if(refact_ == "rf") { // First solve is performed on CPU if(is_first_solve_) { - checkGpuErrors(evloserGpuMemcpy(hostx_, dx, sizeof(double) * n_, evloserMemcpyDeviceToHost)); - (void)klu_solve(Symbolic_, Numeric_, n_, 1, hostx_, &Common_); - klu_free_numeric(&Numeric_, &Common_); - klu_free_symbolic(&Symbolic_, &Common_); + checkGpuErrors(evloserGpuMemcpy(hostx_, + dx, + sizeof(double) * n_, + evloserMemcpyDeviceToHost)); + + const int ok = klu_solve(Symbolic_, + Numeric_, + n_, + 1, + hostx_, + &Common_); + + if(ok == 0 || Common_.status != KLU_OK) { + if(!silent_output_) { + std::cout << "[EVLOSER] Initial KLU solve failed with status " + << Common_.status << "\n"; + } + return false; + } + + if(!validate_solution(hostx_, "initial KLU solve")) { + return false; + } + + checkGpuErrors(evloserGpuMemcpy(dx, + hostx_, + sizeof(double) * n_, + evloserMemcpyHostToDevice)); + is_first_solve_ = false; - checkGpuErrors(evloserGpuMemcpy(dx, hostx_, sizeof(double) * n_, evloserMemcpyHostToDevice)); return true; } @@ -829,16 +959,28 @@ int RefactorizationSolver::createM(const int n, int RefactorizationSolver::initializeKLU() { - klu_defaults(&Common_); + if(klu_defaults(&Common_) == 0) { + if(!silent_output_) { + std::cout << "[EVLOSER] klu_defaults failed.\n"; + } + return -1; + } - // TODO: consider making this a part of setup options so that user can - // set up these values. For now, we keep them hard-wired. + // TODO: consider making these user-configurable. Common_.btf = 0; Common_.ordering = ordering_; // COLAMD=1; AMD=0 Common_.tol = 0.1; Common_.scale = -1; Common_.halt_if_singular = 1; + if(Common_.status != KLU_OK) { + if(!silent_output_) { + std::cout << "[EVLOSER] Invalid KLU initialization status " + << Common_.status << "\n"; + } + return -1; + } + return 0; } diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp index d3300dd..a8ec2de 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp @@ -157,11 +157,14 @@ class RefactorizationSolver int refactorize(); /** - * @brief Invokes triangular solver given matrix factors + * @brief Solve the factored linear system. * - * @param dx - * @param tol - * @return bool + * In CPU execution mode, dx must point to host memory. In CUDA or HIP + * execution mode, dx must point to device memory. + * + * @param dx rhs on entry and solution on return. + * @param tol ir tolerance for GPU execution. + * @return bool true when the solve succeeds and the solution is finite. */ bool triangular_solve(double* dx, double tol); @@ -213,7 +216,7 @@ class RefactorizationSolver // KLU stuff int klu_status_; - klu_common Common_; + klu_common Common_{}; klu_symbolic* Symbolic_ = nullptr; klu_numeric* Numeric_ = nullptr; /*pieces of M */ @@ -258,6 +261,9 @@ class RefactorizationSolver /// Validate that KLU symbolic and numeric factors are available and dimensionally consistent. bool validate_klu_factorization(const char* caller) const; + /// Validate that the solution pointer is non-null and all solution values are finite. + bool validate_solution(const double* solution, const char* caller) const; + int initializeKLU(); #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index bb4cc01..e7beb2d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -71,3 +71,18 @@ target_link_libraries(test_pcg PRIVATE HiOp::HiOp) add_executable(test_bicgstab ${testBiCGStab_SRC}) target_link_libraries(test_bicgstab PRIVATE HiOp::HiOp) + +if(HIOP_USE_EVLOSER AND NOT HIOP_USE_GPU) + add_executable( + testEVLOSERKLUCPU + LinAlg/testEVLOSERKLUCPU.cpp + ) + + target_link_libraries( + testEVLOSERKLUCPU + PRIVATE + EVLOSER + KLU + ) + +endif() diff --git a/tests/LinAlg/testEVLOSERKLUCPU.cpp b/tests/LinAlg/testEVLOSERKLUCPU.cpp new file mode 100644 index 0000000..5dd5fe7 --- /dev/null +++ b/tests/LinAlg/testEVLOSERKLUCPU.cpp @@ -0,0 +1,259 @@ +#include "MatrixCsr.hpp" +#include "RefactorizationSolver.hpp" + +#include +#include +#include +#include +#include +#include + +namespace +{ + +constexpr double absolute_tolerance = 1e-10; +constexpr double relative_tolerance = 1e-8; + +bool nearly_equal(double actual, double expected) +{ + return std::abs(actual - expected) <= + absolute_tolerance + + relative_tolerance * std::max(std::abs(actual), std::abs(expected)); +} + +bool vectors_equal(const std::vector& actual, + const std::vector& expected) +{ + if(actual.size() != expected.size()) { + return false; + } + + for(std::size_t i = 0; i < actual.size(); ++i) { + if(!nearly_equal(actual[i], expected[i])) { + return false; + } + } + + return true; +} + +bool load_matrix(EVLOSER::RefactorizationSolver& solver, + int n, + const std::vector& rowptr, + const std::vector& colind, + const std::vector& values) +{ + if(static_cast(rowptr.size()) != n + 1 || + colind.size() != values.size()) { + return false; + } + + EVLOSER::MatrixCsr* matrix = solver.mat_A_csr(); + matrix->allocate_size(n); + matrix->allocate_nnz(static_cast(values.size())); + + std::copy(rowptr.begin(), rowptr.end(), matrix->host_irows()); + std::copy(colind.begin(), colind.end(), matrix->host_jcols()); + std::copy(values.begin(), values.end(), matrix->host_vals()); + + solver.set_nnz(static_cast(values.size())); + solver.ordering() = 1; + solver.fact() = "klu"; + solver.refact() = "klu"; + solver.use_ir() = "no"; + solver.set_silent_output(false); + + return true; +} + +bool factorize(EVLOSER::RefactorizationSolver& solver) +{ + return solver.setup_factorization() == 0 && + solver.factorize() == 0; +} + +bool test_analyze_factor_solve() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 5, 7}; + const std::vector colind{0, 1, 0, 1, 2, 1, 2}; + const std::vector values{4.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0}; + + if(!load_matrix(solver, 3, rowptr, colind, values) || + !factorize(solver)) { + return false; + } + + std::vector rhs{6.0, 10.0, 8.0}; + + return solver.triangular_solve(rhs.data(), 0.0) && + vectors_equal(rhs, {1.0, 2.0, 3.0}); +} + +bool test_repeated_solve() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 5, 7}; + const std::vector colind{0, 1, 0, 1, 2, 1, 2}; + const std::vector values{4.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0}; + + if(!load_matrix(solver, 3, rowptr, colind, values) || + !factorize(solver)) { + return false; + } + + std::vector first_rhs{6.0, 10.0, 8.0}; + if(!solver.triangular_solve(first_rhs.data(), 0.0) || + !vectors_equal(first_rhs, {1.0, 2.0, 3.0})) { + return false; + } + + std::vector second_rhs{-3.5, 2.5, 4.5}; + + return solver.triangular_solve(second_rhs.data(), 0.0) && + vectors_equal(second_rhs, {-1.0, 0.5, 2.0}); +} + +bool test_value_only_refactor() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 5, 7}; + const std::vector colind{0, 1, 0, 1, 2, 1, 2}; + const std::vector initial_values{4.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0}; + const std::vector updated_values{5.0, 1.0, 1.0, 4.0, 1.0, 1.0, 3.0}; + + if(!load_matrix(solver, 3, rowptr, colind, initial_values) || + !factorize(solver)) { + return false; + } + + std::copy(updated_values.begin(), + updated_values.end(), + solver.mat_A_csr()->host_vals()); + + solver.setup_refactorization(); + + if(solver.refactorize() != 0) { + return false; + } + + std::vector rhs{7.0, 12.0, 11.0}; + + return solver.triangular_solve(rhs.data(), 0.0) && + vectors_equal(rhs, {1.0, 2.0, 3.0}); +} + +bool test_singular_factorization() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 1, 2, 3}; + const std::vector colind{0, 1, 2}; + const std::vector values{1.0, 0.0, 1.0}; + + return load_matrix(solver, 3, rowptr, colind, values) && + solver.setup_factorization() == 0 && + solver.factorize() != 0; +} + +bool test_invalid_csr_structure() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 1, 3}; + const std::vector colind{0, 1, 2}; + const std::vector values{1.0, 1.0, 1.0}; + + return load_matrix(solver, 3, rowptr, colind, values) && + solver.setup_factorization() != 0; +} + +bool test_null_rhs() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 1, 2, 3}; + const std::vector colind{0, 1, 2}; + const std::vector values{2.0, 3.0, 4.0}; + + return load_matrix(solver, 3, rowptr, colind, values) && + factorize(solver) && + !solver.triangular_solve(nullptr, 0.0); +} + +bool test_nonfinite_matrix_values() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 1, 2, 3}; + const std::vector colind{0, 1, 2}; + const std::vector values{ + 1.0, + std::numeric_limits::quiet_NaN(), + 1.0}; + + return load_matrix(solver, 3, rowptr, colind, values) && + solver.setup_factorization() != 0; +} + +bool test_nonfinite_solution() +{ + EVLOSER::RefactorizationSolver solver(1, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 1}; + const std::vector colind{0}; + const std::vector values{1e-200}; + + if(!load_matrix(solver, 1, rowptr, colind, values) || + !factorize(solver)) { + return false; + } + + std::vector rhs{1e200}; + + return !solver.triangular_solve(rhs.data(), 0.0); +} + +} // namespace + +int main() +{ + struct TestCase + { + const char* name; + bool (*run)(); + }; + + const TestCase tests[] = { + {"analyze/factor/solve", test_analyze_factor_solve}, + {"repeated solve", test_repeated_solve}, + {"value-only refactor", test_value_only_refactor}, + {"singular factorization", test_singular_factorization}, + {"invalid CSR structure", test_invalid_csr_structure}, + {"null RHS", test_null_rhs}, + {"non-finite matrix values", test_nonfinite_matrix_values}, + {"non-finite solution", test_nonfinite_solution}, + }; + + int failures = 0; + + for(const TestCase& test : tests) { + const bool passed = test.run(); + std::cout << (passed ? "[PASS] " : "[FAIL] ") << test.name << "\n"; + + if(!passed) { + ++failures; + } + } + + if(failures != 0) { + std::cout << failures << " EVLOSER KLU CPU test(s) failed.\n"; + return 1; + } + + std::cout << "All EVLOSER KLU CPU tests passed.\n"; + return 0; +} From 18a721315698d4d7c7dd28896ff12dda658e4529 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 24 Jun 2026 14:38:08 -0700 Subject: [PATCH 23/28] Add KLU refactorization recovery --- src/LinAlg/EVLOSER/RefactorizationSolver.cpp | 512 ++++++++++++++++--- src/LinAlg/EVLOSER/RefactorizationSolver.hpp | 62 +++ tests/LinAlg/testEVLOSERKLUCPU.cpp | 168 ++++++ 3 files changed, 674 insertions(+), 68 deletions(-) diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp index d3ec77b..756fd2d 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.cpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.cpp @@ -62,12 +62,15 @@ #endif #include "klu.h" +#include #include #include +#include +#include #include #include #include -#include + #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) @@ -507,6 +510,384 @@ bool RefactorizationSolver::validate_solution(const double* solution, const char return true; } +double RefactorizationSolver::compute_klu_residual(const double* rhs, + const double* solution) const +{ + if(rhs == nullptr || solution == nullptr || mat_A_csr_ == nullptr) { + return std::numeric_limits::infinity(); + } + + const int* rowptr = mat_A_csr_->host_irows(); + const int* colind = mat_A_csr_->host_jcols(); + const double* values = mat_A_csr_->host_vals(); + + long double residual_inf = 0.0L; + long double matrix_inf = 0.0L; + long double solution_inf = 0.0L; + long double rhs_inf = 0.0L; + + for(int i = 0; i < n_; ++i) { + solution_inf = + std::max(solution_inf, std::abs(static_cast(solution[i]))); + rhs_inf = + std::max(rhs_inf, std::abs(static_cast(rhs[i]))); + } + + for(int row = 0; row < n_; ++row) { + long double row_sum = 0.0L; + long double matrix_vector_product = 0.0L; + + for(int k = rowptr[row]; k < rowptr[row + 1]; ++k) { + const long double value = static_cast(values[k]); + + row_sum += std::abs(value); + matrix_vector_product += + value * static_cast(solution[colind[k]]); + } + + matrix_inf = std::max(matrix_inf, row_sum); + residual_inf = + std::max(residual_inf, + std::abs(matrix_vector_product - + static_cast(rhs[row]))); + } + + const long double scale = + std::max(1.0L, matrix_inf * solution_inf + rhs_inf); + + const long double normalized_residual = residual_inf / scale; + + if(!std::isfinite(normalized_residual)) { + return std::numeric_limits::infinity(); + } + + return static_cast(normalized_residual); +} + +klu_numeric* RefactorizationSolver::factor_klu_numeric(const char* caller) +{ + if(!validate_system_matrix(caller)) { + return nullptr; + } + + if(Symbolic_ == nullptr || Symbolic_->n != n_) { + if(!silent_output_) { + std::cout << "[EVLOSER] " << caller + << " requires valid KLU symbolic analysis.\n"; + } + return nullptr; + } + + klu_common trial_common = Common_; + trial_common.status = KLU_OK; + + klu_numeric* numeric = + klu_factor(mat_A_csr_->host_irows(), + mat_A_csr_->host_jcols(), + mat_A_csr_->host_vals(), + Symbolic_, + &trial_common); + + const int status = trial_common.status; + + if(numeric == nullptr || status != KLU_OK) { + if(!silent_output_) { + std::cout << "[EVLOSER] " << caller + << " failed with KLU status " << status << "\n"; + } + + if(numeric != nullptr) { + klu_free_numeric(&numeric, &trial_common); + } + + return nullptr; + } + return numeric; +} + +bool RefactorizationSolver::solve_klu_candidate( + klu_numeric* numeric, + const double* rhs, + std::vector& solution, + double& residual, + const char* caller) +{ + residual = std::numeric_limits::infinity(); + + if(numeric == nullptr || rhs == nullptr) { + return false; + } + + if(!validate_solution(rhs, caller)) { + return false; + } + + solution.assign(rhs, rhs + n_); + + klu_common trial_common = Common_; + trial_common.status = KLU_OK; + + const int ok = + klu_solve(Symbolic_, + numeric, + n_, + 1, + solution.data(), + &Common_); + + const int status = Common_.status; + + if(ok == 0 || status != KLU_OK) { + if(!silent_output_) { + std::cout << "[EVLOSER] " << caller + << " failed with KLU status " << status << "\n"; + } + return false; + } + + if(!validate_solution(solution.data(), caller)) { + return false; + } + + residual = compute_klu_residual(rhs, solution.data()); + + if(!std::isfinite(residual)) { + if(!silent_output_) { + std::cout << "[EVLOSER] " << caller + << " produced a non-finite residual.\n"; + } + return false; + } + + return true; +} + +bool RefactorizationSolver::solve_cpu_with_recovery(double* dx) +{ + last_klu_recovery_action_ = KluRecoveryAction::None; + + if(dx == nullptr) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU solve received a null right-hand side.\n"; + } + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + if(!validate_system_matrix("KLU solve") || + !validate_solution(dx, "KLU right-hand side")) { + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + if(Symbolic_ == nullptr || Symbolic_->n != n_) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU solve requires valid symbolic analysis.\n"; + } + + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + /* + * A failed klu_refactor() may leave Common_.status nonzero. When recovery + * is pending, allow the fresh-factorization path to run instead of + * rejecting the solve based on that previous status. + */ + if(!klu_refactor_pending_validation_ && + !validate_klu_factorization("KLU solve")) { + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + const std::vector rhs(dx, dx + n_); + + std::vector refactor_solution; + double residual_refactor = + std::numeric_limits::infinity(); + + bool refactor_solution_usable = false; + + /* + * With no pending value-only refactorization, Numeric_ already represents + * a fresh factorization. Solve normally and apply only the loose safety + * limit. + */ + if(!klu_refactor_pending_validation_) { + if(!solve_klu_candidate(Numeric_, + rhs.data(), + refactor_solution, + residual_refactor, + "KLU solve")) { + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + if(residual_refactor > klu_residual_safety_limit_) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU residual " + << residual_refactor + << " exceeds safety limit " + << klu_residual_safety_limit_ << "\n"; + } + + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + std::copy(refactor_solution.begin(), + refactor_solution.end(), + dx); + + return true; + } + + if(klu_refactor_succeeded_) { + refactor_solution_usable = + solve_klu_candidate(Numeric_, + rhs.data(), + refactor_solution, + residual_refactor, + "KLU refactorized solve"); + } + + const bool fresh_factorization_required = + !refactor_solution_usable || + residual_refactor > klu_suspicious_residual_threshold_; + + if(!fresh_factorization_required) { + if(residual_refactor > klu_residual_safety_limit_) { + if(!silent_output_) { + std::cout << "[EVLOSER] KLU refactorized residual " + << residual_refactor + << " exceeds safety limit " + << klu_residual_safety_limit_ << "\n"; + } + + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::Failed; + return false; + } + + std::copy(refactor_solution.begin(), + refactor_solution.end(), + dx); + + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = + KluRecoveryAction::RefactorAccepted; + + return true; + } + + if(!silent_output_ && refactor_solution_usable) { + std::cout << "[EVLOSER] KLU refactorized residual " + << residual_refactor + << " exceeds suspicious-result threshold " + << klu_suspicious_residual_threshold_ + << "; trying fresh factorization.\n"; + } + + klu_numeric* full_numeric = + factor_klu_numeric("KLU recovery factorization"); + + std::vector full_solution; + double residual_full = + std::numeric_limits::infinity(); + + const bool full_solution_usable = + full_numeric != nullptr && + solve_klu_candidate(full_numeric, + rhs.data(), + full_solution, + residual_full, + "KLU recovery solve"); + + const bool refactor_candidate_safe = + refactor_solution_usable && + residual_refactor <= klu_residual_safety_limit_; + + const bool full_candidate_safe = + full_solution_usable && + residual_full <= klu_residual_safety_limit_; + + if(!refactor_candidate_safe && !full_candidate_safe) { + if(full_numeric != nullptr) { + klu_free_numeric(&full_numeric, &Common_); + } + + if(!silent_output_) { + std::cout << "[EVLOSER] KLU recovery produced no candidate " + "within the residual safety limit.\n"; + } + + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::Failed; + + return false; + } + + const bool full_factor_materially_better = + full_candidate_safe && + refactor_candidate_safe && + residual_full < + klu_improvement_ratio_ * residual_refactor && + residual_refactor - residual_full > + klu_minimum_improvement_; + + const bool keep_full_factors = + full_candidate_safe && + (!refactor_candidate_safe || + full_factor_materially_better); + + if(keep_full_factors) { + if(Numeric_ != nullptr) { + klu_free_numeric(&Numeric_, &Common_); + } + + Numeric_ = full_numeric; + full_numeric = nullptr; + + std::copy(full_solution.begin(), + full_solution.end(), + dx); + + last_klu_recovery_action_ = + KluRecoveryAction::FullFactorAccepted; + } else { + /* + * Retain the refactored factors unless the fresh factorization is + * materially better. For the current solve, return whichever finite + * candidate has the smaller residual. + */ + if(full_candidate_safe && + residual_full < residual_refactor) { + std::copy(full_solution.begin(), + full_solution.end(), + dx); + } else { + std::copy(refactor_solution.begin(), + refactor_solution.end(), + dx); + } + + if(full_numeric != nullptr) { + klu_free_numeric(&full_numeric, &Common_); + } + + last_klu_recovery_action_ = + KluRecoveryAction::RefactorRetained; + } + + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + + return true; +} + #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) bool RefactorizationSolver::checkEvloserRfStatus(evloserRfStatus_t status, const char* caller) const @@ -565,6 +946,10 @@ int RefactorizationSolver::setup_factorization() return -1; } + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::None; + // A new matrix structure invalidates both existing KLU states. if(Numeric_ != nullptr) { klu_free_numeric(&Numeric_, &Common_); @@ -596,38 +981,27 @@ int RefactorizationSolver::setup_factorization() int RefactorizationSolver::factorize() { - if(!validate_system_matrix("KLU factorization")) { - return -1; - } + klu_numeric* fresh_numeric = + factor_klu_numeric("KLU factorization"); - if(Symbolic_ == nullptr || Symbolic_->n != n_) { - if(!silent_output_) { - std::cout << "[EVLOSER] KLU factorization requires valid symbolic analysis.\n"; - } + if(fresh_numeric == nullptr) { return -1; } - // A fresh factorization replaces only the numeric state. if(Numeric_ != nullptr) { klu_free_numeric(&Numeric_, &Common_); } - Numeric_ = klu_factor(mat_A_csr_->host_irows(), - mat_A_csr_->host_jcols(), - mat_A_csr_->host_vals(), - Symbolic_, - &Common_); - - if(Numeric_ == nullptr || Common_.status != KLU_OK) { - if(!silent_output_) { - std::cout << "[EVLOSER] KLU numeric factorization failed with status " - << Common_.status << "\n"; - } - return -1; - } + Numeric_ = fresh_numeric; + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::None; is_first_solve_ = true; - return validate_klu_factorization("KLU factorization") ? 0 : -1; + + return validate_klu_factorization("KLU factorization") + ? 0 + : -1; } void RefactorizationSolver::setup_refactorization() @@ -676,29 +1050,48 @@ void RefactorizationSolver::setup_refactorization() int RefactorizationSolver::refactorize() { if(!validate_system_matrix("refactorization")) { + if(execution_mode_ == ExecutionMode::CPU) { + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::Failed; + } return -1; } if(execution_mode_ == ExecutionMode::CPU) { if(!validate_klu_factorization("KLU refactorization")) { + klu_refactor_pending_validation_ = false; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::Failed; return -1; } - const int ok = klu_refactor(mat_A_csr_->host_irows(), - mat_A_csr_->host_jcols(), - mat_A_csr_->host_vals(), - Symbolic_, - Numeric_, - &Common_); - - if(ok == 0 || Common_.status != KLU_OK) { - if(!silent_output_) { - std::cout << "[EVLOSER] KLU refactorization failed with status " - << Common_.status << "\n"; - } - return -1; + klu_refactor_pending_validation_ = true; + klu_refactor_succeeded_ = false; + last_klu_recovery_action_ = KluRecoveryAction::None; + + const int ok = + klu_refactor(mat_A_csr_->host_irows(), + mat_A_csr_->host_jcols(), + mat_A_csr_->host_vals(), + Symbolic_, + Numeric_, + &Common_); + + klu_refactor_succeeded_ = + ok != 0 && Common_.status == KLU_OK; + + if(!klu_refactor_succeeded_ && !silent_output_) { + std::cout + << "[EVLOSER] KLU refactorization failed with status " + << Common_.status + << "; fresh factorization will be attempted during solve.\n"; } + /* + * A failed KLU refactorization is recoverable. Continue to the solve, + * where a fresh numeric factorization will be attempted. + */ return 0; } @@ -707,7 +1100,8 @@ int RefactorizationSolver::refactorize() if(refact_ == "glu") { if(execution_mode_ != ExecutionMode::CUDA) { if(!silent_output_) { - std::cout << "[EVLOSER] GLU refactorization requires CUDA execution mode.\n"; + std::cout + << "[EVLOSER] GLU refactorization requires CUDA execution mode.\n"; } return -1; } @@ -721,22 +1115,28 @@ int RefactorizationSolver::refactorize() mat_A_csr_->device_irows(), mat_A_csr_->device_jcols(), info_M_); - sp_status_ = cusolverSpDgluFactor(handle_cusolver_, info_M_, d_work_); + + sp_status_ = + cusolverSpDgluFactor(handle_cusolver_, info_M_, d_work_); } else { if(refact_ == "rf") { if(resetEvloserRfValues("GPU RF reset values") != 0) { return -1; } + if(refactorizeEvloserRf("GPU RF refactorization") != 0) { return -1; } } } + return 0; #endif if(!silent_output_) { - std::cout << "[EVLOSER] Selected refactorization backend is unavailable in this build.\n"; + std::cout + << "[EVLOSER] Selected refactorization backend is unavailable " + "in this build.\n"; } return -1; @@ -744,6 +1144,11 @@ int RefactorizationSolver::refactorize() bool RefactorizationSolver::triangular_solve(double* dx, double tol) { + if(execution_mode_ == ExecutionMode::CPU) { + (void)tol; + return solve_cpu_with_recovery(dx); + } + if(dx == nullptr) { if(!silent_output_) { std::cout << "[EVLOSER] Solve received a null right-hand side.\n"; @@ -751,35 +1156,6 @@ bool RefactorizationSolver::triangular_solve(double* dx, double tol) return false; } - if(execution_mode_ == ExecutionMode::CPU) { - (void)tol; - - if(!validate_klu_factorization("KLU solve")) { - return false; - } - - if(!validate_solution(dx, "KLU right-hand side")) { - return false; - } - - const int ok = klu_solve(Symbolic_, - Numeric_, - n_, - 1, - dx, - &Common_); - - if(ok == 0 || Common_.status != KLU_OK) { - if(!silent_output_) { - std::cout << "[EVLOSER] KLU solve failed with status " - << Common_.status << "\n"; - } - return false; - } - - return validate_solution(dx, "KLU solve"); - } - #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ defined(HIOP_USE_HIP) || defined(HAVE_HIP) if(refact_ == "glu") { diff --git a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp index a8ec2de..0e3f029 100644 --- a/src/LinAlg/EVLOSER/RefactorizationSolver.hpp +++ b/src/LinAlg/EVLOSER/RefactorizationSolver.hpp @@ -60,6 +60,7 @@ #include "evloser_execution_mode.hpp" #include "evloser_gpu_defs.hpp" #include +#include namespace EVLOSER { @@ -74,6 +75,14 @@ class IterativeRefinement; class RefactorizationSolver { public: + enum class KluRecoveryAction + { + None, + RefactorAccepted, + FullFactorAccepted, + RefactorRetained, + Failed + }; // constructor // RefactorizationSolver(); RefactorizationSolver(int n, ExecutionMode execution_mode); @@ -129,6 +138,31 @@ class RefactorizationSolver void set_silent_output(bool silent_output) { silent_output_ = silent_output; } + double& klu_suspicious_residual_threshold() + { + return klu_suspicious_residual_threshold_; + } + + double& klu_residual_safety_limit() + { + return klu_residual_safety_limit_; + } + + double& klu_improvement_ratio() + { + return klu_improvement_ratio_; + } + + double& klu_minimum_improvement() + { + return klu_minimum_improvement_; + } + + KluRecoveryAction last_klu_recovery_action() const + { + return last_klu_recovery_action_; + } + /** * @brief Set up factorization of the first linear system. * @@ -219,6 +253,17 @@ class RefactorizationSolver klu_common Common_{}; klu_symbolic* Symbolic_ = nullptr; klu_numeric* Numeric_ = nullptr; + + bool klu_refactor_pending_validation_{false}; + bool klu_refactor_succeeded_{false}; + + double klu_suspicious_residual_threshold_{1e-4}; + double klu_residual_safety_limit_{1e-1}; + double klu_improvement_ratio_{0.1}; + double klu_minimum_improvement_{1e-10}; + + KluRecoveryAction last_klu_recovery_action_{ + KluRecoveryAction::None}; /*pieces of M */ int* mia_ = nullptr; int* mja_ = nullptr; @@ -264,6 +309,23 @@ class RefactorizationSolver /// Validate that the solution pointer is non-null and all solution values are finite. bool validate_solution(const double* solution, const char* caller) const; + /// Compute the normalized infinity-norm residual for the current CSR matrix. + double compute_klu_residual(const double* rhs, + const double* solution) const; + + /// Create fresh KLU numeric factors without replacing the currently retained factors. + klu_numeric* factor_klu_numeric(const char* caller); + + /// Solve one candidate system and compute its normalized residual. + bool solve_klu_candidate(klu_numeric* numeric, + const double* rhs, + std::vector& solution, + double& residual, + const char* caller); + + /// Complete a CPU solve, including refactorization recovery when required. + bool solve_cpu_with_recovery(double* dx); + int initializeKLU(); #if defined(HIOP_USE_CUDA) || defined(HAVE_CUDA) || \ diff --git a/tests/LinAlg/testEVLOSERKLUCPU.cpp b/tests/LinAlg/testEVLOSERKLUCPU.cpp index 5dd5fe7..fa38fdb 100644 --- a/tests/LinAlg/testEVLOSERKLUCPU.cpp +++ b/tests/LinAlg/testEVLOSERKLUCPU.cpp @@ -217,6 +217,167 @@ bool test_nonfinite_solution() return !solver.triangular_solve(rhs.data(), 0.0); } + +bool test_refactor_accepted() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 5, 7}; + const std::vector colind{0, 1, 0, 1, 2, 1, 2}; + const std::vector initial_values{ + 4.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0}; + const std::vector updated_values{ + 5.0, 1.0, 1.0, 4.0, 1.0, 1.0, 3.0}; + + if(!load_matrix(solver, 3, rowptr, colind, initial_values) || + !factorize(solver)) { + return false; + } + + std::copy(updated_values.begin(), + updated_values.end(), + solver.mat_A_csr()->host_vals()); + + solver.setup_refactorization(); + + if(solver.refactorize() != 0) { + return false; + } + + std::vector rhs{7.0, 12.0, 11.0}; + + return solver.triangular_solve(rhs.data(), 0.0) && + vectors_equal(rhs, {1.0, 2.0, 3.0}) && + solver.last_klu_recovery_action() == + EVLOSER::RefactorizationSolver:: + KluRecoveryAction::RefactorAccepted; +} + +bool test_refactor_retained_after_comparison() +{ + EVLOSER::RefactorizationSolver solver(3, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 5, 7}; + const std::vector colind{0, 1, 0, 1, 2, 1, 2}; + const std::vector initial_values{ + 4.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0}; + const std::vector updated_values{ + 5.0, 1.0, 1.0, 4.0, 1.0, 1.0, 3.0}; + + if(!load_matrix(solver, 3, rowptr, colind, initial_values) || + !factorize(solver)) { + return false; + } + + std::copy(updated_values.begin(), + updated_values.end(), + solver.mat_A_csr()->host_vals()); + + solver.setup_refactorization(); + + /* + * Force comparison with a fresh numeric factorization. Since both + * candidates should have comparable residuals, retain the refactored + * factors. + */ + solver.klu_suspicious_residual_threshold() = -1.0; + + if(solver.refactorize() != 0) { + return false; + } + + std::vector rhs{7.0, 12.0, 11.0}; + + return solver.triangular_solve(rhs.data(), 0.0) && + vectors_equal(rhs, {1.0, 2.0, 3.0}) && + solver.last_klu_recovery_action() == + EVLOSER::RefactorizationSolver:: + KluRecoveryAction::RefactorRetained; +} + +bool test_failed_refactor_recovered_by_full_factorization() +{ + EVLOSER::RefactorizationSolver solver(2, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 4}; + const std::vector colind{0, 1, 0, 1}; + + /* + * The updated matrix retains the sparsity pattern but invalidates the + * previous numerical pivot. A fresh factorization can choose a new pivot. + */ + const std::vector initial_values{ + 10.0, 1.0, + 1.0, 1.0}; + + const std::vector updated_values{ + 0.0, 1.0, + 1.0, 10.0}; + + if(!load_matrix(solver, 2, rowptr, colind, initial_values) || + !factorize(solver)) { + return false; + } + + std::copy(updated_values.begin(), + updated_values.end(), + solver.mat_A_csr()->host_vals()); + + solver.setup_refactorization(); + + /* + * Refactorization failure is recoverable, so refactorize() permits the + * subsequent solve to attempt a fresh numeric factorization. + */ + if(solver.refactorize() != 0) { + return false; + } + + std::vector rhs{-1.0, -8.0}; + + return solver.triangular_solve(rhs.data(), 0.0) && + vectors_equal(rhs, {2.0, -1.0}) && + solver.last_klu_recovery_action() == + EVLOSER::RefactorizationSolver:: + KluRecoveryAction::FullFactorAccepted; +} + +bool test_unrecoverable_refactor_failure() +{ + EVLOSER::RefactorizationSolver solver(2, EVLOSER::ExecutionMode::CPU); + + const std::vector rowptr{0, 2, 4}; + const std::vector colind{0, 1, 0, 1}; + const std::vector initial_values{ + 10.0, 1.0, + 1.0, 1.0}; + const std::vector singular_values{ + 1.0, 1.0, + 1.0, 1.0}; + + if(!load_matrix(solver, 2, rowptr, colind, initial_values) || + !factorize(solver)) { + return false; + } + + std::copy(singular_values.begin(), + singular_values.end(), + solver.mat_A_csr()->host_vals()); + + solver.setup_refactorization(); + + if(solver.refactorize() != 0) { + return false; + } + + std::vector rhs{2.0, 2.0}; + + return !solver.triangular_solve(rhs.data(), 0.0) && + solver.last_klu_recovery_action() == + EVLOSER::RefactorizationSolver:: + KluRecoveryAction::Failed; +} + } // namespace int main() @@ -236,6 +397,13 @@ int main() {"null RHS", test_null_rhs}, {"non-finite matrix values", test_nonfinite_matrix_values}, {"non-finite solution", test_nonfinite_solution}, + {"refactor accepted", test_refactor_accepted}, + {"refactor retained after fresh comparison", + test_refactor_retained_after_comparison}, + {"failed refactor recovered by full factorization", + test_failed_refactor_recovered_by_full_factorization}, + {"unrecoverable refactor failure", + test_unrecoverable_refactor_failure}, }; int failures = 0; From 0b993b59188093aa115bbd04f0326f60b03608d2 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 24 Jun 2026 15:43:28 -0700 Subject: [PATCH 24/28] Route HiOp CPU solves through EVLOSER --- src/Optimization/hiopDualsUpdater.cpp | 14 ++++++++++ src/Optimization/hiopKKTLinSysSparse.cpp | 33 ++++++++++++++++++++++++ src/Utils/hiopOptions.cpp | 24 ++++++++++++----- 3 files changed, 64 insertions(+), 7 deletions(-) diff --git a/src/Optimization/hiopDualsUpdater.cpp b/src/Optimization/hiopDualsUpdater.cpp index 7dc1748..a66ec5a 100644 --- a/src/Optimization/hiopDualsUpdater.cpp +++ b/src/Optimization/hiopDualsUpdater.cpp @@ -379,6 +379,20 @@ bool hiopDualsLsqUpdateLinsysAugSparse::instantiate_linear_solver(const char* li // compute mode CPU ///////////////////////////////////////////////////////////////////////////////////////// assert(nullptr == lin_sys_); +#ifdef HIOP_USE_EVLOSER + if(linear_solver == "evloser") { + if(fact_acceptor == "inertia_correction") { + nlp_->log->printf(hovError, + "LSQ linear solver with EVLOSER does not support inertia correction. " + "Please set option 'fact_acceptor' to 'inertia_free'.\n"); + assert(false); + return false; + } + + ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: EVLOSER on CPU "; + lin_sys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + } +#endif // HIOP_USE_EVLOSER if(linear_solver == "ma57" || linear_solver == "auto") { #ifdef HIOP_USE_COINHSL ss_log << "LSQ linear solver --- KKT_SPARSE_XDYcYd linsys: MA57 "; diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp index c44760e..c7cbc80 100644 --- a/src/Optimization/hiopKKTLinSysSparse.cpp +++ b/src/Optimization/hiopKKTLinSysSparse.cpp @@ -314,6 +314,22 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXYcYd::determineAndCreateLi //////////////////////////////////////////////////////////////////////////////////////////////// assert(nullptr == linSys_); +#ifdef HIOP_USE_EVLOSER + if(linear_solver == "evloser") { + linsol_actual = "EVLOSER"; + linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + + auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); + if(fact_acceptor_ic) { + nlp_->log->printf(hovError, + "KKT_SPARSE_XYcYd linsys with EVLOSER does not support inertia correction. " + "Please set option 'fact_acceptor' to 'inertia_free'.\n"); + assert(false); + return nullptr; + } + } +#endif // HIOP_USE_EVLOSER + if(linear_solver == "ma57" || linear_solver == "auto") { #ifdef HIOP_USE_COINHSL linsol_actual = "MA57"; @@ -710,6 +726,23 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL // CPU compute mode ///////////////////////////////////////////////////////////////////////////////////////////// if(linear_solver == "ma57" || linear_solver == "auto") { + +#ifdef HIOP_USE_EVLOSER + if(linear_solver == "evloser") { + actual_lin_solver = "EVLOSER"; + linSys_ = new hiopLinSolverSymSparseEVLOSER(n, nnz, nlp_); + + auto* fact_acceptor_ic = dynamic_cast(fact_acceptor_); + if(fact_acceptor_ic) { + nlp_->log->printf(hovError, + "KKT_SPARSE_XDYcYd linsys with EVLOSER does not support inertia correction. " + "Please set option 'fact_acceptor' to 'inertia_free'.\n"); + assert(false); + return nullptr; + } + } +#endif // HIOP_USE_EVLOSER + #ifdef HIOP_USE_COINHSL linSys_ = new hiopLinSolverSymSparseMA57(n, nnz, nlp_); actual_lin_solver = "MA57"; diff --git a/src/Utils/hiopOptions.cpp b/src/Utils/hiopOptions.cpp index 5432d90..05c2481 100644 --- a/src/Utils/hiopOptions.cpp +++ b/src/Utils/hiopOptions.cpp @@ -928,7 +928,7 @@ void hiopOptionsNLP::register_options() register_str_option("linear_solver_sparse", "auto", range, - "Selects among MA57, PARDISO, STRUMPACK, cuSOLVER's Cholesky or LU, and GINKGO for the " + "Selects among MA57, PARDISO, STRUMPACK, ReSolve, EVLOSER, cuSOLVER's Cholesky or LU, and GINKGO for the " "sparse linear solves."); } @@ -943,7 +943,7 @@ void hiopOptionsNLP::register_options() register_str_option("duals_init_linear_solver_sparse", "auto", range, - "Selects among MA57, PARDISO, cuSOLVER, STRUMPACK, and GINKGO for the sparse linear solves."); + "Selects among MA57, PARDISO, ReSolve, EVLOSER, cuSOLVER, STRUMPACK, and GINKGO for the sparse linear solves."); } // choose hardware backend for the Ginkgo solver to run on. @@ -1404,7 +1404,6 @@ void hiopOptionsNLP::ensure_consistence() auto kkt_linsys = GetString("KKTLinsys"); auto sol_sp = GetString("linear_solver_sparse"); if(kkt_linsys == "full") { - // Full sparse KKT accepts EVLOSER through the same sparse solver selection path as ReSolve. if(sol_sp != "resolve" && sol_sp != "evloser" && sol_sp != "pardiso" && sol_sp != "strumpack" && sol_sp != "auto") { if(is_user_defined("linear_solver_sparse")) { log_printf(hovWarning, @@ -1428,18 +1427,29 @@ void hiopOptionsNLP::ensure_consistence() } } -// EVLOSER can use CUDA or HIP, unlike the CUDA-only ReSolve path below. -#if !defined(HIOP_USE_CUDA) && !defined(HIOP_USE_HIP) + +#ifndef HIOP_USE_EVLOSER if(sol_sp == "evloser") { if(is_user_defined("linear_solver_sparse")) { log_printf(hovWarning, - "The option 'linear_solver_sparse=%s' is not valid without CUDA or HIP support enabled." + "The option 'linear_solver_sparse=%s' is not valid because HiOp was built without EVLOSER support." " Will use 'linear_solver_sparse=auto'.\n", GetString("linear_solver_sparse").c_str()); } set_val("linear_solver_sparse", "auto"); } -#endif // !defined(HIOP_USE_CUDA) && !defined(HIOP_USE_HIP) + + if(GetString("duals_init_linear_solver_sparse") == "evloser") { + if(is_user_defined("duals_init_linear_solver_sparse")) { + log_printf( + hovWarning, + "The option 'duals_init_linear_solver_sparse=%s' is not valid because HiOp was built without EVLOSER support." + " Will use 'duals_init_linear_solver_sparse=auto'.\n", + GetString("duals_init_linear_solver_sparse").c_str()); + } + set_val("duals_init_linear_solver_sparse", "auto"); + } +#endif // HIOP_USE_EVLOSER #ifndef HIOP_USE_CUDA if(sol_sp == "resolve" || sol_sp == "cusolver-chol") { From 6d64eb05a9b60441d037699c3d6635d5bd0d1b17 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Wed, 24 Jun 2026 17:10:00 -0700 Subject: [PATCH 25/28] Add EVLOSER CPU integration tests --- src/Drivers/Sparse/CMakeLists.txt | 27 ++++++++++++++++++----- src/Drivers/Sparse/NlpSparseEx1Driver.cpp | 14 +++++++++++- src/Drivers/Sparse/NlpSparseEx2Driver.cpp | 22 +++++++++++++----- src/Optimization/hiopKKTLinSysSparse.cpp | 3 +-- 4 files changed, 51 insertions(+), 15 deletions(-) diff --git a/src/Drivers/Sparse/CMakeLists.txt b/src/Drivers/Sparse/CMakeLists.txt index 2918063..d9f9b46 100644 --- a/src/Drivers/Sparse/CMakeLists.txt +++ b/src/Drivers/Sparse/CMakeLists.txt @@ -59,9 +59,16 @@ add_test(NAME NlpSparse1_2 COMMAND ${RUNCMD} "$" " if(HIOP_USE_CUDA) add_test(NAME NlpSparse1_3 COMMAND ${RUNCMD} "$" "500" "-cusolver" "-selfcheck") endif(HIOP_USE_CUDA) -# EVLOSER uses the existing SparseEx1 driver and selects the EVLOSER solver option. -if(HIOP_USE_CUDA AND HIOP_USE_EVLOSER) - add_test(NAME NlpSparse1_EVLOSER COMMAND ${RUNCMD} "$" "500" "-evloser" "-selfcheck") +# CPU-only EVLOSER integration test. +if(HIOP_USE_EVLOSER AND NOT HIOP_USE_GPU) + add_test( + NAME NlpSparse1_EVLOSER_CPU + COMMAND ${RUNCMD} + "$" + "500" + "-evloser" + "-selfcheck" + ) endif() if(HIOP_USE_PARDISO) add_test(NAME NlpSparse1_4 COMMAND ${RUNCMD} "$" "500" "-pardiso" "-selfcheck") @@ -80,9 +87,17 @@ add_test(NAME NlpSparse2_2 COMMAND ${RUNCMD} "$" " if(HIOP_USE_CUDA) add_test(NAME NlpSparse2_3 COMMAND ${RUNCMD} "$" "500" "-cusolver" "-inertiafree" "-selfcheck") endif(HIOP_USE_CUDA) -# EVLOSER uses the existing SparseEx2 driver with the required inertia-free path. -if(HIOP_USE_CUDA AND HIOP_USE_EVLOSER) - add_test(NAME NlpSparse2_EVLOSER COMMAND ${RUNCMD} "$" "500" "-evloser" "-inertiafree" "-selfcheck") +# CPU-only EVLOSER integration test with the required inertia-free configuration. +if(HIOP_USE_EVLOSER AND NOT HIOP_USE_GPU) + add_test( + NAME NlpSparse2_EVLOSER_CPU + COMMAND ${RUNCMD} + "$" + "500" + "-evloser" + "-inertiafree" + "-selfcheck" + ) endif() if(HIOP_USE_GINKGO) add_test(NAME NlpSparse2_4 COMMAND ${RUNCMD} "$" "500" "-ginkgo" "-inertiafree" "-selfcheck") diff --git a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp index b7eac02..97d8811 100644 --- a/src/Drivers/Sparse/NlpSparseEx1Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx1Driver.cpp @@ -149,10 +149,18 @@ static bool parse_arguments(int argc, // If HiOp is built without CUDA de-select cuSOLVER. #ifndef HIOP_USE_RESOLVE - if(use_cusolver || use_evloser) { + if(use_cusolver) { printf("HiOp built without support for ReSolve. "); printf("Using default linear solver ...\n"); use_cusolver = false; + } +#endif + +// If EVLOSER is not available, de-select it. +#ifndef HIOP_USE_EVLOSER + if(use_evloser) { + printf("HiOp built without support for EVLOSER. "); + printf("Using default linear solver ...\n"); use_evloser = false; } #endif @@ -252,6 +260,9 @@ int main(int argc, char** argv) } else { nlp.options->SetStringValue("linear_solver_sparse", "resolve"); } + +#if defined(HIOP_USE_CUDA) || defined(HIOP_USE_HIP) + // Device ReSolve and EVLOSER configurations use RF and hybrid execution. nlp.options->SetStringValue("resolve_refactorization", "rf"); nlp.options->SetIntegerValue("ir_inner_maxit", 100); nlp.options->SetNumericValue("ir_inner_tol", 1e-8); @@ -259,6 +270,7 @@ int main(int argc, char** argv) nlp.options->SetIntegerValue("ir_inner_conv_cond", 2); nlp.options->SetStringValue("ir_inner_gs_scheme", "cgs2"); nlp.options->SetStringValue("compute_mode", "hybrid"); +#endif // LU solver needs to use inertia free approach nlp.options->SetStringValue("fact_acceptor", "inertia_free"); nlp.options->SetIntegerValue("ir_outer_maxit", 0); diff --git a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp index e49805a..e6d8cb1 100644 --- a/src/Drivers/Sparse/NlpSparseEx2Driver.cpp +++ b/src/Drivers/Sparse/NlpSparseEx2Driver.cpp @@ -141,18 +141,25 @@ static bool parse_arguments(int argc, // If CUDA is not available, de-select cuSOLVER #ifndef HIOP_USE_CUDA - if(use_cusolver || use_evloser) { + if(use_cusolver) { printf("HiOp built without CUDA support. "); printf("Using default instead of cuSOLVER/EVLOSER ...\n"); use_cusolver = false; + } +#endif + +// If EVLOSER is not available, de-select it. +#ifndef HIOP_USE_EVLOSER + if(use_evloser) { + printf("HiOp built without EVLOSER support. "); + printf("Using default linear solver ...\n"); use_evloser = false; } #endif -// Use cuSOLVER's LU factorization, if it was configured +// Use Resolve's sparse LU path when cuSOLVER was selected. #ifdef HIOP_USE_RESOLVE - // EVLOSER uses the existing ReSolve-enabled sparse solver setup in this driver. - if(use_cusolver || use_evloser) { + if(use_cusolver) { use_resolve = true; } #endif @@ -258,21 +265,24 @@ int main(int argc, char** argv) if(inertia_free) { nlp.options->SetStringValue("fact_acceptor", "inertia_free"); } - if(use_resolve) { + if(use_resolve || use_evloser) { nlp.options->SetStringValue("duals_init", "zero"); nlp.options->SetStringValue("linsol_mode", "speculative"); - // EVLOSER keeps the ReSolve RF settings below but selects the EVLOSER solver name. + if(use_evloser) { nlp.options->SetStringValue("linear_solver_sparse", "evloser"); } else { nlp.options->SetStringValue("linear_solver_sparse", "resolve"); } +#if defined(HIOP_USE_CUDA) || defined(HIOP_USE_HIP) + // Device ReSolve and EVLOSER configurations use RF and hybrid execution. nlp.options->SetStringValue("resolve_refactorization", "rf"); nlp.options->SetStringValue("compute_mode", "hybrid"); nlp.options->SetIntegerValue("ir_outer_maxit", 0); nlp.options->SetIntegerValue("ir_inner_conv_cond", 2); nlp.options->SetStringValue("ir_inner_gs_scheme", "cgs2"); nlp.options->SetNumericValue("ir_inner_tol", 1e-8); +#endif } if(use_ginkgo) { nlp.options->SetStringValue("linsol_mode", "speculative"); diff --git a/src/Optimization/hiopKKTLinSysSparse.cpp b/src/Optimization/hiopKKTLinSysSparse.cpp index c7cbc80..332887f 100644 --- a/src/Optimization/hiopKKTLinSysSparse.cpp +++ b/src/Optimization/hiopKKTLinSysSparse.cpp @@ -725,7 +725,6 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL ///////////////////////////////////////////////////////////////////////////////////////////// // CPU compute mode ///////////////////////////////////////////////////////////////////////////////////////////// - if(linear_solver == "ma57" || linear_solver == "auto") { #ifdef HIOP_USE_EVLOSER if(linear_solver == "evloser") { @@ -742,7 +741,7 @@ hiopLinSolverSymSparse* hiopKKTLinSysCompressedSparseXDYcYd::determineAndCreateL } } #endif // HIOP_USE_EVLOSER - +if(linear_solver == "ma57" || linear_solver == "auto") { #ifdef HIOP_USE_COINHSL linSys_ = new hiopLinSolverSymSparseMA57(n, nnz, nlp_); actual_lin_solver = "MA57"; From 240786a53ca57789e90898ba0ddd8064184c487f Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Thu, 25 Jun 2026 14:26:49 -0700 Subject: [PATCH 26/28] Isolate EVLOSER from embedded ReSolve --- .github/workflows/spack_build.yml | 2 +- CMakeLists.txt | 7 ++----- src/LinAlg/EVLOSER/CMakeLists.txt | 2 +- src/LinAlg/hiopLinSolverSparseEVLOSER.hpp | 2 +- src/LinAlg/hiopLinSolverSparseReSolve.cpp | 12 ++++-------- 5 files changed, 9 insertions(+), 16 deletions(-) diff --git a/.github/workflows/spack_build.yml b/.github/workflows/spack_build.yml index 30efb8a..c2554ff 100644 --- a/.github/workflows/spack_build.yml +++ b/.github/workflows/spack_build.yml @@ -163,7 +163,7 @@ jobs: run: spack -e . mirror set --oci-username ${{ env.USERNAME }} --oci-password "${{ secrets.GITHUB_TOKEN }}" local-buildcache - name: Trust keys - run: printf "y\n" | spack -e . buildcache keys --install --trust --force + run: spack -e . buildcache keys --install --trust - name: Find external packages run: spack -e . external find --all --exclude python --exclude curl --exclude openssl diff --git a/CMakeLists.txt b/CMakeLists.txt index e71b373..3b54cd3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,7 +98,7 @@ cmake_dependent_option( ) cmake_dependent_option( - HIOP_USE_RESOLVE "Build with ReSolve sparse solver support" ON "HIOP_USE_GPU" OFF + HIOP_USE_RESOLVE "Build with ReSolve sparse solver support" ON "HIOP_USE_CUDA" OFF ) option( @@ -293,9 +293,6 @@ endif(HIOP_USE_GPU) if(HIOP_USE_RAJA) # Look for CMake configuration file in RAJA installation - # The RAJA driver path needs camp available with the RAJA/Umpire target set. - find_package(camp CONFIG REQUIRED) - find_package(RAJA CONFIG PATHS ${RAJA_DIR} ${RAJA_DIR}/share/raja/cmake REQUIRED) @@ -303,7 +300,7 @@ if(HIOP_USE_RAJA) find_package(umpire CONFIG PATHS ${umpire_DIR} ${umpire_DIR}/share/umpire/cmake REQUIRED) - target_link_libraries(hiop_tpl INTERFACE umpire RAJA camp) + target_link_libraries(hiop_tpl INTERFACE umpire RAJA) message(STATUS "Found RAJA pkg-config: ${RAJA_CONFIG}") message(STATUS "Found umpire pkg-config: ${umpire_CONFIG}") endif() diff --git a/src/LinAlg/EVLOSER/CMakeLists.txt b/src/LinAlg/EVLOSER/CMakeLists.txt index c26a756..305dc05 100644 --- a/src/LinAlg/EVLOSER/CMakeLists.txt +++ b/src/LinAlg/EVLOSER/CMakeLists.txt @@ -22,7 +22,7 @@ if(HIOP_USE_HIP) target_compile_definitions(EVLOSER PRIVATE HIOP_USE_HIP) endif() -target_include_directories(EVLOSER INTERFACE +target_include_directories(EVLOSER PRIVATE $ $ ) diff --git a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp index c31300a..849acae 100644 --- a/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp +++ b/src/LinAlg/hiopLinSolverSparseEVLOSER.hpp @@ -59,7 +59,7 @@ #include "hiopLinSolver.hpp" #include "hiopMatrixSparseTriplet.hpp" -#include "evloser_execution_mode.hpp" +#include "EVLOSER/evloser_execution_mode.hpp" #include /** Implements the sparse linear solver class using the EVLOSER interface diff --git a/src/LinAlg/hiopLinSolverSparseReSolve.cpp b/src/LinAlg/hiopLinSolverSparseReSolve.cpp index a41d41f..312c9a0 100644 --- a/src/LinAlg/hiopLinSolverSparseReSolve.cpp +++ b/src/LinAlg/hiopLinSolverSparseReSolve.cpp @@ -54,16 +54,12 @@ */ #include "hiopLinSolverSparseReSolve.hpp" -// Use the ReSolve path here because EVLOSER has headers with the same names. -// If EVLOSER replaces ReSolve, update this backend path instead of doing only -// a find-and-replace. -#include "ReSolve/IterativeRefinement.hpp" -#include "ReSolve/RefactorizationSolver.hpp" -#include "ReSolve/MatrixCsr.hpp" +#include +#include +#include #include "hiop_blasdefs.hpp" -// Use the ReSolve path here because EVLOSER has a kernel header with this name too. -#include "ReSolve/KrylovSolverKernels.h" +#include "KrylovSolverKernels.h" #include "cusparse_v2.h" #include From 3c54f726f605d73fec70401d135d4881b5bb2255 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Thu, 25 Jun 2026 22:17:09 -0400 Subject: [PATCH 27/28] Remove BLAS header include from EVLOSER MatrixCsr --- src/LinAlg/EVLOSER/MatrixCsr.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/LinAlg/EVLOSER/MatrixCsr.cpp b/src/LinAlg/EVLOSER/MatrixCsr.cpp index 77c45f7..3b8f505 100644 --- a/src/LinAlg/EVLOSER/MatrixCsr.cpp +++ b/src/LinAlg/EVLOSER/MatrixCsr.cpp @@ -53,7 +53,6 @@ * */ -#include "hiop_blasdefs.hpp" #include "MatrixCsr.hpp" #include "evloser_gpu_defs.hpp" From fb2965243130fd32ca21dcffd9325cc820b2dea1 Mon Sep 17 00:00:00 2001 From: Tamar DeWilde Date: Fri, 26 Jun 2026 00:07:05 -0700 Subject: [PATCH 28/28] Fix EVLOSER CPU test include path --- tests/CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e7beb2d..0d84551 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -78,6 +78,13 @@ if(HIOP_USE_EVLOSER AND NOT HIOP_USE_GPU) LinAlg/testEVLOSERKLUCPU.cpp ) + target_include_directories( + testEVLOSERKLUCPU + PRIVATE + ${PROJECT_SOURCE_DIR}/src/LinAlg/EVLOSER + ) + + target_link_libraries( testEVLOSERKLUCPU PRIVATE