diff --git a/.github/license-check/c.txt b/.github/license-check/c.txt
index cf1406c942..cdc649ac06 100644
--- a/.github/license-check/c.txt
+++ b/.github/license-check/c.txt
@@ -1,2 +1 @@
-// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 // SPDX-License-Identifier: Apache-2.0
diff --git a/cmake/ExternalGitTags.cmake b/cmake/ExternalGitTags.cmake
index d8c9752ee3..9bc7e0459a 100644
--- a/cmake/ExternalGitTags.cmake
+++ b/cmake/ExternalGitTags.cmake
@@ -125,15 +125,15 @@ set(EXTERN_METIS_GIT_TAG
 
 # MFEM
 set(EXTERN_MFEM_URL
-  "https://github.com/yuyangdai/mfem.git" CACHE STRING
+  "https://github.com/mfem/mfem.git" CACHE STRING
   "URL for external MFEM build"
 )
 set(EXTERN_MFEM_GIT_BRANCH
-  "cudss-dev" CACHE STRING
+  "master" CACHE STRING
   "Git branch for external MFEM build"
 )
 set(EXTERN_MFEM_GIT_TAG
-  "e217864f168acb1d7bdb3fd7c5aa73ef81951237" CACHE STRING
+  "d9d6526cc1749980a2ba1da16e2c1ca1e07d82ec" CACHE STRING
   "Git tag for external MFEM build"
 )
 
diff --git a/cmake/ExternalMFEM.cmake b/cmake/ExternalMFEM.cmake
index 6da084142a..e64614a25d 100644
--- a/cmake/ExternalMFEM.cmake
+++ b/cmake/ExternalMFEM.cmake
@@ -123,7 +123,7 @@ if(PALACE_WITH_CUDA)
     "-DCMAKE_CUDA_FLAGS=${MFEM_CUDA_FLAGS}"
     "-DMFEM_USE_CUDSS=${PALACE_WITH_CUDSS}"
   )
-  if(PALACE_WITH_CUDSS)
+  if(PALACE_WITH_CUDSS AND NOT "${CUDSS_DIR}" STREQUAL "")
     list(APPEND MFEM_OPTIONS
       "-DCUDSS_DIR=${CUDSS_DIR}"
     )
@@ -406,6 +406,8 @@ set(MFEM_PATCH_FILES
   "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_par_tet_mesh_fix_dev.diff"
   "${CMAKE_SOURCE_DIR}/extern/patch/mfem/patch_gmsh_parser_performance.diff"
   "${CMAKE_SOURCE_DIR}/extern/patch/mfem/mfem_pr5280.diff"
+  "${CMAKE_SOURCE_DIR}/extern/patch/mfem/mfem_pr5246.diff"
+  "${CMAKE_SOURCE_DIR}/extern/patch/mfem/mfem_pr5124_cudss.diff"
 )
 
 include(ExternalProject)
diff --git a/extern/patch/mfem/mfem_pr5124_cudss.diff b/extern/patch/mfem/mfem_pr5124_cudss.diff
new file mode 100644
index 0000000000..a162c75154
--- /dev/null
+++ b/extern/patch/mfem/mfem_pr5124_cudss.diff
@@ -0,0 +1,1103 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index c0eaf258cc..dd204aab01 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -433,6 +433,15 @@ if (MFEM_USE_STRUMPACK)
+   endif()
+ endif()
+ 
++# cuDSS can only be enabled in CUDA
++if (MFEM_USE_CUDSS)
++  if (MFEM_USE_CUDA)
++    find_package(CUDSS REQUIRED)
++  else()
++    message(FATAL_ERROR " *** cuDSS requires that CUDA be enabled.")
++  endif()
++endif()
++
+ # GnuTLS
+ if (MFEM_USE_GNUTLS)
+   find_package(_GnuTLS REQUIRED)
+@@ -631,7 +640,7 @@ find_package(Threads REQUIRED)
+ set(MFEM_TPLS OPENMP HYPRE LAPACK BLAS SuperLUDist STRUMPACK METIS SuiteSparse
+     SUNDIALS PETSC SLEPC MUMPS AXOM FMS CONDUIT Ginkgo GNUTLS GSLIB HDF5
+     NETCDF MPFR PUMI HIOP POSIXCLOCKS MFEMBacktrace ZLIB OCCA CEED RAJA UMPIRE
+-    ADIOS2 MKL_CPARDISO MKL_PARDISO AMGX MAGMA CUSPARSE CUBLAS CALIPER CODIPACK
++    ADIOS2 MKL_CPARDISO MKL_PARDISO AMGX MAGMA CUSPARSE CUBLAS CUDSS CALIPER CODIPACK
+     BENCHMARK PARELAG TRIBOL MPI_CXX HIP HIPBLAS HIPSPARSE MOONOLITH BLITZ
+     ALGOIM ENZYME CUDA::cudart)
+ 
+diff --git a/config/cmake/MFEMConfig.cmake.in b/config/cmake/MFEMConfig.cmake.in
+index cbe963041d..afbb981763 100644
+--- a/config/cmake/MFEMConfig.cmake.in
++++ b/config/cmake/MFEMConfig.cmake.in
+@@ -35,6 +35,7 @@ set(MFEM_USE_SUITESPARSE @MFEM_USE_SUITESPARSE@)
+ set(MFEM_USE_SUPERLU @MFEM_USE_SUPERLU@)
+ set(MFEM_USE_MUMPS @MFEM_USE_MUMPS@)
+ set(MFEM_USE_STRUMPACK @MFEM_USE_STRUMPACK@)
++set(MFEM_USE_CUDSS @MFEM_USE_CUDSS@)
+ set(MFEM_USE_GINKGO @MFEM_USE_GINKGO@)
+ set(MFEM_USE_AMGX @MFEM_USE_AMGX@)
+ set(MFEM_USE_MAGMA @MFEM_USE_MAGMA@)
+@@ -109,6 +110,10 @@ if (MFEM_USE_RAJA)
+    find_dependency(RAJA)
+ endif()
+ 
++if (MFEM_USE_CUDSS)
++  find_dependency(cudss)
++endif (MFEM_USE_CUDSS)
++
+ if (NOT TARGET mfem)
+   include(${CMAKE_CURRENT_LIST_DIR}/MFEMTargets.cmake)
+ endif (NOT TARGET mfem)
+diff --git a/config/cmake/config.hpp.in b/config/cmake/config.hpp.in
+index 12556546a8..d570b44e2d 100644
+--- a/config/cmake/config.hpp.in
++++ b/config/cmake/config.hpp.in
+@@ -108,6 +108,15 @@
+ // Enable MFEM functionality based on the STRUMPACK library.
+ #cmakedefine MFEM_USE_STRUMPACK
+ 
++// Enable MFEM functionality based on the cuDSS library.
++#cmakedefine MFEM_USE_CUDSS
++
++// CUDSS communication layer library path
++#cmakedefine MFEM_CUDSS_COMM_LIB "@MFEM_CUDSS_COMM_LIB@"
++
++// CUDSS threading layer library path
++#cmakedefine MFEM_CUDSS_THREADING_LIB "@MFEM_CUDSS_THREADING_LIB@"
++
+ // Enable functionality based on the Ginkgo library.
+ #cmakedefine MFEM_USE_GINKGO
+ 
+diff --git a/config/cmake/modules/FindCUDSS.cmake b/config/cmake/modules/FindCUDSS.cmake
+new file mode 100644
+index 0000000000..689151305d
+--- /dev/null
++++ b/config/cmake/modules/FindCUDSS.cmake
+@@ -0,0 +1,68 @@
++if (NOT cudss_DIR AND CUDSS_DIR)
++  set(cudss_DIR ${CUDSS_DIR}/lib/cmake/cudss)
++endif()
++message(STATUS "Looking for CUDSS ...")
++message(STATUS "   in CUDSS_DIR = ${CUDSS_DIR}")
++message(STATUS "      cudss_DIR = ${cudss_DIR}")
++find_package(cudss)
++set(CUDSS_FOUND ${cudss_FOUND})
++set(CUDSS_LIBRARIES "cudss")
++if (CUDSS_FOUND)
++  message(STATUS
++    "Found CUDSS target: ${CUDSS_LIBRARIES} (version: ${cudss_VERSION})")
++else()
++  set(msg STATUS)
++  if (CUDSS_FIND_REQUIRED)
++    set(msg FATAL_ERROR)
++  endif()
++  message(${msg}
++    "CUDSS not found. Please set CUDSS_DIR to the install prefix.")
++endif()
++
++if(CUDSS_FOUND AND TARGET cudss)
++  get_target_property(CUDSS_LIBRARY_LOCATION cudss IMPORTED_LOCATION)
++  if(NOT CUDSS_LIBRARY_LOCATION)
++    get_target_property(CUDSS_LIBRARY_LOCATION cudss IMPORTED_LOCATION_RELEASE)
++  endif()
++  if(CUDSS_LIBRARY_LOCATION)
++    get_filename_component(CUDSS_LIBRARY_DIR "${CUDSS_LIBRARY_LOCATION}" DIRECTORY)
++  else()
++    message(WARNING "Could not determine the location of the cuDSS library.")
++  endif()
++else()
++    message(WARNING "cuDSS target not available; cannot determine library directory.")
++endif()
++
++# Set the full name of the cuDSS threading library if OpenMP is enabled.
++# The threading layer library (libcudss_mtlayer_gomp.so) is located under the
++# cuDSS library directory by default.
++if (MFEM_USE_OPENMP)
++  find_file(
++    CUDSS_THREADING_LIB
++    NAMES libcudss_mtlayer_gomp.so
++    PATHS ${CUDSS_LIBRARY_DIR}
++    NO_DEFAULT_PATH
++  )
++  if (NOT DEFINED MFEM_CUDSS_THREADING_LIB AND CUDSS_THREADING_LIB)
++    set(MFEM_CUDSS_THREADING_LIB "${CUDSS_THREADING_LIB}")
++  endif()
++  message(STATUS "CUDSS threading layer library: ${MFEM_CUDSS_THREADING_LIB}")
++endif()
++
++# Set the full name of the cuDSS communication library if MFEM use OpenMPI.
++# The communication layer library (libcudss_commlayer_mpi.so) is located under the
++# cuDSS library directory by default.
++# The communication layer library is used pre-built communication layers for OpenMPI
++# by default.
++if (MFEM_USE_MPI)
++  find_file(
++    CUDSS_COMM_LIB
++    NAMES libcudss_commlayer_openmpi.so
++    PATHS ${CUDSS_LIBRARY_DIR}
++    NO_DEFAULT_PATH
++  )
++  if (NOT DEFINED MFEM_CUDSS_COMM_LIB AND CUDSS_COMM_LIB)
++    set(MFEM_CUDSS_COMM_LIB "${CUDSS_COMM_LIB}")
++  endif()
++  message(STATUS "CUDSS communication layer library: ${MFEM_CUDSS_COMM_LIB}")
++endif()
+diff --git a/config/config.hpp b/config/config.hpp
+index b3b03666b2..27106dfbd2 100644
+--- a/config/config.hpp
++++ b/config/config.hpp
+@@ -157,4 +157,10 @@ constexpr real_t operator""_r(unsigned long long v)
+ #endif
+ #endif // MFEM_USE_MPI not defined
+ 
++#ifndef MFEM_USE_CUDA
++#ifdef MFEM_USE_CUDSS
++#error Building with cuDSS (MFEM_USE_CUDSS=YES) requires CUDA (MFEM_USE_CUDA=YES)
++#endif
++#endif // MFEM_USE_CUDSS not defined
++
+ #endif // MFEM_CONFIG_HPP
+diff --git a/config/config.hpp.in b/config/config.hpp.in
+index 964fc0eea8..4c4bf9b2d3 100644
+--- a/config/config.hpp.in
++++ b/config/config.hpp.in
+@@ -108,6 +108,15 @@
+ // Enable MFEM functionality based on the STRUMPACK library.
+ // #define MFEM_USE_STRUMPACK
+ 
++// Enable MFEM functionality based on the cuDSS library.
++// #define MFEM_USE_CUDSS
++
++// CUDSS communication layer library path
++// #define MFEM_CUDSS_COMM_LIB "@MFEM_CUDSS_COMM_LIB@"
++
++// CUDSS threading layer library path
++// #define MFEM_CUDSS_THREADING_LIB "@MFEM_CUDSS_THREADING_LIB@"
++
+ // Enable MFEM features based on the Ginkgo library.
+ // #define MFEM_USE_GINKGO
+ 
+diff --git a/config/config.mk.in b/config/config.mk.in
+index dae3d75606..6346853061 100644
+--- a/config/config.mk.in
++++ b/config/config.mk.in
+@@ -36,6 +36,9 @@ MFEM_USE_SUPERLU       = @MFEM_USE_SUPERLU@
+ MFEM_USE_SUPERLU5      = @MFEM_USE_SUPERLU5@
+ MFEM_USE_MUMPS         = @MFEM_USE_MUMPS@
+ MFEM_USE_STRUMPACK     = @MFEM_USE_STRUMPACK@
++MFEM_USE_CUDSS         = @MFEM_USE_CUDSS@
++MFEM_CUDSS_COMM_LIB    = @MFEM_CUDSS_COMM_LIB@
++MFEM_CUDSS_THREADING_LIB = @MFEM_CUDSS_THREADING_LIB@
+ MFEM_USE_GINKGO        = @MFEM_USE_GINKGO@
+ MFEM_USE_AMGX          = @MFEM_USE_AMGX@
+ MFEM_USE_MAGMA         = @MFEM_USE_MAGMA@
+diff --git a/config/defaults.cmake b/config/defaults.cmake
+index f31f81736b..4fc8220107 100644
+--- a/config/defaults.cmake
++++ b/config/defaults.cmake
+@@ -38,6 +38,7 @@ option(MFEM_USE_SUPERLU "Enable SuperLU_DIST usage" OFF)
+ option(MFEM_USE_SUPERLU5 "Use the old SuperLU_DIST 5.1 version" OFF)
+ option(MFEM_USE_MUMPS "Enable MUMPS usage" OFF)
+ option(MFEM_USE_STRUMPACK "Enable STRUMPACK usage" OFF)
++option(MFEM_USE_CUDSS "Enable cuDSS usage" OFF)
+ option(MFEM_USE_GINKGO "Enable Ginkgo usage" OFF)
+ option(MFEM_USE_AMGX "Enable AmgX usage" OFF)
+ option(MFEM_USE_MAGMA "Enable MAGMA usage" OFF)
+diff --git a/config/defaults.mk b/config/defaults.mk
+index 08a01c80fd..3fa1c2e20c 100644
+--- a/config/defaults.mk
++++ b/config/defaults.mk
+@@ -152,6 +152,7 @@ MFEM_USE_SUPERLU       = NO
+ MFEM_USE_SUPERLU5      = NO
+ MFEM_USE_MUMPS         = NO
+ MFEM_USE_STRUMPACK     = NO
++MFEM_USE_CUDSS         = NO
+ MFEM_USE_GINKGO        = NO
+ MFEM_USE_AMGX          = NO
+ MFEM_USE_MAGMA         = NO
+@@ -367,6 +368,19 @@ STRUMPACK_OPT = -I$(STRUMPACK_DIR)/include $(SCOTCH_OPT)
+ STRUMPACK_LIB = -L$(STRUMPACK_DIR)/lib -lstrumpack $(MPI_FORTRAN_LIB)\
+  $(SCOTCH_LIB) $(SCALAPACK_LIB)
+ 
++# CUDSS library configuration
++CUDSS_DIR         = @MFEM_DIR@/../cudss
++CUDSS_INCLUDE_DIR = $(CUDSS_DIR)/include
++CUDSS_LIBRARY_DIR = $(CUDSS_DIR)/lib
++CUDSS_OPT         = -I$(CUDSS_INCLUDE_DIR)
++CUDSS_LIB         = \
++ $(XLINKER)-rpath,$(CUDSS_LIBRARY_DIR) -L$(CUDSS_LIBRARY_DIR) -lcudss
++# The cuDSS communication and threading libraries.
++MFEM_CUDSS_COMM_LIB = $(abspath $(wildcard $(or $(CUDSS_COMM_LIB),\
++   $(subst @MFEM_DIR@,$(MFEM_DIR), $(CUDSS_LIBRARY_DIR)/libcudss_commlayer_openmpi.so))))
++MFEM_CUDSS_THREADING_LIB = $(abspath $(wildcard $(or $(CUDSS_THREADING_LIB),\
++   $(subst @MFEM_DIR@,$(MFEM_DIR),$(CUDSS_LIBRARY_DIR)/libcudss_mtlayer_gomp.so))))
++
+ # Ginkgo library configuration
+ GINKGO_DIR = @MFEM_DIR@/../ginkgo/install
+ GINKGO_SEARCH_DIR = $(subst @MFEM_DIR@,$(MFEM_DIR),$(GINKGO_DIR))
+diff --git a/examples/ex1.cpp b/examples/ex1.cpp
+index c47543b449..4192f4b4b4 100644
+--- a/examples/ex1.cpp
++++ b/examples/ex1.cpp
+@@ -224,17 +224,29 @@ int main(int argc, char *argv[])
+    // 11. Solve the linear system A X = B.
+    if (!pa)
+    {
++#ifdef MFEM_USE_CUDSS
++      if (Device::Allows(Backend::CUDA_MASK))
++      {
++         // Use cuDSS to solve the system.
++         CuDSSSolver cudss_solver;
++         cudss_solver.SetOperator(*A);
++         cudss_solver.Mult(B, X);
++      }
++      else
++#endif
++      {
+ #ifndef MFEM_USE_SUITESPARSE
+-      // Use a simple symmetric Gauss-Seidel preconditioner with PCG.
+-      GSSmoother M((SparseMatrix&)(*A));
+-      PCG(*A, M, B, X, 1, 200, 1e-12, 0.0);
++         // Use a simple symmetric Gauss-Seidel preconditioner with PCG.
++         GSSmoother M((SparseMatrix&)(*A));
++         PCG(*A, M, B, X, 1, 200, 1e-12, 0.0);
+ #else
+-      // If MFEM was compiled with SuiteSparse, use UMFPACK to solve the system.
+-      UMFPackSolver umf_solver;
+-      umf_solver.Control[UMFPACK_ORDERING] = UMFPACK_ORDERING_METIS;
+-      umf_solver.SetOperator(*A);
+-      umf_solver.Mult(B, X);
++         // If MFEM was compiled with SuiteSparse, use UMFPACK to solve the system.
++         UMFPackSolver umf_solver;
++         umf_solver.Control[UMFPACK_ORDERING] = UMFPACK_ORDERING_METIS;
++         umf_solver.SetOperator(*A);
++         umf_solver.Mult(B, X);
+ #endif
++      }
+    }
+    else
+    {
+@@ -273,7 +285,7 @@ int main(int argc, char *argv[])
+    if (visualization)
+    {
+       char vishost[] = "localhost";
+-      int  visport   = 19916;
++      int visport = 19916;
+       socketstream sol_sock(vishost, visport);
+       sol_sock.precision(8);
+       sol_sock << "solution\n" << mesh << x << flush;
+diff --git a/examples/ex1p.cpp b/examples/ex1p.cpp
+index d7a47699c8..b3360a93db 100644
+--- a/examples/ex1p.cpp
++++ b/examples/ex1p.cpp
+@@ -83,6 +83,9 @@ int main(int argc, char *argv[])
+    const char *device_config = "cpu";
+    bool visualization = true;
+    bool algebraic_ceed = false;
++#ifdef MFEM_USE_CUDSS
++   bool cudss_solver = false;
++#endif
+ 
+    OptionsParser args(argc, argv);
+    args.AddOption(&mesh_file, "-m", "--mesh",
+@@ -102,6 +105,10 @@ int main(int argc, char *argv[])
+    args.AddOption(&algebraic_ceed, "-a", "--algebraic",
+                   "-no-a", "--no-algebraic",
+                   "Use algebraic Ceed solver");
++#endif
++#ifdef MFEM_USE_CUDSS
++   args.AddOption(&cudss_solver, "-cudss", "--cudss-solver", "-no-cudss",
++                  "--no-cudss-solver", "Use the cuDSS Solver.");
+ #endif
+    args.AddOption(&visualization, "-vis", "--visualization", "-no-vis",
+                   "--no-visualization",
+@@ -248,33 +255,51 @@ int main(int argc, char *argv[])
+    // 13. Solve the linear system A X = B.
+    //     * With full assembly, use the BoomerAMG preconditioner from hypre.
+    //     * With partial assembly, use Jacobi smoothing, for now.
+-   Solver *prec = NULL;
+-   if (pa)
++#ifdef MFEM_USE_CUDSS
++   if (!pa && (Device::Allows(Backend::CUDA_MASK) && cudss_solver))
++   {
++      // Solve using a direct solver with cuDSS
++      CuDSSSolver cudss_solver(MPI_COMM_WORLD);
++      cudss_solver.SetMatrixSymType(
++         CuDSSSolver::SYMMETRIC_POSITIVE_DEFINITE);
++      cudss_solver.SetMatrixViewType(CuDSSSolver::UPPER);
++      cudss_solver.SetOperator(*A);
++      cudss_solver.Mult(B, X);
++   }
++   else
++#endif
+    {
+-      if (UsesTensorBasis(fespace))
++      Solver *prec = NULL;
++      if (pa)
+       {
+-         if (algebraic_ceed)
+-         {
+-            prec = new ceed::AlgebraicSolver(a, ess_tdof_list);
+-         }
+-         else
++         if (UsesTensorBasis(fespace))
+          {
+-            prec = new OperatorJacobiSmoother(a, ess_tdof_list);
++            if (algebraic_ceed)
++            {
++               prec = new ceed::AlgebraicSolver(a, ess_tdof_list);
++            }
++            else
++            {
++               prec = new OperatorJacobiSmoother(a, ess_tdof_list);
++            }
+          }
+       }
++      else
++      {
++         prec = new HypreBoomerAMG;
++      }
++      CGSolver cg(MPI_COMM_WORLD);
++      cg.SetRelTol(1e-12);
++      cg.SetMaxIter(2000);
++      cg.SetPrintLevel(1);
++      if (prec)
++      {
++         cg.SetPreconditioner(*prec);
++      }
++      cg.SetOperator(*A);
++      cg.Mult(B, X);
++      delete prec;
+    }
+-   else
+-   {
+-      prec = new HypreBoomerAMG;
+-   }
+-   CGSolver cg(MPI_COMM_WORLD);
+-   cg.SetRelTol(1e-12);
+-   cg.SetMaxIter(2000);
+-   cg.SetPrintLevel(1);
+-   if (prec) { cg.SetPreconditioner(*prec); }
+-   cg.SetOperator(*A);
+-   cg.Mult(B, X);
+-   delete prec;
+ 
+    // 14. Recover the parallel grid function corresponding to X. This is the
+    //     local finite element solution on each processor.
+diff --git a/linalg/CMakeLists.txt b/linalg/CMakeLists.txt
+index 1d7710f6d3..dacd97fbe6 100644
+--- a/linalg/CMakeLists.txt
++++ b/linalg/CMakeLists.txt
+@@ -148,6 +148,12 @@ if (MFEM_USE_MKL_PARDISO)
+   list(APPEND HDRS pardiso.hpp)
+ endif()
+ 
++# cudss solver
++if (MFEM_USE_CUDSS)
++  list(APPEND SRCS cudss.cpp)
++  list(APPEND HDRS cudss.hpp)
++endif()
++
+ convert_filenames_to_full_paths(SRCS)
+ convert_filenames_to_full_paths(HDRS)
+ 
+diff --git a/linalg/cudss.cpp b/linalg/cudss.cpp
+new file mode 100644
+index 0000000000..8cbf8d8387
+--- /dev/null
++++ b/linalg/cudss.cpp
+@@ -0,0 +1,409 @@
++// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
++// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
++// LICENSE and NOTICE for details. LLNL-CODE-806117.
++//
++// This file is part of the MFEM library. For more information and source code
++// availability visit https://mfem.org.
++//
++// MFEM is free software; you can redistribute it and/or modify it under the
++// terms of the BSD-3 license. We welcome feedback and contributions, see file
++// CONTRIBUTING.md for details.
++
++#include "cudss.hpp"
++#include "../general/communication.hpp"
++#include <string>
++
++#ifdef MFEM_USE_CUDSS
++
++#ifdef MFEM_USE_SINGLE
++#define CUDA_REAL_T CUDA_R_32F
++#else
++#define CUDA_REAL_T CUDA_R_64F
++#endif
++
++// Define a cuDSS error check macro, MFEM_CUDSS_CHECK(x), where x returns/is of
++// type 'cudssStatus_t'. This macro evaluates 'x' and raises an error if the
++// result is not CUDSS_STATUS_SUCCESS.
++#define MFEM_CUDSS_CHECK(x)                                          \
++  do {                                                               \
++    cudssStatus_t mfem_err_internal_var_name = (x);                  \
++    if (mfem_err_internal_var_name != CUDSS_STATUS_SUCCESS) {        \
++      ::mfem::mfem_cudss_error(mfem_err_internal_var_name, #x,       \
++                               _MFEM_FUNC_NAME, __FILE__, __LINE__); \
++    }                                                                \
++  } while (0)
++
++namespace mfem
++{
++// Function used by the macro MFEM_CUDSS_CHECK.
++void mfem_cudss_error(cudssStatus_t status, const char *expr, const char *func,
++                      const char *file, int line)
++{
++   mfem::err << "\n\nCUDSS error: (" << expr << ") failed with error:\n --> "
++             << "CUDSS call ended unsuccessfully"
++             << " [code: " << static_cast<int>(status) << ']'
++             << "\n ... in function: " << func << "\n ... in file: " << file
++             << ':' << line << '\n';
++   mfem_error();
++}
++
++CuDSSSolver::CuDSSSolver() { InitCuDSS(); }
++
++#ifdef MFEM_USE_MPI
++CuDSSSolver::CuDSSSolver(MPI_Comm comm_) : mpi_comm(comm_)
++{
++   InitCuDSS();
++
++   // NOTE: Set the communication layer to NULL so that cuDSS picks it
++   // from the environment variable "CUDSS_COMM_LIB"
++   const char* comm_lib = GetEnv("CUDSS_COMM_LIB");
++#ifdef MFEM_CUDSS_COMM_LIB
++   if (comm_lib == nullptr)
++   {
++      comm_lib = MFEM_CUDSS_COMM_LIB;
++   }
++#endif
++   MFEM_CUDSS_CHECK(cudssSetCommLayer(handle, comm_lib));
++
++   MFEM_CUDSS_CHECK(cudssDataSet(handle, solverData, CUDSS_DATA_COMM,
++                                 &mpi_comm, sizeof(MPI_Comm *)));
++}
++#endif // MFEM_USE_MPI
++
++CuDSSSolver::~CuDSSSolver()
++{
++   // Destroy the system Matrix, RHS vector and solution vector
++   if (Ac)
++   {
++      MFEM_CUDSS_CHECK(cudssMatrixDestroy(*Ac));
++      MFEM_CUDSS_CHECK(cudssMatrixDestroy(xc));
++      MFEM_CUDSS_CHECK(cudssMatrixDestroy(yc));
++   }
++
++   // Destroy the cuDSS handle, solver config and solver data
++   MFEM_CUDSS_CHECK(cudssDataDestroy(handle, solverData));
++   MFEM_CUDSS_CHECK(cudssConfigDestroy(solverConfig));
++
++
++   MFEM_CUDSS_CHECK(cudssDestroy(handle));
++   handle = nullptr;
++
++
++   if (csr_offsets_d != NULL)
++   {
++      CuMemFree(csr_offsets_d);
++   }
++
++   if (csr_columns_d != NULL)
++   {
++      CuMemFree(csr_columns_d);
++   }
++
++   if (csr_values_d != NULL)
++   {
++      CuMemFree(csr_values_d);
++   }
++}
++
++void CuDSSSolver::InitCuDSS()
++{
++   // Create the cuDSS handle
++   MFEM_CUDSS_CHECK(cudssCreate(&handle));
++
++#ifdef MFEM_USE_OPENMP
++   // NOTE: Set the threading layer library name to NULL so that cuDSS picks
++   // it from the environment variable "CUDSS_THREADING_LIB"
++   const char* threading_lib = GetEnv("CUDSS_THREADING_LIB");
++#ifdef MFEM_CUDSS_THREADING_LIB
++   if (threading_lib == nullptr)
++   {
++      threading_lib = MFEM_CUDSS_THREADING_LIB;
++   }
++#endif
++   MFEM_CUDSS_CHECK(cudssSetThreadingLayer(handle, threading_lib));
++#endif  // MFEM_USE_OPENMP
++
++   // Create the solver configuration and data objects
++   MFEM_CUDSS_CHECK(cudssConfigCreate(&solverConfig));
++   MFEM_CUDSS_CHECK(cudssDataCreate(handle, &solverData));
++}
++
++void CuDSSSolver::SetMatrixSymType(MatType mtype_)
++{
++   switch (mtype_)
++   {
++      case MatType::SYMMETRIC_INDEFINITE:
++         mat_type = CUDSS_MTYPE_SYMMETRIC;
++         break;
++      case MatType::SYMMETRIC_POSITIVE_DEFINITE:
++         mat_type = CUDSS_MTYPE_SPD;
++         break;
++      default:
++         mat_type = CUDSS_MTYPE_GENERAL;
++         break;
++   }
++}
++
++void CuDSSSolver::SetMatrixViewType(MatViewType mvtype_)
++{
++   // If the MatType is NONSYMMETRIC, the matrix view type must be FULL.
++   if (mat_type == CUDSS_MTYPE_GENERAL)
++   {
++      mview = CUDSS_MVIEW_FULL;
++      return;
++   }
++
++   // If the matrix is symmetric, the following view type will be optional.
++   switch (mvtype_)
++   {
++      case MatViewType::LOWER:
++         mview = CUDSS_MVIEW_LOWER;
++         break;
++      case MatViewType::UPPER:
++         mview = CUDSS_MVIEW_UPPER;
++         break;
++      default:
++         mview = CUDSS_MVIEW_FULL;
++         break;
++   }
++}
++
++void CuDSSSolver::SetReorderingReuse(bool reuse)
++{
++   MFEM_VERIFY(Ac == nullptr,
++               "Set reordering reuse before setting the operator!");
++   reorder_reuse = reuse;
++}
++
++#ifdef MFEM_USE_MPI
++void CuDSSSolver::SetMatrix(const HypreParMatrix &op)
++{
++   bool cuDSSObjectInitialized = (Ac != nullptr);
++
++   hypre_ParCSRMatrix *parcsr_op = op;
++   op.HypreRead();
++   hypre_CSRMatrix *csr_op = hypre_MergeDiagAndOffd(parcsr_op);
++   op.HypreRead();
++#if MFEM_HYPRE_VERSION >= 21600
++   hypre_CSRMatrixBigJtoJ(csr_op);
++#endif
++
++   // Parameters of the Operator
++   n_loc = height;  // Equal to the csr_op->num_rows
++   n_global = internal::to_int(parcsr_op->global_num_rows);
++   row_start = parcsr_op->first_row_index;
++   row_end = row_start + n_loc - 1;
++   MFEM_VERIFY(!cuDSSObjectInitialized || !reorder_reuse ||
++               (reorder_reuse && (nnz == csr_op->num_nonzeros)),
++               "Inconsistent new matrix pattern!");
++   nnz = csr_op->num_nonzeros;
++
++   SetMatrixCuDSS(csr_op->i, csr_op->j, csr_op->data);
++   hypre_CSRMatrixDestroy(csr_op);
++}
++#endif // MFEM_USE_MPI
++
++void CuDSSSolver::SetMatrix(const SparseMatrix &op)
++{
++   bool cuDSSObjectInitialized = (Ac != nullptr);
++
++   // Parameters of the Operator
++   MFEM_VERIFY(!cuDSSObjectInitialized || !reorder_reuse ||
++               (reorder_reuse && (nnz == op.NumNonZeroElems())),
++               "Inconsistent new matrix pattern!");
++
++   SparseMatrix *A = const_cast<SparseMatrix *>(&op);
++
++   nnz = A->NumNonZeroElems();
++   n_global = height;  // Equal to the height in serial
++   n_loc = height;     // Equal to the height in serial
++
++   int *csr_offsets = const_cast<int *>(A->ReadI());
++   int *csr_columns = const_cast<int *>(A->ReadJ());
++   real_t *csr_values = const_cast<real_t *>(A->ReadData());
++
++   SetMatrixCuDSS(csr_offsets, csr_columns, csr_values);
++}
++
++void CuDSSSolver::SetMatrixCuDSS(int *csr_offsets, int *csr_columns,
++                                 real_t *csr_values)
++{
++   bool cuDSSObjectInitialized = (Ac != nullptr);
++   // Initial the cudssMatrix objects
++   if (!cuDSSObjectInitialized)
++   {
++      // Set the cudssMatrix object of csr operator
++      Ac = std::make_unique<cudssMatrix_t>();
++      // Create empty RHS and solution vectors
++      SetNumRHS(1);
++      // Allocate device memory for csr values
++      CuMemAlloc(&csr_values_d, nnz * sizeof(real_t));
++   }
++
++   CuMemcpyDtoD(csr_values_d, csr_values, nnz * sizeof(real_t));
++
++   // New cuDSS CSR matrix object and analysis or reuse the one from a previous
++   // matrix
++   if (!cuDSSObjectInitialized || !reorder_reuse)
++   {
++      if (reorder_reuse)  // !cuDSSObjectInitialized && reorder_reuse
++      {
++         // NOTE: For CuDSS solver to reuse the reordering (skipping analysis
++         // phase), it needs to access the I and J arrays of the **initial**
++         // matrix. Therefore, we need to copy and keep I and J in device memory.
++         CuMemAlloc(&csr_offsets_d, (n_loc + 1) * sizeof(int));
++         CuMemAlloc(&csr_columns_d, nnz * sizeof(int));
++
++         CuMemcpyDtoD(csr_offsets_d, csr_offsets, (n_loc + 1) * sizeof(int));
++         CuMemcpyDtoD(csr_columns_d, csr_columns, nnz * sizeof(int));
++
++         MFEM_CUDSS_CHECK(
++            cudssMatrixCreateCsr(
++               Ac.get(), n_global, n_global, nnz, csr_offsets_d, NULL,
++               csr_columns_d, csr_values_d, CUDA_R_32I, CUDA_REAL_T, mat_type, mview,
++               CUDSS_BASE_ZERO));
++      }
++      else    // !reorder_reuse
++      {
++         if (cuDSSObjectInitialized)
++         {
++            MFEM_CUDSS_CHECK(cudssMatrixDestroy(*Ac));
++         }
++         MFEM_CUDSS_CHECK(
++            cudssMatrixCreateCsr(
++               Ac.get(), n_global, n_global, nnz, csr_offsets, NULL, csr_columns,
++               csr_values_d, CUDA_R_32I, CUDA_REAL_T, mat_type, mview,
++               CUDSS_BASE_ZERO));
++      }
++#ifdef MFEM_USE_MPI
++      if (Mpi::IsInitialized())
++      {
++         MFEM_CUDSS_CHECK(cudssMatrixSetDistributionRow1d(*Ac, row_start, row_end));
++      }
++#endif
++      // Analysis
++      MFEM_CUDSS_CHECK(cudssExecute(handle, CUDSS_PHASE_ANALYSIS, solverConfig,
++                                    solverData, *Ac, yc, xc));
++   }
++   else    // cuDSSObjectInitialized && reorder_reuse
++   {
++      // NOTE: When reusing analysis result, we only update the Data array,
++      // without changing the I and J arrays.
++      MFEM_CUDSS_CHECK(cudssMatrixSetValues(*Ac, csr_values_d));
++   }
++
++   // Factorization
++   MFEM_CUDSS_CHECK(cudssExecute(handle, CUDSS_PHASE_FACTORIZATION, solverConfig,
++                                 solverData, *Ac, yc, xc));
++}
++
++void CuDSSSolver::SetOperator(const Operator &op)
++{
++   bool cuDSSObjectInitialized = (Ac != nullptr);
++   MFEM_VERIFY(
++      !cuDSSObjectInitialized || (height == op.Height() && width == op.Width()),
++      "Inconsistent new matrix size!");
++   height = op.Height();
++   width = op.Width();
++   if (const SparseMatrix *A = dynamic_cast<const SparseMatrix *>(&op))
++   {
++      SetMatrix(*A);
++   }
++#ifdef MFEM_USE_MPI
++   else if (const HypreParMatrix *A =
++               dynamic_cast<const HypreParMatrix *>(&op))
++   {
++      SetMatrix(*A);
++   }
++#endif // MFEM_USE_MPI
++   else
++   {
++      MFEM_ABORT("Unsupported Operator Type \n");
++   }
++}
++
++void CuDSSSolver::SetNumRHS(int nrhs_) const
++{
++   if (nrhs != nrhs_)
++   {
++      if (nrhs > 0)
++      {
++         // Destroy the previous RHS vector and solution vector
++         MFEM_CUDSS_CHECK(cudssMatrixDestroy(xc));
++         MFEM_CUDSS_CHECK(cudssMatrixDestroy(yc));
++      }
++      // Create empty RHS and solution vectors
++      MFEM_CUDSS_CHECK(cudssMatrixCreateDn(&xc, n_global, nrhs_, n_global, NULL,
++                                           CUDA_REAL_T, CUDSS_LAYOUT_COL_MAJOR));
++
++      MFEM_CUDSS_CHECK(cudssMatrixCreateDn(&yc, n_global, nrhs_, n_global, NULL,
++                                           CUDA_REAL_T, CUDSS_LAYOUT_COL_MAJOR));
++
++#ifdef MFEM_USE_MPI
++      MFEM_CUDSS_CHECK(cudssMatrixSetDistributionRow1d(xc, row_start, row_end));
++      MFEM_CUDSS_CHECK(cudssMatrixSetDistributionRow1d(yc, row_start, row_end));
++#endif // MFEM_USE_MPI
++   }
++   nrhs = nrhs_;
++}
++
++void CuDSSSolver::Mult(const Vector &x, Vector &y) const
++{
++   Array<const Vector *> X(1);
++   Array<Vector *> Y(1);
++   X[0] = &x;
++   Y[0] = &y;
++   ArrayMult(X, Y);
++}
++
++void CuDSSSolver::ArrayMult(const Array<const Vector *> &X,
++                            Array<Vector *> &Y) const
++{
++   SetNumRHS(X.Size());
++
++   Vector RHS, SOL;
++
++   if (nrhs == 1)
++   {
++      RHS.MakeRef(*(const_cast<Vector *>(X[0])), 0, X[0]->Size());
++      SOL.MakeRef(*Y[0], 0, Y[0]->Size());
++   }
++   else
++   {
++      // NOTE: RHS must have **global** num_rows and nrhs columns
++      RHS.SetSize(nrhs * n_global, *X[0]);
++      for (int i = 0; i < nrhs; i++)
++      {
++         Vector s(RHS, i * n_global, n_loc);
++         s = *X[i];
++      }
++
++      // NOTE: SOL must have **global** num_rows and nrhs columns
++      SOL.SetSize(nrhs * n_global, *Y[0]);
++   }
++
++   MFEM_CUDSS_CHECK(cudssMatrixSetValues(xc, const_cast<real_t *>(RHS.Read())));
++   MFEM_CUDSS_CHECK(cudssMatrixSetValues(yc, SOL.Write()));
++
++   // Solve
++   MFEM_CUDSS_CHECK(cudssExecute(handle, CUDSS_PHASE_SOLVE, solverConfig,
++                                 solverData, *Ac, yc, xc));
++
++   if (nrhs == 1)
++   {
++      SOL.SyncAliasMemory(*Y[0]);
++   }
++
++   if (nrhs > 1)
++   {
++      // Get solution for each right-hand side
++      for (int i = 0; i < nrhs; i++)
++      {
++         Vector s(SOL, i * n_global, n_loc);
++         *Y[i] = s;
++      }
++   }
++}
++
++} // namespace mfem
++#endif // MFEM_USE_CUDSS
+diff --git a/linalg/cudss.hpp b/linalg/cudss.hpp
+new file mode 100644
+index 0000000000..6f970bff15
+--- /dev/null
++++ b/linalg/cudss.hpp
+@@ -0,0 +1,224 @@
++// Copyright (c) 2010-2025, Lawrence Livermore National Security, LLC. Produced
++// at the Lawrence Livermore National Laboratory. All Rights reserved. See files
++// LICENSE and NOTICE for details. LLNL-CODE-806117.
++//
++// This file is part of the MFEM library. For more information and source code
++// availability visit https://mfem.org.
++//
++// MFEM is free software; you can redistribute it and/or modify it under the
++// terms of the BSD-3 license. We welcome feedback and contributions, see file
++// CONTRIBUTING.md for details.
++
++#ifndef MFEM_CUDSS
++#define MFEM_CUDSS
++
++#include "../config/config.hpp"
++
++#ifdef MFEM_USE_CUDSS
++
++#include "cudss.h"
++#include <memory>
++
++#ifdef MFEM_USE_MPI
++#include <mpi.h>
++#include "hypre.hpp"
++#else
++#include "operator.hpp"
++#include "sparsemat.hpp"
++#endif
++
++namespace mfem
++{
++/**
++ * @brief cuDSS: A high-performance CUDA Library for Direct Sparse Solvers
++ *
++ * Interface for the distributed cuDSS solver
++ */
++class CuDSSSolver : public Solver
++{
++public:
++   /// Specify the type of matrix we are applying the solver to
++   enum MatType
++   {
++      /// CUDSS_MTYPE_GENERAL: General matrix [default].
++      NONSYMMETRIC = 0,
++      /// CUDSS_MTYPE_SYMMETRIC: Real symmetric matrix.
++      SYMMETRIC_INDEFINITE = 1,
++      /// CUDSS_MTYPE_SPD: Symmetric positive-definite matrix.
++      SYMMETRIC_POSITIVE_DEFINITE = 2,
++   };
++
++   /// Specify the view type of matrix we are applying the solver to
++   enum MatViewType
++   {
++      /// CUDSS_MVIEW_FULL: Full matrix [default]
++      FULL = 0,
++      /// CUDSS_MVIEW_LOWER: Lower-triangular matrix (including the diagonal).
++      LOWER = 1,
++      /// CUDSS_MVIEW_UPPER: Upper-triangular matrix (including the diagonal).
++      UPPER = 2,
++   };
++
++   /**
++    * @brief Constructor.
++    */
++   CuDSSSolver();
++
++#ifdef MFEM_USE_MPI
++   /**
++    * @brief Constructor with MPI_Comm parameter.
++    */
++   CuDSSSolver(MPI_Comm comm);
++#endif
++
++   // Note: CuDSSSolver disables the move copy constructor and move assignment
++   // operator
++   CuDSSSolver(CuDSSSolver &&) = delete;
++   CuDSSSolver &operator=(CuDSSSolver &&) = delete;
++
++   /**
++    * @brief Set the matrix type
++    *
++    * Supported matrix types:
++    *          CuDSSSolver::NONSYMMETRIC,
++    *          CuDSSSolver::SYMMETRIC_INDEFINITE,
++    *      and CuDSSSolver::SYMMETRIC_POSITIVE_DEFINITE
++    *
++    * @param mtype_ Matrix type
++    *
++    * @note This method has to be called before SetOperator
++    */
++   void SetMatrixSymType(MatType mtype_);
++
++   /**
++    * @brief Set the matrix view type
++    *
++    * Supported matrix types:
++    *          CuDSSSolver::FULL,
++    *          CuDSSSolver::LOWER,
++    *      and CuDSSSolver::UPPER
++    *
++    * @param mvtype Matrix view type
++    *
++    * @note This method has to be called before SetOperator
++    */
++   void SetMatrixViewType(MatViewType mvtype);
++
++   /**
++    * @brief Set the flag controlling reuse of the symbolic factorization
++    * for multiple operators
++    *
++    * @param reuse Flag to reuse symbolic factorization
++    *
++    * @note This method has to be called before repeated calls to SetOperator
++    */
++   void SetReorderingReuse(bool reuse);
++
++   void SetOperator(const Operator &op) override;
++
++   /**
++    * @brief Solve $ y = Op^{-1} x $
++    *
++    * @param x RHS vector
++    * @param y Solution vector
++    */
++   void Mult(const Vector &x, Vector &y) const override;
++
++   /**
++    * @brief Solve $ Y_i = Op^{-1} X_i $
++    *
++    * @param X Array of RHS vectors
++    * @param Y Array of Solution vectors
++    */
++   void ArrayMult(const Array<const Vector *> &X,
++                  Array<Vector *> &Y) const override;
++
++   ~CuDSSSolver();
++
++private:
++#ifdef MFEM_USE_MPI
++   // MPI_Comm
++   MPI_Comm mpi_comm = MPI_COMM_NULL;
++
++   int row_start = 0;  // the first row index in CSR matrix operator
++   int row_end = 0;    // the end row index in CSR matrix operator
++#endif
++
++   // Parameter controlling whether or not to reuse the symbolic factorization
++   // for multiple calls to SetOperator
++   bool reorder_reuse = false;
++
++   // Parameter controlling the matrix type
++   cudssMatrixType_t mat_type = CUDSS_MTYPE_GENERAL;
++
++   int n_global = 0;      // global number of rows
++   int n_loc = 0;         // the number of the rows in CSR matrix operator
++
++   mutable int nrhs = 0;  // the number of the RHSs
++   int nnz = 0;        // the number of non zeros
++
++   // copy and keep the I and J arrays in device memory when skipping analysis
++   // phase
++   void *csr_offsets_d = NULL; // copy and keep I in device
++   void *csr_columns_d = NULL; // copy and keep J in device
++   void *csr_values_d = NULL;  // copy and keep csr data in device
++
++   // cuDSS object specifies available matrix types for sparse matrices
++   cudssMatrixViewType_t mview = CUDSS_MVIEW_FULL;
++
++   // cuDSS objects storage for sparse matrix Ac, RHS yc and solution xc
++   std::unique_ptr<cudssMatrix_t> Ac;
++   mutable cudssMatrix_t xc, yc;
++
++   // common for all cuDSS solver instances.
++   // cuDSS object holds the cuDSS library context
++   cudssHandle_t handle;
++
++   // cuDSS object stores configuration settings for the solver
++   mutable cudssConfig_t solverConfig;
++   // cuDSS object holds internal data
++   mutable cudssData_t solverData;
++
++   /// Method for configuring storage for distributed/centralized RHS and
++   /// solution
++   void SetNumRHS(int nrhs_) const;
++
++#ifdef MFEM_USE_MPI
++   /**
++    * @brief Set the HypreParMatrix object
++    *
++    * @param op HypreParMatrix object
++    *
++    * @note This method is called inside SetOperator
++    */
++   void SetMatrix(const HypreParMatrix &op);
++#endif
++
++   /**
++    * @brief Set the SparseMatrix object
++    *
++    * @param op SparseMatrix object
++    *
++    * @note This method is called inside SetOperator
++    */
++   void SetMatrix(const SparseMatrix &op);
++
++   /**
++    * @brief Set the matrix values for cuDSS
++    *
++    * @param csr_offsets Row offsets of the CSR matrix
++    * @param csr_columns Column indices of the CSR matrix
++    * @param csr_values Non-zero values of the CSR matrix
++    *
++    * @note This method is called inside SetMatrix.
++   */
++   void SetMatrixCuDSS(int* csr_offsets, int* csr_columns, real_t* csr_values);
++
++   /// Method for initializing the cuDSS library and creating the cuDSS handle
++   void InitCuDSS();
++};
++
++} // namespace mfem
++
++#endif // MFEM_USE_CUDSS
++#endif // MFEM_CUDSS
+diff --git a/linalg/linalg.hpp b/linalg/linalg.hpp
+index a3b129c849..7fe2e8804b 100644
+--- a/linalg/linalg.hpp
++++ b/linalg/linalg.hpp
+@@ -91,4 +91,8 @@
+ 
+ #endif // MFEM_USE_MPI
+ 
++#ifdef MFEM_USE_CUDSS
++#include "cudss.hpp"
++#endif
++
+ #endif
+diff --git a/makefile b/makefile
+index f33f648bf5..efc294b23f 100644
+--- a/makefile
++++ b/makefile
+@@ -302,7 +302,7 @@ endif
+ MFEM_REQ_LIB_DEPS = SUPERLU MUMPS METIS FMS CONDUIT SIDRE LAPACK SUNDIALS\
+  SUITESPARSE STRUMPACK GINKGO GNUTLS HDF5 NETCDF SLEPC PETSC MPFR PUMI HIOP\
+  GSLIB OCCA CEED RAJA UMPIRE MKL_CPARDISO MKL_PARDISO AMGX MAGMA CALIPER PARELAG\
+- TRIBOL BENCHMARK MOONOLITH ALGOIM
++ TRIBOL BENCHMARK MOONOLITH ALGOIM CUDSS
+ 
+ 
+ PETSC_ERROR_MSG = $(if $(PETSC_FOUND),,. PETSC config not found: $(PETSC_VARS))
+@@ -371,7 +371,8 @@ MFEM_DEFINES = MFEM_VERSION MFEM_VERSION_STRING MFEM_GIT_STRING MFEM_USE_MPI\
+  MFEM_USE_SIMD MFEM_USE_ADIOS2 MFEM_USE_MKL_CPARDISO MFEM_USE_MKL_PARDISO MFEM_USE_AMGX\
+  MFEM_USE_MAGMA MFEM_USE_MUMPS MFEM_USE_ADFORWARD MFEM_USE_CODIPACK MFEM_USE_CALIPER\
+  MFEM_USE_BENCHMARK MFEM_USE_PARELAG MFEM_USE_TRIBOL MFEM_USE_ALGOIM MFEM_USE_ENZYME\
+- MFEM_SOURCE_DIR MFEM_INSTALL_DIR MFEM_SHARED_BUILD MFEM_USE_DOUBLE MFEM_USE_SINGLE
++ MFEM_SOURCE_DIR MFEM_INSTALL_DIR MFEM_SHARED_BUILD MFEM_USE_DOUBLE MFEM_USE_SINGLE\
++ MFEM_USE_CUDSS MFEM_CUDSS_COMM_LIB MFEM_CUDSS_THREADING_LIB
+ 
+ # List of makefile variables that will be written to config.mk:
+ MFEM_CONFIG_VARS = MFEM_CXX MFEM_HOST_CXX MFEM_CPPFLAGS MFEM_CXXFLAGS\
+@@ -406,7 +407,7 @@ MFEM_INSTALL_DIR = $(abspath $(MFEM_PREFIX))
+ # If we have 'config' target, export variables used by config/makefile
+ ifneq (,$(filter config,$(MAKECMDGOALS)))
+    export $(MFEM_DEFINES) MFEM_DEFINES $(MFEM_CONFIG_VARS) MFEM_CONFIG_VARS
+-   export VERBOSE HYPRE_OPT PUMI_DIR MUMPS_OPT GSLIB_OPT
++   export VERBOSE HYPRE_OPT PUMI_DIR MUMPS_OPT GSLIB_OPT CUDSS_OPT
+ endif
+ 
+ # If we have 'install' target, export variables used by config/makefile
+@@ -737,6 +738,7 @@ status info:
+ 	$(info MFEM_USE_SUPERLU5      = $(MFEM_USE_SUPERLU5))
+ 	$(info MFEM_USE_MUMPS         = $(MFEM_USE_MUMPS))
+ 	$(info MFEM_USE_STRUMPACK     = $(MFEM_USE_STRUMPACK))
++	$(info MFEM_USE_CUDSS         = $(MFEM_USE_CUDSS))
+ 	$(info MFEM_USE_GINKGO        = $(MFEM_USE_GINKGO))
+ 	$(info MFEM_USE_AMGX          = $(MFEM_USE_AMGX))
+ 	$(info MFEM_USE_MAGMA         = $(MFEM_USE_MAGMA))
diff --git a/extern/patch/mfem/mfem_pr5246.diff b/extern/patch/mfem/mfem_pr5246.diff
new file mode 100644
index 0000000000..2f1eecdcf4
--- /dev/null
+++ b/extern/patch/mfem/mfem_pr5246.diff
@@ -0,0 +1,1389 @@
+diff --git a/fem/intrules.cpp b/fem/intrules.cpp
+index a88ba2d0a0..d5963f6788 100644
+--- a/fem/intrules.cpp
++++ b/fem/intrules.cpp
+@@ -14,6 +14,18 @@
+ // Acknowledgment: Some of the high-precision triangular and tetrahedral
+ // quadrature rules below were obtained from the Encyclopaedia of Cubature
+ // Formulas at http://nines.cs.kuleuven.be/research/ecf/ecf.html
++//
++// Positive-weight simplex quadrature rules from:
++//
++//   [1] F.D. Witherden, P.E. Vincent, "On the identification of symmetric
++//       quadrature rules for finite element methods", Computers & Mathematics
++//       with Applications, 69(10):1232-1241, 2015.
++//       Used for: triangles (d=0-20), tetrahedra (d=0-13).
++//
++//   [2] G. Chuluunbaatar, O. Chuluunbaatar, A.A. Gusev, S.I. Vinitsky,
++//       "PI-type fully symmetric quadrature rules on the 3-,...,6-simplexes",
++//       Computers & Mathematics with Applications, 124:89-97, 2022.
++//       Used for: tetrahedra (d=14-20).
+ 
+ #include "fem.hpp"
+ #include "../mesh/nurbs.hpp"
+@@ -982,7 +994,7 @@ IntegrationRules::IntegrationRules(int ref, int type)
+    SquareIntRules.SetSize(32, h_mt);
+    SquareIntRules = NULL;
+ 
+-   // TetrahedronIntegrationRule() assumes that this size is >= 10
++   // TetrahedronIntegrationRule() assumes that this size is >= 22
+    TetrahedronIntRules.SetSize(32, h_mt);
+    TetrahedronIntRules = NULL;
+ 
+@@ -1250,344 +1262,350 @@ IntegrationRule *IntegrationRules::SegmentIntegrationRule(int Order)
+    return ir;
+ }
+ 
+-// Integration rules for reference triangle {[0,0],[1,0],[0,1]}
++// Triangle rules from Witherden & Vincent [1].
++// Orbit data from PyFR (https://pyfr.org), licensed under CC-BY 4.0.
+ IntegrationRule *IntegrationRules::TriangleIntegrationRule(int Order)
+ {
+    IntegrationRule *ir = NULL;
+-   // Note: Set TriangleIntRules[*] to ir only *after* ir is fully constructed.
+-   // This is needed in multithreaded environment.
+ 
+-   // assuming that orders <= 25 are pre-allocated
+    switch (Order)
+    {
+-      case 0:  // 1 point - degree 1
++      case 0:
+       case 1:
+          ir = new IntegrationRule(1);
+          ir->AddTriMidPoint(0, 0.5);
+          ir->SetOrder(1);
+-         TriangleIntRules[0] = TriangleIntRules[1] = ir;
++         TriangleIntRules[0] =
++            TriangleIntRules[1] = ir;
+          return ir;
+ 
+-      case 2:  // 3 point - 2 degree
++      case 2:
+          ir = new IntegrationRule(3);
+          ir->AddTriPoints3(0, 1./6., 1./6.);
+          ir->SetOrder(2);
+          TriangleIntRules[2] = ir;
+-         // interior points
+-         return ir;
+-
+-      case 3:  // 4 point - 3 degree (has one negative weight)
+-         ir = new IntegrationRule(4);
+-         ir->AddTriMidPoint(0, -0.28125); // -9./32.
+-         ir->AddTriPoints3(1, 0.2, 25./96.);
+-         ir->SetOrder(3);
+-         TriangleIntRules[3] = ir;
+          return ir;
+ 
+-      case 4:  // 6 point - 4 degree
++      case 3:
++      case 4:
+          ir = new IntegrationRule(6);
+-         ir->AddTriPoints3(0, 0.091576213509770743460, 0.054975871827660933819);
+-         ir->AddTriPoints3(3, 0.44594849091596488632, 0.11169079483900573285);
++         ir->AddTriPoints3(0, 4.45948490915964890213e-01, 1.11690794839005735906e-01);
++         ir->AddTriPoints3(3, 9.15762135097707430376e-02, 5.49758718276609353870e-02);
+          ir->SetOrder(4);
+-         TriangleIntRules[4] = ir;
++         TriangleIntRules[3] =
++            TriangleIntRules[4] = ir;
+          return ir;
+ 
+-      case 5:  // 7 point - 5 degree
++      case 5:
+          ir = new IntegrationRule(7);
+          ir->AddTriMidPoint(0, 0.1125);
+-         ir->AddTriPoints3(1, 0.10128650732345633880, 0.062969590272413576298);
+-         ir->AddTriPoints3(4, 0.47014206410511508977, 0.066197076394253090369);
++         ir->AddTriPoints3(1, 1.01286507323456342888e-01, 6.29695902724135697648e-02);
++         ir->AddTriPoints3(4, 4.70142064105115109474e-01, 6.61970763942530959767e-02);
+          ir->SetOrder(5);
+          TriangleIntRules[5] = ir;
+          return ir;
+ 
+-      case 6:  // 12 point - 6 degree
++      case 6:
+          ir = new IntegrationRule(12);
+-         ir->AddTriPoints3(0, 0.063089014491502228340, 0.025422453185103408460);
+-         ir->AddTriPoints3(3, 0.24928674517091042129, 0.058393137863189683013);
+-         ir->AddTriPoints6(6, 0.053145049844816947353, 0.31035245103378440542,
+-                           0.041425537809186787597);
++         ir->AddTriPoints3(0, 6.30890144915022266225e-02, 2.54224531851034094010e-02);
++         ir->AddTriPoints3(3, 2.49286745170910428726e-01, 5.83931378631896841336e-02);
++         ir->AddTriPoints6(6, 6.36502499121398668258e-01, 3.10352451033784393353e-01,
++                           4.14255378091867854096e-02);
+          ir->SetOrder(6);
+          TriangleIntRules[6] = ir;
+          return ir;
+ 
+-      case 7:  // 12 point - degree 7
+-         ir = new IntegrationRule(12);
+-         ir->AddTriPoints3R(0, 0.062382265094402118174, 0.067517867073916085443,
+-                            0.026517028157436251429);
+-         ir->AddTriPoints3R(3, 0.055225456656926611737, 0.32150249385198182267,
+-                            0.043881408714446055037);
+-         // slightly better with explicit 3rd area coordinate
+-         ir->AddTriPoints3R(6, 0.034324302945097146470, 0.66094919618673565761,
+-                            0.30472650086816719592, 0.028775042784981585738);
+-         ir->AddTriPoints3R(9, 0.51584233435359177926, 0.27771616697639178257,
+-                            0.20644149867001643817, 0.067493187009802774463);
++      case 7:
++         ir = new IntegrationRule(15);
++         ir->AddTriPoints3(0, 3.37306485545878498300e-02, 8.27252505539606552976e-03);
++         ir->AddTriPoints3(3, 2.41577382595403566956e-01, 6.39720856150777922311e-02);
++         ir->AddTriPoints3(6, 4.74309692504718327655e-01, 3.85433230929930342734e-02);
++         ir->AddTriPoints6(9, 7.54280040550053154647e-01, 1.98683314797351684433e-01,
++                           2.79393664515998896292e-02);
+          ir->SetOrder(7);
+          TriangleIntRules[7] = ir;
+          return ir;
+ 
+-      case 8:  // 16 point - 8 degree
++      case 8:
+          ir = new IntegrationRule(16);
+-         ir->AddTriMidPoint(0, 0.0721578038388935841255455552445323);
+-         ir->AddTriPoints3(1, 0.170569307751760206622293501491464,
+-                           0.0516086852673591251408957751460645);
+-         ir->AddTriPoints3(4, 0.0505472283170309754584235505965989,
+-                           0.0162292488115990401554629641708902);
+-         ir->AddTriPoints3(7, 0.459292588292723156028815514494169,
+-                           0.0475458171336423123969480521942921);
+-         ir->AddTriPoints6(10, 0.008394777409957605337213834539296,
+-                           0.263112829634638113421785786284643,
+-                           0.0136151570872174971324223450369544);
++         ir->AddTriMidPoint(0, 7.21578038388935860681e-02);
++         ir->AddTriPoints3(1, 4.59292588292723236165e-01, 4.75458171336423096598e-02);
++         ir->AddTriPoints3(4, 1.70569307751760268488e-01, 5.16086852673591223173e-02);
++         ir->AddTriPoints3(7, 5.05472283170309566458e-02, 1.62292488115990396480e-02);
++         ir->AddTriPoints6(10, 7.28492392955404244326e-01, 2.63112829634638112353e-01,
++                           1.36151570872174963733e-02);
+          ir->SetOrder(8);
+          TriangleIntRules[8] = ir;
+          return ir;
+ 
+-      case 9:  // 19 point - 9 degree
++      case 9:
+          ir = new IntegrationRule(19);
+-         ir->AddTriMidPoint(0, 0.0485678981413994169096209912536443);
+-         ir->AddTriPoints3b(1, 0.020634961602524744433,
+-                            0.0156673501135695352684274156436046);
+-         ir->AddTriPoints3b(4, 0.12582081701412672546,
+-                            0.0389137705023871396583696781497019);
+-         ir->AddTriPoints3(7, 0.188203535619032730240961280467335,
+-                           0.0398238694636051265164458871320226);
+-         ir->AddTriPoints3(10, 0.0447295133944527098651065899662763,
+-                           0.0127888378293490156308393992794999);
+-         ir->AddTriPoints6(13, 0.0368384120547362836348175987833851,
+-                           0.2219629891607656956751025276931919,
+-                           0.0216417696886446886446886446886446);
++         ir->AddTriMidPoint(0, 4.85678981413994181882e-02);
++         ir->AddTriPoints3(1, 4.37089591492936690997e-01, 3.89137705023871391385e-02);
++         ir->AddTriPoints3(4, 1.88203535619032802373e-01, 3.98238694636051243636e-02);
++         ir->AddTriPoints3(7, 4.89682519198737620236e-01, 1.56673501135695357467e-02);
++         ir->AddTriPoints3(10, 4.47295133944527467662e-02, 1.27888378293490156262e-02);
++         ir->AddTriPoints6(13, 7.41198598784498008385e-01, 2.21962989160765733487e-01,
++                           2.16417696886446880855e-02);
+          ir->SetOrder(9);
+          TriangleIntRules[9] = ir;
+          return ir;
+ 
+-      case 10:  // 25 point - 10 degree
++      case 10:
+          ir = new IntegrationRule(25);
+-         ir->AddTriMidPoint(0, 0.0454089951913767900476432975500142);
+-         ir->AddTriPoints3b(1, 0.028844733232685245264984935583748,
+-                            0.0183629788782333523585030359456832);
+-         ir->AddTriPoints3(4, 0.109481575485037054795458631340522,
+-                           0.0226605297177639673913028223692986);
+-         ir->AddTriPoints6(7, 0.141707219414879954756683250476361,
+-                           0.307939838764120950165155022930631,
+-                           0.0363789584227100543021575883096803);
+-         ir->AddTriPoints6(13, 0.025003534762686386073988481007746,
+-                           0.246672560639902693917276465411176,
+-                           0.0141636212655287424183685307910495);
+-         ir->AddTriPoints6(19, 0.0095408154002994575801528096228873,
+-                           0.0668032510122002657735402127620247,
+-                           4.71083348186641172996373548344341E-03);
++         ir->AddTriMidPoint(0, 4.08716645731429864541e-02);
++         ir->AddTriPoints3(1, 3.20553732169435168231e-02, 6.67648440657478327992e-03);
++         ir->AddTriPoints3(4, 1.42161101056564431744e-01, 2.29789818023723654838e-02);
++         ir->AddTriPoints6(7, 5.30054118927343997925e-01, 3.21812995288835446139e-01,
++                           3.19524531982120219009e-02);
++         ir->AddTriPoints6(13, 6.01233328683459244957e-01, 3.69146781827810910315e-01,
++                           1.70923240814797143539e-02);
++         ir->AddTriPoints6(19, 8.07930600922879049719e-01, 1.63701733737182442141e-01,
++                           1.26488788536441923438e-02);
+          ir->SetOrder(10);
+          TriangleIntRules[10] = ir;
+          return ir;
+ 
+-      case 11: // 28 point -- 11 degree
++      case 11:
+          ir = new IntegrationRule(28);
+-         ir->AddTriPoints6(0, 0.0,
+-                           0.141129718717363295960826061941652,
+-                           3.68119189165027713212944752369032E-03);
+-         ir->AddTriMidPoint(6, 0.0439886505811161193990465846607278);
+-         ir->AddTriPoints3(7, 0.0259891409282873952600324854988407,
+-                           4.37215577686801152475821439991262E-03);
+-         ir->AddTriPoints3(10, 0.0942875026479224956305697762754049,
+-                           0.0190407859969674687575121697178070);
+-         ir->AddTriPoints3b(13, 0.010726449965572372516734795387128,
+-                            9.42772402806564602923839129555767E-03);
+-         ir->AddTriPoints3(16, 0.207343382614511333452934024112966,
+-                           0.0360798487723697630620149942932315);
+-         ir->AddTriPoints3b(19, 0.122184388599015809877869236727746,
+-                            0.0346645693527679499208828254519072);
+-         ir->AddTriPoints6(22, 0.0448416775891304433090523914688007,
+-                           0.2772206675282791551488214673424523,
+-                           0.0205281577146442833208261574536469);
++         ir->AddTriMidPoint(0, 4.28805898661121093207e-02);
++         ir->AddTriPoints3(1, 2.84854176143718995640e-02, 5.21593525644734826857e-03);
++         ir->AddTriPoints3(4, 2.10219956703178278978e-01, 3.52578420558582877886e-02);
++         ir->AddTriPoints3(7, 1.02635482712246428605e-01, 1.93153796185096607307e-02);
++         ir->AddTriPoints3(10, 4.95891900965890919384e-01, 8.30313652729268436570e-03);
++         ir->AddTriPoints3(13, 4.38465926764352253997e-01, 3.36580770397341480504e-02);
++         ir->AddTriPoints6(16, 8.43349783661853091843e-01, 1.49324788652082374174e-01,
++                           5.14514478647663895533e-03);
++         ir->AddTriPoints6(22, 6.64408374196864159877e-01, 2.89581125637705882880e-01,
++                           2.01662383202502772106e-02);
+          ir->SetOrder(11);
+          TriangleIntRules[11] = ir;
+          return ir;
+ 
+-      case 12: // 33 point - 12 degree
++      case 12:
+          ir = new IntegrationRule(33);
+-         ir->AddTriPoints3b(0, 2.35652204523900E-02, 1.28655332202275E-02);
+-         ir->AddTriPoints3b(3, 1.20551215411079E-01, 2.18462722690190E-02);
+-         ir->AddTriPoints3(6, 2.71210385012116E-01, 3.14291121089425E-02);
+-         ir->AddTriPoints3(9, 1.27576145541586E-01, 1.73980564653545E-02);
+-         ir->AddTriPoints3(12, 2.13173504532100E-02, 3.08313052577950E-03);
+-         ir->AddTriPoints6(15, 1.15343494534698E-01, 2.75713269685514E-01,
+-                           2.01857788831905E-02);
+-         ir->AddTriPoints6(21, 2.28383322222570E-02, 2.81325580989940E-01,
+-                           1.11783866011515E-02);
+-         ir->AddTriPoints6(27, 2.57340505483300E-02, 1.16251915907597E-01,
+-                           8.65811555432950E-03);
++         ir->AddTriPoints3(0, 4.88203750945541581352e-01, 1.21334190407260157640e-02);
++         ir->AddTriPoints3(3, 1.09257827659354322947e-01, 1.42430260344387719235e-02);
++         ir->AddTriPoints3(6, 2.71462507014926135440e-01, 3.12706065979513822550e-02);
++         ir->AddTriPoints3(9, 2.46463634363356387524e-02, 3.96582125498681943576e-03);
++         ir->AddTriPoints3(12, 4.40111648658593201944e-01, 2.49591674640304711508e-02);
++         ir->AddTriPoints6(15, 6.85310163906391878186e-01, 2.91655679738340944951e-01,
++                           1.08917925193037796322e-02);
++         ir->AddTriPoints6(21, 6.28249751683556123538e-01, 2.55454228638517299999e-01,
++                           2.16136818297071042760e-02);
++         ir->AddTriPoints6(27, 8.51337792510240110033e-01, 1.27279717233589384495e-01,
++                           7.54183878825571887144e-03);
+          ir->SetOrder(12);
+          TriangleIntRules[12] = ir;
+          return ir;
+ 
+-      case 13: // 37 point - 13 degree
++      case 13:
+          ir = new IntegrationRule(37);
+-         ir->AddTriPoints3b(0, 0.0,
+-                            2.67845189554543044455908674650066E-03);
+-         ir->AddTriMidPoint(3, 0.0293480398063595158995969648597808);
+-         ir->AddTriPoints3(4, 0.0246071886432302181878499494124643,
+-                           3.92538414805004016372590903990464E-03);
+-         ir->AddTriPoints3b(7, 0.159382493797610632566158925635800,
+-                            0.0253344765879434817105476355306468);
+-         ir->AddTriPoints3(10, 0.227900255506160619646298948153592,
+-                           0.0250401630452545330803738542916538);
+-         ir->AddTriPoints3(13, 0.116213058883517905247155321839271,
+-                           0.0158235572961491595176634480481793);
+-         ir->AddTriPoints3b(16, 0.046794039901841694097491569577008,
+-                            0.0157462815379843978450278590138683);
+-         ir->AddTriPoints6(19, 0.0227978945382486125477207592747430,
+-                           0.1254265183163409177176192369310890,
+-                           7.90126610763037567956187298486575E-03);
+-         ir->AddTriPoints6(25, 0.0162757709910885409437036075960413,
+-                           0.2909269114422506044621801030055257,
+-                           7.99081889046420266145965132482933E-03);
+-         ir->AddTriPoints6(31, 0.0897330604516053590796290561145196,
+-                           0.2723110556841851025078181617634414,
+-                           0.0182757511120486476280967518782978);
++         ir->AddTriMidPoint(0, 3.39800182934158201409e-02);
++         ir->AddTriPoints3(1, 4.89076946452539351728e-01, 1.19972009644473652512e-02);
++         ir->AddTriPoints3(4, 2.21372286291832920391e-01, 2.91392425595999905730e-02);
++         ir->AddTriPoints3(7, 4.26941414259800422482e-01, 2.78009837652266646180e-02);
++         ir->AddTriPoints3(10, 2.15096811088433259584e-02, 3.02616855176958583773e-03);
++         ir->AddTriPoints6(13, 7.48507115899952224503e-01, 1.63597401067850478640e-01,
++                           1.20895199057969096601e-02);
++         ir->AddTriPoints6(19, 8.64707770295442768038e-01, 1.10922042803463405392e-01,
++                           7.48270055258283377231e-03);
++         ir->AddTriPoints6(25, 6.23545995553675513889e-01, 3.08441760892117777804e-01,
++                           1.73206380704241866275e-02);
++         ir->AddTriPoints6(31, 7.22357793124188019007e-01, 2.72515817773429591675e-01,
++                           4.79534050177163155559e-03);
+          ir->SetOrder(13);
+          TriangleIntRules[13] = ir;
+          return ir;
+ 
+-      case 14: // 42 point - 14 degree
++      case 14:
+          ir = new IntegrationRule(42);
+-         ir->AddTriPoints3b(0, 2.20721792756430E-02, 1.09417906847145E-02);
+-         ir->AddTriPoints3b(3, 1.64710561319092E-01, 1.63941767720625E-02);
+-         ir->AddTriPoints3(6, 2.73477528308839E-01, 2.58870522536460E-02);
+-         ir->AddTriPoints3(9, 1.77205532412543E-01, 2.10812943684965E-02);
+-         ir->AddTriPoints3(12, 6.17998830908730E-02, 7.21684983488850E-03);
+-         ir->AddTriPoints3(15, 1.93909612487010E-02, 2.46170180120000E-03);
+-         ir->AddTriPoints6(18, 5.71247574036480E-02, 1.72266687821356E-01,
+-                           1.23328766062820E-02);
+-         ir->AddTriPoints6(24, 9.29162493569720E-02, 3.36861459796345E-01,
+-                           1.92857553935305E-02);
+-         ir->AddTriPoints6(30, 1.46469500556540E-02, 2.98372882136258E-01,
+-                           7.21815405676700E-03);
+-         ir->AddTriPoints6(36, 1.26833093287200E-03, 1.18974497696957E-01,
+-                           2.50511441925050E-03);
++         ir->AddTriPoints3(0, 1.77205532412543442788e-01, 2.10812943684965080349e-02);
++         ir->AddTriPoints3(3, 4.17644719340453940415e-01, 1.63941767720626740967e-02);
++         ir->AddTriPoints3(6, 6.17998830908725871325e-02, 7.21684983488833382143e-03);
++         ir->AddTriPoints3(9, 4.88963910362178677538e-01, 1.09417906847144447147e-02);
++         ir->AddTriPoints3(12, 2.73477528308838646609e-01, 2.58870522536457925433e-02);
++         ir->AddTriPoints3(15, 1.93909612487010996063e-02, 2.46170180120004094063e-03);
++         ir->AddTriPoints6(18, 6.86980167808087793802e-01, 2.98372882136257788765e-01,
++                           7.21815405676692022074e-03);
++         ir->AddTriPoints6(24, 7.70608554774996457049e-01, 1.72266687821355679588e-01,
++                           1.23328766062818367955e-02);
++         ir->AddTriPoints6(30, 5.70222290846683188548e-01, 3.36861459796344964168e-01,
++                           1.92857553935303419057e-02);
++         ir->AddTriPoints6(36, 8.79757171370171064950e-01, 1.18974497696956893478e-01,
++                           2.50511441925033596229e-03);
+          ir->SetOrder(14);
+          TriangleIntRules[14] = ir;
+          return ir;
+ 
+-      case 15: // 54 point - 15 degree
+-         ir = new IntegrationRule(54);
+-         ir->AddTriPoints3b(0, 0.0834384072617499333, 0.016330909424402645);
+-         ir->AddTriPoints3b(3, 0.192779070841738867, 0.01370640901568218);
+-         ir->AddTriPoints3(6, 0.293197167913025367, 0.01325501829935165);
+-         ir->AddTriPoints3(9, 0.146467786942772933, 0.014607981068243055);
+-         ir->AddTriPoints3(12, 0.0563628676656034333, 0.005292304033121995);
+-         ir->AddTriPoints3(15, 0.0165751268583703333, 0.0018073215320460175);
+-         ir->AddTriPoints6(18, 0.0099122033092248, 0.239534554154794445,
+-                           0.004263874050854718);
+-         ir->AddTriPoints6(24, 0.015803770630228, 0.404878807318339958,
+-                           0.006958088258345965);
+-         ir->AddTriPoints6(30, 0.00514360881697066667, 0.0950021131130448885,
+-                           0.0021459664703674175);
+-         ir->AddTriPoints6(36, 0.0489223257529888, 0.149753107322273969,
+-                           0.008117664640887445);
+-         ir->AddTriPoints6(42, 0.0687687486325192, 0.286919612441334979,
+-                           0.012803670460631195);
+-         ir->AddTriPoints6(48, 0.1684044181246992, 0.281835668099084562,
+-                           0.016544097765822835);
++      case 15:
++         ir = new IntegrationRule(49);
++         ir->AddTriMidPoint(0, 2.21676936910920364954e-02);
++         ir->AddTriPoints3(1, 4.05362214133975495844e-01, 2.13568907857302828224e-02);
++         ir->AddTriPoints3(4, 7.01735528999860580512e-02, 8.22236878131258133728e-03);
++         ir->AddTriPoints3(7, 4.74170681438019769871e-01, 8.69807400038170690226e-03);
++         ir->AddTriPoints3(10, 2.26378713420349653163e-01, 2.33916808643548149171e-02);
++         ir->AddTriPoints3(13, 4.94996956769126195130e-01, 4.78692309123004283711e-03);
++         ir->AddTriPoints3(16, 1.58117262509887002153e-02, 1.48038731895268772104e-03);
++         ir->AddTriPoints6(19, 6.66975644801868106093e-01, 3.14648242812450851247e-01,
++                           7.80128641528798211224e-03);
++         ir->AddTriPoints6(25, 9.19912157726236134891e-01, 7.09486052364554087291e-02,
++                           2.01492668600904969653e-03);
++         ir->AddTriPoints6(31, 7.15222356931450642392e-01, 1.90535589476393929509e-01,
++                           1.43602934626006709801e-02);
++         ir->AddTriPoints6(37, 8.13292641049419229304e-01, 1.68068645222414381202e-01,
++                           5.83631059078792285844e-03);
++         ir->AddTriPoints6(43, 5.65252664877114230357e-01, 3.38950611475277163720e-01,
++                           1.56577381424846430458e-02);
+          ir->SetOrder(15);
+          TriangleIntRules[15] = ir;
+          return ir;
+ 
+-      case 16:  // 61 point - 17 degree (used for 16 as well)
++      case 16:
++         ir = new IntegrationRule(55);
++         ir->AddTriMidPoint(0, 2.26322830369093952463e-02);
++         ir->AddTriPoints3(1, 2.45990070467141719313e-01, 2.05464615718494759966e-02);
++         ir->AddTriPoints3(4, 4.15584896885420551627e-01, 2.03559166562126796218e-02);
++         ir->AddTriPoints3(7, 8.53555665867003487968e-02, 7.39081734511220188322e-03);
++         ir->AddTriPoints3(10, 1.61918644191271221544e-01, 1.47092048494940497855e-02);
++         ir->AddTriPoints3(13, 5.00000000000000000000e-01, 2.20927315607528452004e-03);
++         ir->AddTriPoints3(16, 4.75280727545942083268e-01, 1.29871666491385793357e-02);
++         ir->AddTriPoints6(19, 7.54170061444767725334e-01, 1.91074763640529221576e-01,
++                           9.46913623220784969603e-03);
++         ir->AddTriPoints6(25, 9.68244368030958701965e-01, 2.32034277688137335893e-02,
++                           8.27233357417524097638e-04);
++         ir->AddTriPoints6(31, 6.49303698245446425652e-01, 3.31764523474147643434e-01,
++                           7.50430089214290316213e-03);
++         ir->AddTriPoints6(37, 9.00273703270429548340e-01, 8.06961669858730079596e-02,
++                           3.97379696669624901673e-03);
++         ir->AddTriPoints6(43, 5.89148840564247877616e-01, 3.08244969196354023921e-01,
++                           1.59918050396850343342e-02);
++         ir->AddTriPoints6(49, 8.06621867499395683865e-01, 1.87441782483782071189e-01,
++                           2.69559355842440570919e-03);
++         ir->SetOrder(16);
++         TriangleIntRules[16] = ir;
++         return ir;
++
+       case 17:
+-         ir = new IntegrationRule(61);
+-         ir->AddTriMidPoint(0,  1.67185996454015E-02);
+-         ir->AddTriPoints3b(1,  5.65891888645200E-03, 2.54670772025350E-03);
+-         ir->AddTriPoints3b(4,  3.56473547507510E-02, 7.33543226381900E-03);
+-         ir->AddTriPoints3b(7,  9.95200619584370E-02, 1.21754391768360E-02);
+-         ir->AddTriPoints3b(10, 1.99467521245206E-01, 1.55537754344845E-02);
+-         ir->AddTriPoints3 (13, 2.52141267970953E-01, 1.56285556093100E-02);
+-         ir->AddTriPoints3 (16, 1.62047004658461E-01, 1.24078271698325E-02);
+-         ir->AddTriPoints3 (19, 7.58758822607460E-02, 7.02803653527850E-03);
+-         ir->AddTriPoints3 (22, 1.56547269678220E-02, 1.59733808688950E-03);
+-         ir->AddTriPoints6 (25, 1.01869288269190E-02, 3.34319867363658E-01,
+-                            4.05982765949650E-03);
+-         ir->AddTriPoints6 (31, 1.35440871671036E-01, 2.92221537796944E-01,
+-                            1.34028711415815E-02);
+-         ir->AddTriPoints6 (37, 5.44239242905830E-02, 3.19574885423190E-01,
+-                            9.22999660541100E-03);
+-         ir->AddTriPoints6 (43, 1.28685608336370E-02, 1.90704224192292E-01,
+-                            4.23843426716400E-03);
+-         ir->AddTriPoints6 (49, 6.71657824135240E-02, 1.80483211648746E-01,
+-                            9.14639838501250E-03);
+-         ir->AddTriPoints6 (55, 1.46631822248280E-02, 8.07113136795640E-02,
+-                            3.33281600208250E-03);
++         ir = new IntegrationRule(60);
++         ir->AddTriPoints3(0, 4.17103444361599295931e-01, 1.36554632640510532210e-02);
++         ir->AddTriPoints3(3, 1.47554916607539610141e-02, 1.38694378881882109979e-03);
++         ir->AddTriPoints3(6, 4.65597871618890324363e-01, 1.25097254752486782697e-02);
++         ir->AddTriPoints3(9, 1.80358116266370605008e-01, 1.31563152940089925225e-02);
++         ir->AddTriPoints3(12, 6.66540634795969033632e-02, 6.22950040115272107855e-03);
++         ir->AddTriPoints3(15, 2.85706502436586629035e-01, 1.88581185763976415248e-02);
++         ir->AddTriPoints6(18, 8.24790070165088096132e-01, 1.59192287472792681768e-01,
++                           3.98915010296479674579e-03);
++         ir->AddTriPoints6(24, 6.26369030386452196879e-01, 3.06281591746186521164e-01,
++                           1.12438862733455335191e-02);
++         ir->AddTriPoints6(30, 5.71294867944684092720e-01, 4.15475459295228999324e-01,
++                           5.19921997791976831654e-03);
++         ir->AddTriPoints6(36, 7.53235145936458128091e-01, 1.68722513495259462957e-01,
++                           1.02789491602272593102e-02);
++         ir->AddTriPoints6(42, 7.15072259110642427515e-01, 2.71791870055354878311e-01,
++                           4.34610725050059605590e-03);
++         ir->AddTriPoints6(48, 9.15919353297816929427e-01, 7.25054707990024915887e-02,
++                           2.29217420086793351869e-03);
++         ir->AddTriPoints6(54, 5.43275579596159796658e-01, 2.99218942476970228839e-01,
++                           1.30858129676684944304e-02);
+          ir->SetOrder(17);
+-         TriangleIntRules[16] = TriangleIntRules[17] = ir;
++         TriangleIntRules[17] = ir;
++         return ir;
++
++      case 18:
++         ir = new IntegrationRule(67);
++         ir->AddTriMidPoint(0, 1.81778676507133342410e-02);
++         ir->AddTriPoints3(1, 3.99955628067576229867e-01, 1.66522350166950668104e-02);
++         ir->AddTriPoints3(4, 4.87580301574869645620e-01, 6.02332381699985548729e-03);
++         ir->AddTriPoints3(7, 4.61809506406449243876e-01, 9.47458575338943308208e-03);
++         ir->AddTriPoints3(10, 2.42264702514271956790e-01, 1.82375447044718190515e-02);
++         ir->AddTriPoints3(13, 3.88302560886856218403e-02, 3.56466300985948522304e-03);
++         ir->AddTriPoints3(16, 9.19477421216432500017e-02, 8.27957997600162372287e-03);
++         ir->AddTriPoints6(19, 7.70372376214675247397e-01, 1.83822707925463957324e-01,
++                           6.87980811747110256732e-03);
++         ir->AddTriPoints6(25, 6.70953985194234547862e-01, 2.06349257433837918185e-01,
++                           1.18909554500764153007e-02);
++         ir->AddTriPoints6(31, 6.00418954634256873959e-01, 3.95683434332269712286e-01,
++                           2.26526725112853252742e-03);
++         ir->AddTriPoints6(37, 8.78342189467521738955e-01, 1.08195793791033278985e-01,
++                           3.42005505980359086893e-03);
++         ir->AddTriPoints6(43, 6.39988092004714625993e-01, 3.19751624525377309283e-01,
++                           8.87374455101020212511e-03);
++         ir->AddTriPoints6(49, 7.58929479855198430016e-01, 2.35772184958191743931e-01,
++                           2.50533043728986106261e-03);
++         ir->AddTriPoints6(55, 9.72360728962795684005e-01, 2.70909109951620319379e-02,
++                           6.11474063480544911126e-04);
++         ir->AddTriPoints6(61, 5.45918775386194599086e-01, 3.33493529449880754534e-01,
++                           1.27410876559122202695e-02);
++         ir->SetOrder(18);
++         TriangleIntRules[18] = ir;
+          return ir;
+ 
+-      case 18: // 73 point - 19 degree (used for 18 as well)
+       case 19:
+          ir = new IntegrationRule(73);
+-         ir->AddTriMidPoint(0,  0.0164531656944595);
+-         ir->AddTriPoints3b(1,  0.020780025853987, 0.005165365945636);
+-         ir->AddTriPoints3b(4,  0.090926214604215, 0.011193623631508);
+-         ir->AddTriPoints3b(7,  0.197166638701138, 0.015133062934734);
+-         ir->AddTriPoints3 (10, 0.255551654403098, 0.015245483901099);
+-         ir->AddTriPoints3 (13, 0.17707794215213,  0.0120796063708205);
+-         ir->AddTriPoints3 (16, 0.110061053227952, 0.0080254017934005);
+-         ir->AddTriPoints3 (19, 0.05552862425184,  0.004042290130892);
+-         ir->AddTriPoints3 (22, 0.012621863777229, 0.0010396810137425);
+-         ir->AddTriPoints6 (25, 0.003611417848412, 0.395754787356943,
+-                            0.0019424384524905);
+-         ir->AddTriPoints6 (31, 0.13446675453078, 0.307929983880436,
+-                            0.012787080306011);
+-         ir->AddTriPoints6 (37, 0.014446025776115, 0.26456694840652,
+-                            0.004440451786669);
+-         ir->AddTriPoints6 (43, 0.046933578838178, 0.358539352205951,
+-                            0.0080622733808655);
+-         ir->AddTriPoints6 (49, 0.002861120350567, 0.157807405968595,
+-                            0.0012459709087455);
+-         ir->AddTriPoints6 (55, 0.075050596975911, 0.223861424097916,
+-                            0.0091214200594755);
+-         ir->AddTriPoints6 (61, 0.03464707481676, 0.142421601113383,
+-                            0.0051292818680995);
+-         ir->AddTriPoints6 (67, 0.065494628082938, 0.010161119296278,
+-                            0.001899964427651);
++         ir->AddTriMidPoint(0, 1.72346988520061666916e-02);
++         ir->AddTriPoints3(1, 5.25238903512089683190e-02, 3.55462829889906543543e-03);
++         ir->AddTriPoints3(4, 4.92512675041336889237e-01, 5.16087757147214078873e-03);
++         ir->AddTriPoints3(7, 1.11448873323021391268e-01, 7.61717554650914990128e-03);
++         ir->AddTriPoints3(10, 4.59194201039543670184e-01, 1.14917950133708035576e-02);
++         ir->AddTriPoints3(13, 4.03969722551901222474e-01, 1.57687674465774863020e-02);
++         ir->AddTriPoints3(16, 1.78170104781764315760e-01, 1.23259574240954274116e-02);
++         ir->AddTriPoints3(19, 1.16394611837894457196e-02, 8.82661388221423837477e-04);
++         ir->AddTriPoints3(22, 2.55161632913607716588e-01, 1.58765096830015377261e-02);
++         ir->AddTriPoints6(25, 8.30156464400275351245e-01, 1.30697676268032414448e-01,
++                           4.84774224342752330097e-03);
++         ir->AddTriPoints6(31, 5.59369805720300927732e-01, 3.11317629809541251973e-01,
++                           1.31731609886953666272e-02);
++         ir->AddTriPoints6(37, 6.33313293128784149388e-01, 3.64617780974611060962e-01,
++                           1.64103827591790965915e-03);
++         ir->AddTriPoints6(43, 7.04004819966042139079e-01, 2.21434885432331141075e-01,
++                           9.05397246560622585843e-03);
++         ir->AddTriPoints6(49, 8.52566954376889230005e-01, 1.42425757365756355810e-01,
++                           1.46315755173510018451e-03);
++         ir->AddTriPoints6(55, 6.05083979068707922266e-01, 3.54028009735275261960e-01,
++                           8.05108138201205379703e-03);
++         ir->AddTriPoints6(61, 7.43181368957436361278e-01, 2.41894578960579587079e-01,
++                           4.22794374976824798712e-03);
++         ir->AddTriPoints6(67, 9.30137698876805085746e-01, 6.00862753223067036501e-02,
++                           1.66360068142969402642e-03);
+          ir->SetOrder(19);
+-         TriangleIntRules[18] = TriangleIntRules[19] = ir;
++         TriangleIntRules[19] = ir;
+          return ir;
+ 
+-      case 20: // 85 point - 20 degree
+-         ir = new IntegrationRule(85);
+-         ir->AddTriMidPoint(0, 0.01380521349884976);
+-         ir->AddTriPoints3b(1, 0.001500649324429,     0.00088951477366337);
+-         ir->AddTriPoints3b(4, 0.0941397519389508667, 0.010056199056980585);
+-         ir->AddTriPoints3b(7, 0.2044721240895264,    0.013408923629665785);
+-         ir->AddTriPoints3(10, 0.264500202532787333,  0.012261566900751005);
+-         ir->AddTriPoints3(13, 0.211018964092076767,  0.008197289205347695);
+-         ir->AddTriPoints3(16, 0.107735607171271333,  0.0073979536993248);
+-         ir->AddTriPoints3(19, 0.0390690878378026667, 0.0022896411388521255);
+-         ir->AddTriPoints3(22, 0.0111743797293296333, 0.0008259132577881085);
+-         ir->AddTriPoints6(25, 0.00534961818733726667, 0.0635496659083522206,
+-                           0.001174585454287792);
+-         ir->AddTriPoints6(31, 0.00795481706619893333, 0.157106918940706982,
+-                           0.0022329628770908965);
+-         ir->AddTriPoints6(37, 0.0104223982812638,     0.395642114364374018,
+-                           0.003049783403953986);
+-         ir->AddTriPoints6(43, 0.0109644147961233333,  0.273167570712910522,
+-                           0.0034455406635941015);
+-         ir->AddTriPoints6(49, 0.0385667120854623333,  0.101785382485017108,
+-                           0.0039987375362390815);
+-         ir->AddTriPoints6(55, 0.0355805078172182,     0.446658549176413815,
+-                           0.003693067142668012);
+-         ir->AddTriPoints6(61, 0.0496708163627641333,  0.199010794149503095,
+-                           0.00639966593932413);
+-         ir->AddTriPoints6(67, 0.0585197250843317333,  0.3242611836922827,
+-                           0.008629035587848275);
+-         ir->AddTriPoints6(73, 0.121497787004394267,   0.208531363210132855,
+-                           0.009336472951467735);
+-         ir->AddTriPoints6(79, 0.140710844943938733,   0.323170566536257485,
+-                           0.01140911202919763);
++      case 20:
++         ir = new IntegrationRule(79);
++         ir->AddTriMidPoint(0, 1.39101107014531159140e-02);
++         ir->AddTriPoints3(1, 2.54579267673339160183e-01, 1.40832013075202471669e-02);
++         ir->AddTriPoints3(4, 1.09761410283977789426e-02, 7.98840791066619858654e-04);
++         ir->AddTriPoints3(7, 1.09383596711714603522e-01, 7.83023077607453328597e-03);
++         ir->AddTriPoints3(10, 1.86294997744540946627e-01, 9.17346297425291473671e-03);
++         ir->AddTriPoints3(13, 4.45551056955924895675e-01, 9.45239993323244813428e-03);
++         ir->AddTriPoints3(16, 3.73108805988847103130e-02, 2.16127541066557732688e-03);
++         ir->AddTriPoints3(19, 3.93425347817099924086e-01, 1.37880506290704585304e-02);
++         ir->AddTriPoints3(22, 4.76245611540499047543e-01, 7.10182530340844071076e-03);
++         ir->AddTriPoints6(25, 8.33295511838236246938e-01, 1.59133707657067247077e-01,
++                           2.20289741855849742491e-03);
++         ir->AddTriPoints6(31, 7.54921502863547422280e-01, 1.98518132228788335425e-01,
++                           5.98639857895469015836e-03);
++         ir->AddTriPoints6(37, 9.31054476783942153162e-01, 6.40905856084340586065e-02,
++                           1.12986960212586558597e-03);
++         ir->AddTriPoints6(43, 6.11877703547425655373e-01, 3.33134817309587605294e-01,
++                           8.66722556721933289070e-03);
++         ir->AddTriPoints6(49, 8.61684018936486717521e-01, 9.99522962881386756173e-02,
++                           4.14571152761385782609e-03);
++         ir->AddTriPoints6(55, 6.78165737889635522606e-01, 2.15607057390094447591e-01,
++                           7.72260782209923009323e-03);
++         ir->AddTriPoints6(61, 5.70144692890973359134e-01, 4.20023758816224113133e-01,
++                           3.69568150025529782929e-03);
++         ir->AddTriPoints6(67, 5.42331804172428100230e-01, 3.17860123835772001577e-01,
++                           1.16917457318277372147e-02);
++         ir->AddTriPoints6(73, 7.08681375720323636358e-01, 2.80581411423665327831e-01,
++                           3.57820023845768515197e-03);
+          ir->SetOrder(20);
+          TriangleIntRules[20] = ir;
+          return ir;
+ 
+-      case 21: // 126 point - 25 degree (used also for degrees from 21 to 24)
++      case 21:
+       case 22:
+       case 23:
+       case 24:
+@@ -1644,12 +1662,13 @@ IntegrationRule *IntegrationRules::TriangleIntegrationRule(int Order)
+          return ir;
+ 
+       default:
+-         // Grundmann-Moller rules
+-         int i = (Order / 2) * 2 + 1;   // Get closest odd # >= Order
++         // Grundmann-Moller fallback for orders beyond tabulated rules
++         int i = (Order / 2) * 2 + 1;   // closest odd >= Order
+          AllocIntRule(TriangleIntRules, i);
+          ir = new IntegrationRule;
+-         ir->GrundmannMollerSimplexRule(i/2,2);
+-         TriangleIntRules[i-1] = TriangleIntRules[i] = ir;
++         ir->GrundmannMollerSimplexRule(i/2, 2);
++         if (!TriangleIntRules[i-1]) { TriangleIntRules[i-1] = ir; }
++         TriangleIntRules[i] = ir;
+          return ir;
+    }
+ }
+@@ -1671,119 +1690,584 @@ IntegrationRule *IntegrationRules::SquareIntegrationRule(int Order)
+    return SquareIntRules[Order];
+ }
+ 
+-/** Integration rules for reference tetrahedron
+-    {[0,0,0],[1,0,0],[0,1,0],[0,0,1]}          */
++// Tet rules d=0-13 from Witherden & Vincent [1], orbit data from PyFR, CC-BY 4.0.
++// Tet rules d=14-20 from Chuluunbaatar et al. [2], supplementary data.
+ IntegrationRule *IntegrationRules::TetrahedronIntegrationRule(int Order)
+ {
+-   IntegrationRule *ir;
+-   // Note: Set TetrahedronIntRules[*] to ir only *after* ir is fully
+-   // constructed. This is needed in multithreaded environment.
++   IntegrationRule *ir = NULL;
+ 
+-   // assuming that orders <= 9 are pre-allocated
+    switch (Order)
+    {
+-      case 0:  // 1 point - degree 1
++      case 0:
+       case 1:
+          ir = new IntegrationRule(1);
+          ir->AddTetMidPoint(0, 1./6.);
+          ir->SetOrder(1);
+-         TetrahedronIntRules[0] = TetrahedronIntRules[1] = ir;
++         TetrahedronIntRules[0] =
++            TetrahedronIntRules[1] = ir;
+          return ir;
+ 
+-      case 2:  // 4 points - degree 2
++      case 2:
+          ir = new IntegrationRule(4);
+-         // ir->AddTetPoints4(0, 0.13819660112501051518, 1./24.);
+-         ir->AddTetPoints4b(0, 0.58541019662496845446, 1./24.);
++         ir->AddTetPoints4(0, 1.38196601125010531952e-01, 1./24.);
+          ir->SetOrder(2);
+          TetrahedronIntRules[2] = ir;
+          return ir;
+ 
+-      case 3:  // 5 points - degree 3 (negative weight)
+-         ir = new IntegrationRule(5);
+-         ir->AddTetMidPoint(0, -2./15.);
+-         ir->AddTetPoints4b(1, 0.5, 0.075);
++      case 3:
++         ir = new IntegrationRule(8);
++         ir->AddTetPoints4(0, 3.28163302516381705232e-01, 2.27029737561812265667e-02);
++         ir->AddTetPoints4(4, 1.08047249898428621151e-01, 1.89636929104854412564e-02);
+          ir->SetOrder(3);
+          TetrahedronIntRules[3] = ir;
+          return ir;
+ 
+-      case 4:  // 11 points - degree 4 (negative weight)
+-         ir = new IntegrationRule(11);
+-         ir->AddTetPoints4(0, 1./14., 343./45000.);
+-         ir->AddTetMidPoint(4, -74./5625.);
+-         ir->AddTetPoints6(5, 0.10059642383320079500, 28./1125.);
+-         ir->SetOrder(4);
+-         TetrahedronIntRules[4] = ir;
+-         return ir;
+-
+-      case 5:  // 14 points - degree 5
++      case 4:
++      case 5:
+          ir = new IntegrationRule(14);
+-         ir->AddTetPoints6(0, 0.045503704125649649492,
+-                           7.0910034628469110730E-03);
+-         ir->AddTetPoints4(6, 0.092735250310891226402, 0.012248840519393658257);
+-         ir->AddTetPoints4b(10, 0.067342242210098170608,
+-                            0.018781320953002641800);
++         ir->AddTetPoints4(0, 3.10885919263300669613e-01, 1.87813209530026427319e-02);
++         ir->AddTetPoints4(4, 9.27352503108912484819e-02, 1.22488405193936587129e-02);
++         ir->AddTetPoints6(8, 4.54496295874350364485e-01, 7.09100346284691120807e-03);
+          ir->SetOrder(5);
+-         TetrahedronIntRules[5] = ir;
++         TetrahedronIntRules[4] =
++            TetrahedronIntRules[5] = ir;
+          return ir;
+ 
+-      case 6:  // 24 points - degree 6
++      case 6:
+          ir = new IntegrationRule(24);
+-         ir->AddTetPoints4(0, 0.21460287125915202929,
+-                           6.6537917096945820166E-03);
+-         ir->AddTetPoints4(4, 0.040673958534611353116,
+-                           1.6795351758867738247E-03);
+-         ir->AddTetPoints4b(8, 0.032986329573173468968,
+-                            9.2261969239424536825E-03);
+-         ir->AddTetPoints12(12, 0.063661001875017525299, 0.26967233145831580803,
+-                            8.0357142857142857143E-03);
++         ir->AddTetPoints4(0, 4.06739585346113652342e-02, 1.67953517588677390775e-03);
++         ir->AddTetPoints4(4, 3.22337890142275540484e-01, 9.22619692394245453915e-03);
++         ir->AddTetPoints4(8, 2.14602871259152117034e-01, 6.65379170969458179352e-03);
++         ir->AddTetPoints12(12, 6.36610018750174977420e-02, 6.03005664791649187428e-01,
++                            8.03571428571428492127e-03);
+          ir->SetOrder(6);
+          TetrahedronIntRules[6] = ir;
+          return ir;
+ 
+-      case 7:  // 31 points - degree 7 (negative weight)
+-         ir = new IntegrationRule(31);
+-         ir->AddTetPoints6(0, 0.0, 9.7001763668430335097E-04);
+-         ir->AddTetMidPoint(6, 0.018264223466108820291);
+-         ir->AddTetPoints4(7, 0.078213192330318064374, 0.010599941524413686916);
+-         ir->AddTetPoints4(11, 0.12184321666390517465,
+-                           -0.062517740114331851691);
+-         ir->AddTetPoints4b(15, 2.3825066607381275412E-03,
+-                            4.8914252630734993858E-03);
+-         ir->AddTetPoints12(19, 0.1, 0.2, 0.027557319223985890653);
++      case 7:
++         ir = new IntegrationRule(35);
++         ir->AddTetMidPoint(0, 1.59142149106884754628e-02);
++         ir->AddTetPoints4(1, 3.15701149778202794227e-01, 7.05493020166117132397e-03);
++         ir->AddTetPoints6(5, 4.49510177401603649994e-01, 5.31615463880959638471e-03);
++         ir->AddTetPoints12(11, 1.88833831026001153219e-01, 5.75171637586999962011e-01,
++                            6.20118845472243662709e-03);
++         ir->AddTetPoints12(23, 2.12654725414832546093e-02, 8.10830241098548620826e-01,
++                            1.35179513831722359664e-03);
+          ir->SetOrder(7);
+          TetrahedronIntRules[7] = ir;
+          return ir;
+ 
+-      case 8:  // 43 points - degree 8 (negative weight)
+-         ir = new IntegrationRule(43);
+-         ir->AddTetPoints4(0, 5.7819505051979972532E-03,
+-                           1.6983410909288737984E-04);
+-         ir->AddTetPoints4(4, 0.082103588310546723091,
+-                           1.9670333131339009876E-03);
+-         ir->AddTetPoints12(8, 0.036607749553197423679, 0.19048604193463345570,
+-                            2.1405191411620925965E-03);
+-         ir->AddTetPoints6(20, 0.050532740018894224426,
+-                           4.5796838244672818007E-03);
+-         ir->AddTetPoints12(26, 0.22906653611681113960, 0.035639582788534043717,
+-                            5.7044858086819185068E-03);
+-         ir->AddTetPoints4(38, 0.20682993161067320408, 0.014250305822866901248);
+-         ir->AddTetMidPoint(42, -0.020500188658639915841);
++      case 8:
++         ir = new IntegrationRule(46);
++         ir->AddTetPoints4(0, 1.07952724962210866444e-01, 4.40444181806813866292e-03);
++         ir->AddTetPoints4(4, 1.85109487782586568105e-01, 8.67195792728975463348e-03);
++         ir->AddTetPoints4(8, 4.23165436847673381848e-02, 1.25420935892336655841e-03);
++         ir->AddTetPoints4(12, 3.14181709124039088010e-01, 6.96063047615581593358e-03);
++         ir->AddTetPoints6(16, 4.35591328583830206256e-01, 6.04682171021813687217e-03);
++         ir->AddTetPoints12(22, 2.14339301271305737728e-02, 7.17464063426308307214e-01,
++                            1.19281714847407210867e-03);
++         ir->AddTetPoints12(34, 2.04139333876029116510e-01, 5.83797378302144398532e-01,
++                            2.57558102516005586052e-03);
+          ir->SetOrder(8);
+          TetrahedronIntRules[8] = ir;
+          return ir;
+ 
+-      case 9: // orders 9 and higher -- Grundmann-Moller rules
+-         ir = new IntegrationRule;
+-         ir->GrundmannMollerSimplexRule(4,3);
++      case 9:
++         ir = new IntegrationRule(59);
++         ir->AddTetMidPoint(0, 9.66842481874670943431e-03);
++         ir->AddTetPoints4(1, 6.19817086544571793638e-10, 1.07198802932093984424e-05);
++         ir->AddTetPoints4(5, 1.60774535395261597426e-01, 3.86222307707090968185e-03);
++         ir->AddTetPoints4(9, 3.22276521821420969260e-01, 4.92715205590488116577e-03);
++         ir->AddTetPoints4(13, 4.51089183454135844720e-02, 1.34399666326936377374e-03);
++         ir->AddTetPoints6(17, 3.87703453995623947836e-01, 6.35568001728374458448e-03);
++         ir->AddTetPoints12(23, 4.58871448752459276665e-01, 7.97025232620401369310e-02,
++                            1.39740369971642539558e-03);
++         ir->AddTetPoints12(35, 3.37758706853386048152e-02, 7.18350326442074527122e-01,
++                            1.70575989212422133613e-03);
++         ir->AddTetPoints12(47, 1.83641369809927956780e-01, 5.98301349801968918030e-01,
++                            3.42081932799802312939e-03);
++         ir->SetOrder(9);
+          TetrahedronIntRules[9] = ir;
+          return ir;
+ 
+-      default: // Grundmann-Moller rules
+-         int i = (Order / 2) * 2 + 1;   // Get closest odd # >= Order
++      case 10:
++         ir = new IntegrationRule(81);
++         ir->AddTetMidPoint(0, 7.89996225933678984654e-03);
++         ir->AddTetPoints4(1, 3.12250068695188676138e-01, 4.48950999871145037950e-03);
++         ir->AddTetPoints4(5, 1.14309653857346149586e-01, 1.64485995279889710662e-03);
++         ir->AddTetPoints12(9, 4.10430739218965501269e-01, 1.65486025619611065718e-01,
++                            1.89898020336587186781e-03);
++         ir->AddTetPoints12(21, 6.13800882479076381770e-03, 9.42988767345204870196e-01,
++                            6.03240573898756009806e-05);
++         ir->AddTetPoints12(33, 1.21050181145589408338e-01, 4.77190379904280370660e-01,
++                            4.28995533007601147213e-03);
++         ir->AddTetPoints12(45, 3.27794682164426753879e-02, 5.94256269480006982242e-01,
++                            1.68931194662596552945e-03);
++         ir->AddTetPoints12(57, 3.24852815648231096901e-02, 8.01177284658344368573e-01,
++                            1.09602454617265063913e-03);
++         ir->AddTetPoints12(69, 1.74979342183939068356e-01, 6.28071845475365986289e-01,
++                            2.15117263314366490706e-03);
++         ir->SetOrder(10);
++         TetrahedronIntRules[10] = ir;
++         return ir;
++
++      case 11:
++         ir = new IntegrationRule(96);
++         ir->AddTetPoints4(0, 2.71527207067321363354e-02, 3.30755017786941475644e-04);
++         ir->AddTetPoints4(4, 7.29513610462571016058e-02, 1.27724462275054500421e-03);
++         ir->AddTetPoints4(8, 1.16306248902001030388e-01, 2.22195840281977797376e-03);
++         ir->AddTetPoints4(12, 1.79873804986097840519e-01, 3.55549424791121128006e-03);
++         ir->AddTetPoints4(16, 2.90224794862315171873e-01, 4.27767411104971236741e-03);
++         ir->AddTetPoints4(20, 3.25420936748619160639e-01, 2.29560465583227143668e-03);
++         ir->AddTetPoints6(24, 4.99998725049884129579e-01, 1.95152894059845476377e-04);
++         ir->AddTetPoints6(30, 3.94300142842090972639e-01, 4.13762713030314983886e-03);
++         ir->AddTetPoints12(36, 1.53994139264412854828e-02, 8.20202176629804657892e-01,
++                            3.44113207868302869216e-04);
++         ir->AddTetPoints12(48, 4.36843254717693696421e-02, 6.27516751622257062948e-01,
++                            2.00540889524405963051e-03);
++         ir->AddTetPoints12(60, 1.32316796082697751835e-01, 7.35366407834604496330e-01,
++                            5.92160675031106853265e-04);
++         ir->AddTetPoints12(72, 2.14430354900043917965e-01, 5.31595425719235903372e-01,
++                            3.02373698028425954079e-03);
++         ir->AddTetPoints12(84, 4.39586615093850330283e-01, 1.15789732843376125260e-01,
++                            1.10416876556284249307e-03);
++         ir->SetOrder(11);
++         TetrahedronIntRules[11] = ir;
++         return ir;
++
++      case 12:
++         ir = new IntegrationRule(123);
++         ir->AddTetMidPoint(0, 3.73841522662751247000e-03);
++         ir->AddTetPoints4(1, 1.87550512633127830497e-02, 1.42695998696545141987e-04);
++         ir->AddTetPoints4(5, 1.08129536920462676619e-01, 2.21101382522646480733e-03);
++         ir->AddTetPoints4(9, 2.00131676822545012673e-01, 1.02716311841773611131e-03);
++         ir->AddTetPoints4(13, 3.00854293538076578152e-01, 3.84627572131096594557e-03);
++         ir->AddTetPoints4(17, 3.33333333333333259318e-01, 3.93263480259983290444e-04);
++         ir->AddTetPoints6(21, 4.61659950214442116323e-01, 3.64372815936188636666e-04);
++         ir->AddTetPoints12(27, 1.44707549187619299857e-02, 8.12825119403836504617e-01,
++                            2.78228183082975693598e-04);
++         ir->AddTetPoints12(39, 1.93543398987769954545e-02, 6.01428742996530152354e-01,
++                            5.44744358748572190913e-04);
++         ir->AddTetPoints12(51, 7.79277628532308863640e-02, 8.27642519452021385717e-01,
++                            4.96603607085240776955e-04);
++         ir->AddTetPoints12(63, 1.22055870746741623734e-01, 4.77806520042316273944e-01,
++                            3.50252598711278933380e-03);
++         ir->AddTetPoints12(75, 2.47870739372197945727e-01, 4.77761799116294072487e-01,
++                            1.92951947508030146987e-03);
++         ir->AddTetPoints12(87, 4.29731509588804683197e-01, 1.17747435901101149547e-01,
++                            1.59458000732484511328e-03);
++         ir->AddTetPoints24(99, 6.53037808968305988344e-01, 2.26776739658831050228e-01,
++                            9.77349816032284102185e-02, 1.25441443948160597475e-03);
++         ir->SetOrder(12);
++         TetrahedronIntRules[12] = ir;
++         return ir;
++
++      case 13:
++         ir = new IntegrationRule(145);
++         ir->AddTetMidPoint(0, 4.65163625751287973520e-03);
++         ir->AddTetPoints4(1, 1.83047574861928130652e-02, 1.11328544338301012079e-04);
++         ir->AddTetPoints4(5, 1.79015082630022803745e-01, 3.21788310144650391634e-03);
++         ir->AddTetPoints4(9, 3.29615853754448240309e-01, 1.11613482625956810662e-03);
++         ir->AddTetPoints6(13, 4.84258919196047465938e-01, 4.22777636660235864308e-04);
++         ir->AddTetPoints6(19, 4.37693799377281589358e-01, 1.42741910394635984974e-03);
++         ir->AddTetPoints12(25, 1.68580250819503341120e-02, 7.16622722155387803511e-01,
++                            3.27226375772531215529e-04);
++         ir->AddTetPoints12(37, 2.35257182448596058322e-02, 8.50600927605402956644e-01,
++                            4.21131904215649394228e-04);
++         ir->AddTetPoints12(49, 6.94418950495464537553e-02, 7.08574314820604622689e-01,
++                            9.98940835257684659268e-04);
++         ir->AddTetPoints12(61, 9.37005272821476720146e-02, 5.63456727822489344959e-01,
++                            1.88623398086488818989e-03);
++         ir->AddTetPoints12(73, 1.25885360164042447995e-01, 7.40105985667665167149e-01,
++                            5.36847192210583730974e-04);
++         ir->AddTetPoints12(85, 2.17955270547737667286e-01, 5.35290625276012344003e-01,
++                            1.79931440889047528954e-03);
++         ir->AddTetPoints12(97, 3.54663455472783883948e-01, 2.00014910210617791186e-01,
++                            2.91862732938839245650e-03);
++         ir->AddTetPoints12(109, 4.16689287657038387458e-01, 1.52054854976777675812e-01,
++                            9.62750257012783515476e-04);
++         ir->AddTetPoints24(121, 6.06652560730350010054e-01, 3.04158433676372519372e-01,
++                            7.79465622318078477093e-02, 6.21649861415869242447e-04);
++         ir->SetOrder(13);
++         TetrahedronIntRules[13] = ir;
++         return ir;
++
++      case 14:
++         // Chuluunbaatar et al. 2022: 175 pts, 1xCent + 6xS31 + 1xS22 + 10xS211 + 1xS1111
++         ir = new IntegrationRule(175);
++         ir->AddTetMidPoint(0, 2.79630622899013732072e-03);
++         ir->AddTetPoints4(1, 3.33328696010048830534e-01, 1.46917540892973303920e-04);
++         ir->AddTetPoints4(5, 2.03700979179134489261e-01, 1.62781576883158380503e-03);
++         ir->AddTetPoints4(9, 4.23119120487503441730e-02, 3.41825853298758448786e-04);
++         ir->AddTetPoints4(13, 1.66911321524259963212e-02, 6.86936532495300726303e-05);
++         ir->AddTetPoints4(17, 1.64429779556425403886e-01, 2.04589809259575743788e-03);
++         ir->AddTetPoints4(21, 3.05243130480787605574e-01, 2.91353212326472864671e-03);
++         ir->AddTetPoints6(25, 3.64287147870284933049e-01, 2.86344364876423311192e-03);
++         ir->AddTetPoints12(31, 2.15024351638664623643e-01, 5.09377822427890203372e-01,
++                            1.86604702493200981690e-03);
++         ir->AddTetPoints12(43, 4.08453557824531576781e-01, 2.80291884809145824820e-02,
++                            1.37079721642533879263e-03);
++         ir->AddTetPoints12(55, 2.59921479331125596102e-02, 7.58920443676681433232e-01,
++                            5.79340182017268356431e-04);
++         ir->AddTetPoints12(67, 1.49228115767079897586e-02, 6.12483481308898292106e-01,
++                            3.15838010189281473503e-04);
++         ir->AddTetPoints12(79, 8.37923462693035414617e-02, 8.22678981568125355928e-01,
++                            3.37570057080462679680e-04);
++         ir->AddTetPoints12(91, 2.67966909251860618824e-01, 1.21450733299050331326e-02,
++                            7.33968440516890967273e-04);
++         ir->AddTetPoints12(103, 7.22614743275835913483e-02, 2.93829968686904419162e-01,
++                            1.73940500456261186446e-03);
++         ir->AddTetPoints12(115, 4.61690356122462508548e-01, 6.24312988244191680032e-02,
++                            5.50156605537168688809e-04);
++         ir->AddTetPoints12(127, 1.02556247843651599492e-05, 9.09652220362116237240e-01,
++                            3.74818592914694638193e-05);
++         ir->AddTetPoints12(139, 1.30667193397036723868e-01, 6.88760930866849863108e-01,
++                            1.25403778742792012396e-03);
++         ir->AddTetPoints24(151, 4.96082264783182565887e-03, 1.13153535288820022986e-01,
++                            2.55337379104889128367e-01, 5.28969173366363918341e-04);
++         ir->SetOrder(14);
++         TetrahedronIntRules[14] = ir;
++         return ir;
++
++      case 15:
++         // Chuluunbaatar et al. 2022: 209 pts, 1xCent + 4xS31 + 2xS22 + 11xS211 + 2xS1111
++         ir = new IntegrationRule(209);
++         ir->AddTetMidPoint(0, 1.41781886024826123995e-03);
++         ir->AddTetPoints4(1, 3.28314281102506377863e-01, 8.78039594754075579386e-04);
++         ir->AddTetPoints4(5, 5.95315181460130682378e-02, 6.49096346642090296988e-04);
++         ir->AddTetPoints4(9, 1.79953296856689010097e-01, 2.51315014419990637520e-03);
++         ir->AddTetPoints4(13, 2.87467267398706316506e-01, 1.49510722611222407301e-03);
++         ir->AddTetPoints6(17, 1.63820429539269674102e-01, 1.59662635597863640340e-03);
++         ir->AddTetPoints6(23, 4.49691037286174599696e-01, 5.56101982388976408267e-04);
++         ir->AddTetPoints12(29, 4.00736540413628217205e-01, 1.44084693588290586180e-02,
++                            7.41098096825775301023e-04);
++         ir->AddTetPoints12(41, 9.30917130105696349895e-02, 5.00389099764625755462e-01,
++                            1.48911776902807633377e-03);
++         ir->AddTetPoints12(53, 8.58969713610300000806e-02, 6.54216648304909331735e-01,
++                            1.11933741359922559432e-03);
++         ir->AddTetPoints12(65, 2.10555592437809635520e-01, 6.11819979966393701076e-02,
++                            1.62698522585349981094e-03);
++         ir->AddTetPoints12(77, 8.52418251507089524965e-02, 9.51322614055337932581e-03,
++                            1.79269162929391381043e-04);
++         ir->AddTetPoints12(89, 3.39062766530538045595e-02, 1.00939010271196565223e-03,
++                            6.94170071688395131987e-05);
++         ir->AddTetPoints12(101, 3.61923170322900333851e-01, 6.94744848139675630350e-02,
++                            1.59447426536825733780e-03);
++         ir->AddTetPoints12(113, 4.82786943073602314858e-01, 2.98248878093377127463e-02,
++                            1.85989656618890347554e-04);
++         ir->AddTetPoints12(125, 1.67482762532157707092e-02, 8.28143120169573809797e-01,
++                            2.46974837616148544094e-04);
++         ir->AddTetPoints12(137, 1.77918303634979659000e-02, 2.95058306317452390122e-01,
++                            3.64978786118624204133e-04);
++         ir->AddTetPoints12(149, 2.22052218944333024098e-01, 5.47335431979886655185e-01,
++                            5.97708840203148013097e-04);
++         ir->AddTetPoints24(161, 7.01737933129022994905e-01, 1.90468590405707266511e-01,
++                            9.00527571062145620884e-02, 6.09486799192013577881e-04);
++         ir->AddTetPoints24(185, 9.30456155647334665071e-02, 3.39714197260826189506e-01,
++                            1.75628396157984228987e-02, 7.07458692200529210524e-04);
++         ir->SetOrder(15);
++         TetrahedronIntRules[15] = ir;
++         return ir;
++
++      case 16:
++         // Chuluunbaatar et al. 2022: 248 pts, 8xS31 + 2xS22 + 11xS211 + 3xS1111
++         ir = new IntegrationRule(248);
++         ir->AddTetPoints4(0, 3.27237393634992601577e-01, 1.02720766161859349518e-03);
++         ir->AddTetPoints4(4, 1.70006239733430930539e-01, 1.65526299995553852033e-03);
++         ir->AddTetPoints4(8, 1.15524594427552973475e-01, 9.38857005487496389280e-04);
++         ir->AddTetPoints4(12, 2.91444830780401391290e-02, 6.51307095609853886316e-05);
++         ir->AddTetPoints4(16, 2.99333264802760234957e-01, 1.67595204112568215392e-03);
++         ir->AddTetPoints4(20, 3.08156348381425804206e-01, 5.31861976759345615219e-04);
++         ir->AddTetPoints4(24, 1.50096498602994791322e-02, 5.70730781338430579624e-05);
++         ir->AddTetPoints4(28, 2.15377318942399170743e-01, 1.98341225672248695419e-03);
++         ir->AddTetPoints6(32, 4.31794349434656055120e-01, 1.33750640425309239717e-03);
++         ir->AddTetPoints6(38, 3.51744151127164061954e-01, 1.76194827177588689456e-03);
++         ir->AddTetPoints12(44, 8.29503118270854405969e-03, 1.02752005688054329213e-01,
++                            7.68034210054447698257e-05);
++         ir->AddTetPoints12(56, 4.80211290069074772657e-02, 1.31255041819827861227e-01,
++                            6.11232548742599826781e-04);
++         ir->AddTetPoints12(68, 1.58330800578366723275e-02, 7.34396180497698725098e-01,
++                            2.28071918651308634196e-04);
++         ir->AddTetPoints12(80, 2.33860521976982954628e-01, 5.24417475091189966285e-01,
++                            5.40957583749601829924e-04);
++         ir->AddTetPoints12(92, 4.03140399383019043533e-01, 1.61507733387133249614e-02,
++                            7.92724342585451077596e-04);
++         ir->AddTetPoints12(104, 4.62664186044103586948e-01, 8.10608626417101511830e-03,
++                            2.79154547643383486085e-04);
++         ir->AddTetPoints12(116, 9.54740058566225929804e-02, 2.29727696324949964835e-01,
++                            9.08138512826854073234e-04);
++         ir->AddTetPoints12(128, 5.22644355194657739272e-02, 2.65023714835432855352e-01,
++                            5.22825831653034723938e-04);
++         ir->AddTetPoints12(140, 1.54579380484822833525e-02, 3.89592797271225033118e-01,
++                            2.70158998322436651619e-04);
++         ir->AddTetPoints12(152, 6.51049846146104782552e-02, 1.06703843590993385781e-02,
++                            1.88285026959176838299e-04);
++         ir->AddTetPoints12(164, 1.54103896531334966236e-01, 6.61715834360067312048e-01,
++                            9.04088475565851504463e-04);
++         ir->AddTetPoints24(176, 3.06401667521507548031e-01, 9.60517630472854377910e-02,
++                            5.82737849734082380415e-01, 5.64378019946093099565e-04);
++         ir->AddTetPoints24(200, 7.52465510383990981991e-02, 7.45338308409307703783e-01,
++                            1.68928693209220324653e-04, 1.50966565378719797755e-04);
++         ir->AddTetPoints24(224, 6.67856954025341370551e-02, 1.77429946613237937703e-01,
++                            4.69596074566060506239e-01, 1.47055596469915315222e-03);
++         ir->SetOrder(16);
++         TetrahedronIntRules[16] = ir;
++         return ir;
++
++      case 17:
++         // Chuluunbaatar et al. 2022: 284 pts, 8xS31 + 2xS22 + 14xS211 + 3xS1111
++         ir = new IntegrationRule(284);
++         ir->AddTetPoints4(0, 7.70317217555786387662e-02, 5.61182432136912863994e-04);
++         ir->AddTetPoints4(4, 3.33178098937441047322e-01, 1.00689770519544758830e-04);
++         ir->AddTetPoints4(8, 4.70791056455278841830e-02, 3.67647177915098600219e-05);
++         ir->AddTetPoints4(12, 3.04818016813530989761e-01, 1.65889979279151768450e-03);
++         ir->AddTetPoints4(16, 1.30944391509640850613e-01, 1.44088986921434188127e-03);
++         ir->AddTetPoints4(20, 1.92535395691919936079e-01, 9.15395452245713805300e-04);
++         ir->AddTetPoints4(24, 2.76657577444746005657e-01, 1.54901732083429399985e-03);
++         ir->AddTetPoints4(28, 1.25812395975189866837e-02, 3.37712309004839736381e-05);
++         ir->AddTetPoints6(32, 3.79480026881957605706e-03, 6.09736907157372445766e-05);
++         ir->AddTetPoints6(38, 6.63574479091031538269e-02, 9.84476156008183339238e-04);
++         ir->AddTetPoints12(44, 1.55274078991054307469e-02, 1.62632006416904367763e-01,
++                            1.72804764846827806904e-04);
++         ir->AddTetPoints12(56, 1.50826606630984655366e-01, 2.85203367985520928052e-01,
++                            1.47139558752681400000e-03);
++         ir->AddTetPoints12(68, 2.15452134252546806392e-01, 4.98848586166224794436e-01,
++                            1.39306363547645069810e-03);
++         ir->AddTetPoints12(80, 8.76098177043343750992e-02, 5.57666868254345748923e-01,
++                            1.10287787958478406859e-03);
++         ir->AddTetPoints12(92, 2.76970146665180327883e-01, 4.27355049241211759625e-01,
++                            7.68445912031412610432e-04);
++         ir->AddTetPoints12(104, 7.84854004483451911378e-02, 8.28808438721931994841e-01,
++                            2.88969312852391376975e-04);
++         ir->AddTetPoints12(116, 4.16031236751370603333e-01, 5.86692256093802096822e-03,
++                            3.64077087845785364682e-04);
++         ir->AddTetPoints12(128, 3.11418578536613735799e-03, 2.87446864291519776913e-01,
++                            3.72979694270385682886e-05);
++         ir->AddTetPoints12(140, 1.48006973486492082737e-01, 3.23952556344107162056e-02,
++                            8.00305827281323160088e-04);
++         ir->AddTetPoints12(152, 1.43033359605475689225e-02, 6.71799737186767331742e-02,
++                            1.04156882366980408738e-04);
++         ir->AddTetPoints12(164, 4.67432600299733047589e-01, 1.06651363448138844503e-02,
++                            3.28661937749209721524e-04);
++         ir->AddTetPoints12(176, 3.84385292133539946402e-01, 1.72438767641856061097e-01,
++                            1.30629277435961670649e-03);
++         ir->AddTetPoints12(188, 4.91933575124020999736e-02, 1.88887052205218147760e-01,
++                            6.32377286180225181393e-04);
++         ir->AddTetPoints12(200, 2.12339226453523544080e-01, 4.66228370030305223209e-03,
++                            2.79290741478308349437e-04);
++         ir->AddTetPoints24(212, 5.56903978599597615506e-01, 3.03486206973905936479e-01,
++                            1.18763501187465259079e-01, 6.89373805061415279201e-04);
++         ir->AddTetPoints24(236, 7.72626757757003540528e-02, 7.24630018034681633310e-01,
++                            1.98007306189710574618e-01, 1.50128885728721429014e-04);
++         ir->AddTetPoints24(260, 3.94151894733046209707e-02, 1.37686205384732439361e-02,
++                            3.27821577026260191356e-01, 2.69135394730690453036e-04);
++         ir->SetOrder(17);
++         TetrahedronIntRules[17] = ir;
++         return ir;
++
++      case 18:
++         // Chuluunbaatar et al. 2022: 343 pts, 1xCent + 6xS31 + 1xS22 + 18xS211 + 4xS1111
++         ir = new IntegrationRule(343);
++         ir->AddTetMidPoint(0, 1.50320520665968271855e-03);
++         ir->AddTetPoints4(1, 1.48031283019549930735e-01, 1.29183701010426432026e-03);
++         ir->AddTetPoints4(5, 9.18424577295562372115e-02, 5.65165600131921114398e-04);
++         ir->AddTetPoints4(9, 1.21731006846268821620e-02, 2.96638661291863846498e-05);
++         ir->AddTetPoints4(13, 2.96287086243479214076e-01, 1.68021599975901742008e-03);
++         ir->AddTetPoints4(17, 3.26360945420223702573e-01, 5.68549421501223146459e-04);
++         ir->AddTetPoints4(21, 2.16789137320780644913e-01, 5.55087845531530756776e-04);
++         ir->AddTetPoints6(25, 4.02614199568341046831e-01, 9.23257682834909085973e-04);
++         ir->AddTetPoints12(31, 4.40000294606430919497e-01, 2.67115252330815747261e-02,
++                            5.89236226881148018354e-04);
++         ir->AddTetPoints12(43, 4.12210360146149590310e-01, 1.74700018752273367184e-01,
++                            2.20201645262970173086e-04);
++         ir->AddTetPoints12(55, 3.72351275734651154803e-01, 2.01036053559202482210e-01,
++                            1.19114067751767893286e-03);
++         ir->AddTetPoints12(67, 2.69242251358920825499e-01, 4.49869807112305730712e-01,
++                            4.91465288515259167076e-04);
++         ir->AddTetPoints12(79, 9.76627002277863226487e-02, 5.25608170470769464622e-01,
++                            9.64486286268097844226e-04);
++         ir->AddTetPoints12(91, 7.95369840699704060138e-03, 9.21057037109250575924e-01,
++                            3.50539800669945179434e-05);
++         ir->AddTetPoints12(103, 1.87267495112264620305e-01, 6.22654876406694146596e-01,
++                            2.16974411183503681708e-04);
++         ir->AddTetPoints12(115, 1.11072676172302167719e-01, 7.68063140393386190041e-01,
++                            2.34282910136374296871e-04);
++         ir->AddTetPoints12(127, 7.28479245819699250397e-02, 6.49722949416110195919e-01,
++                            4.87753175028903700802e-04);
++         ir->AddTetPoints12(139, 4.35242030819264283381e-02, 1.17645467635705727738e-01,
++                            3.42108745586317604826e-04);
++         ir->AddTetPoints12(151, 4.75773867838972852606e-01, 4.07808063375027368691e-02,
++                            1.54370387724311262630e-04);
++         ir->AddTetPoints12(163, 1.34493684207502642303e-02, 7.10473230757141527292e-01,
++                            1.53084245505138950502e-04);
++         ir->AddTetPoints12(175, 1.55188126953707594691e-01, 4.80062353992693063853e-02,
++                            7.41762550201790354931e-04);
++         ir->AddTetPoints12(187, 9.79636346189126545891e-03, 3.98937047199835026490e-01,
++                            9.70189072253606300464e-05);
++         ir->AddTetPoints12(199, 2.22972555180978554423e-01, 4.76498622927509052349e-01,
++                            1.23606250864963739498e-03);
++         ir->AddTetPoints12(211, 4.21260289724883496554e-02, 5.74400719147259319897e-01,
++                            5.42935360416797459064e-04);
++         ir->AddTetPoints12(223, 5.33405089760383491204e-02, 8.80640211270697026436e-01,
++                            1.37609067910292992955e-04);
++         ir->AddTetPoints12(235, 1.61516953402295604381e-01, 3.97825734805866804145e-01,
++                            1.38360407936932868454e-03);
++         ir->AddTetPoints24(247, 1.45407574632878761056e-01, 2.91164813093958863011e-01,
++                            2.48113931665566757323e-02, 7.08660104525119741853e-04);
++         ir->AddTetPoints24(271, 3.05950727633923398596e-03, 8.24555339410965038027e-01,
++                            2.68565028804892240444e-02, 7.44949831353088352858e-05);
++         ir->AddTetPoints24(295, 6.97926614756394281258e-01, 2.13661027507205059095e-01,
++                            1.66604565123493510159e-02, 3.48774332060316084436e-04);
++         ir->AddTetPoints24(319, 7.44110286479428006956e-02, 3.30529031579615995007e-01,
++                            5.94802303330229431566e-01, 1.27738536486342427233e-04);
++         ir->SetOrder(18);
++         TetrahedronIntRules[18] = ir;
++         return ir;
++
++      case 19:
++         // Chuluunbaatar et al. 2022: 383 pts, 1xCent + 7xS31 + 3xS22 + 18xS211 + 5xS1111
++         ir = new IntegrationRule(383);
++         ir->AddTetMidPoint(0, 1.63415516118113374362e-03);
++         ir->AddTetPoints4(1, 1.99329047511150186933e-01, 1.40358969707281050661e-03);
++         ir->AddTetPoints4(5, 3.19645815662209620278e-01, 7.59091701642911130359e-04);
++         ir->AddTetPoints4(9, 1.28431938718745111000e-02, 3.39837309564992975973e-05);
++         ir->AddTetPoints4(13, 4.48982322308715264825e-02, 2.64426237344559213801e-04);
++         ir->AddTetPoints4(17, 1.36956642483832574664e-01, 7.02856031175181001670e-04);
++         ir->AddTetPoints4(21, 2.90023140379987831583e-01, 1.36565197140369625796e-03);
++         ir->AddTetPoints4(25, 9.74727471850612287030e-02, 5.49967641751253539552e-04);
++         ir->AddTetPoints6(29, 9.71505706534206842084e-02, 8.76433367036966496331e-04);
++         ir->AddTetPoints6(35, 3.43201766155373844125e-01, 1.42791869569969145752e-03);
++         ir->AddTetPoints6(41, 4.81048682529186311108e-01, 2.14104659346860782621e-04);
++         ir->AddTetPoints12(47, 1.65183633185142647593e-01, 6.20343271948313068620e-01,
++                            7.66624121398570487762e-04);
++         ir->AddTetPoints12(59, 6.34076230480007801971e-02, 1.00712056526788096511e-06,
++                            5.83669782740338187726e-05);
++         ir->AddTetPoints12(71, 1.14358224224068777061e-03, 4.00863727471189756901e-01,
++                            2.15179213304474853136e-05);
++         ir->AddTetPoints12(83, 1.72841898844304682481e-02, 2.80974353964125900252e-01,
++                            2.11660243173845957895e-04);
++         ir->AddTetPoints12(95, 3.72064110603054443160e-01, 1.99705629942815821032e-01,
++                            8.75485553265252145448e-04);
++         ir->AddTetPoints12(107, 5.20242736384851911513e-02, 1.37531998164526964024e-01,
++                            2.41875978548309640689e-04);
++         ir->AddTetPoints12(119, 1.60391938988630439189e-01, 6.93856268828658799552e-03,
++                            2.41421983084743403854e-04);
++         ir->AddTetPoints12(131, 2.04009311788170112981e-03, 1.85612033720361502276e-01,
++                            2.06414634761428247161e-05);
++         ir->AddTetPoints12(143, 5.27148535043930888122e-02, 3.66275984765305206992e-01,
++                            5.92721175866269185152e-04);
++         ir->AddTetPoints12(155, 2.43716736043213727525e-01, 4.92302726125094514131e-01,
++                            5.87805221231469288319e-04);
++         ir->AddTetPoints12(167, 2.35953750535251915998e-01, 4.36147999140300668408e-01,
++                            1.13870552683716466137e-03);
++         ir->AddTetPoints12(179, 3.76396525093455058819e-01, 2.39487568074092604942e-01,
++                            3.83776606890474675932e-04);
++         ir->AddTetPoints12(191, 1.01345934656195998946e-01, 7.75724008450901503231e-01,
++                            3.13399374751491786861e-04);
++         ir->AddTetPoints12(203, 1.03978576485678226443e-02, 9.15876371248755760668e-01,
++                            5.00269118618190970684e-05);
++         ir->AddTetPoints12(215, 4.34825905512194077485e-01, 1.54748703679767566493e-02,
++                            5.11025086732051357814e-04);
++         ir->AddTetPoints12(227, 1.21014927057761692564e-01, 5.16611295841134299245e-01,
++                            1.01828426898903845119e-03);
++         ir->AddTetPoints12(239, 2.02221519510974792611e-02, 1.24848861415461573343e-01,
++                            1.43839758373707761420e-04);
++         ir->AddTetPoints12(251, 6.45349753497565792326e-02, 6.58797428082353198064e-01,
++                            5.65395165437599170333e-04);
++         ir->AddTetPoints24(263, 5.41751356566992220420e-02, 1.84132753560833639650e-01,
++                            7.55314661874476711567e-01, 1.46482060260151955187e-04);
++         ir->AddTetPoints24(287, 8.81662316742434920558e-02, 6.34269181940578685719e-01,
++                            1.71630199679625762565e-02, 3.35574461050504140036e-04);
++         ir->AddTetPoints24(311, 2.99876324799488391815e-01, 5.12011805461436986242e-01,
++                            1.39945756017624101109e-01, 6.87105878169750922298e-04);
++         ir->AddTetPoints24(335, 5.43033048543508201078e-01, 3.98941499659789464149e-03,
++                            2.96197429831241032527e-01, 1.95466436126630728823e-04);
++         ir->AddTetPoints24(359, 5.64748804926985426000e-01, 5.04233375578317308263e-02,
++                            5.38328049657370907161e-03, 1.64230458281612438400e-04);
++         ir->SetOrder(19);
++         TetrahedronIntRules[19] = ir;
++         return ir;
++
++      case 20:
++         // Chuluunbaatar et al. 2022: 441 pts, 1xCent + 8xS31 + 4xS22 + 20xS211 + 6xS1111
++         ir = new IntegrationRule(441);
++         ir->AddTetMidPoint(0, 1.18189152746071531389e-03);
++         ir->AddTetPoints4(1, 1.44398440418483348102e-01, 9.66793543666131616025e-04);
++         ir->AddTetPoints4(5, 7.58537944731913719304e-03, 7.60496594911488061041e-06);
++         ir->AddTetPoints4(9, 2.92814880923072839991e-01, 1.15590090717039508002e-03);
++         ir->AddTetPoints4(13, 3.21283351882928780441e-01, 6.88636083704610908220e-04);
++         ir->AddTetPoints4(17, 1.99126561548209762842e-01, 1.13432796616464034133e-03);
++         ir->AddTetPoints4(21, 9.94395843777090698845e-02, 4.86016284904959409725e-04);
++         ir->AddTetPoints4(25, 5.64411054542164752901e-02, 3.22610846546138882538e-04);
++         ir->AddTetPoints4(29, 2.29655958568571322287e-02, 5.81298999656466752642e-05);
++         ir->AddTetPoints6(33, 1.79141319969986889671e-01, 1.09529243625682828553e-03);
++         ir->AddTetPoints6(39, 1.28302591207222288494e-01, 5.73999808941768881014e-04);
++         ir->AddTetPoints6(45, 1.45390267831490144212e-02, 1.87025723866918939901e-04);
++         ir->AddTetPoints6(51, 4.21353125034043096697e-01, 7.00971843927932197066e-04);
++         ir->AddTetPoints12(57, 2.31009838674045342444e-01, 4.40089079468850674637e-01,
++                            9.76079610667369892280e-04);
++         ir->AddTetPoints12(69, 5.34968850305569260106e-03, 1.93845423004036843118e-01,
++                            3.05362840831512903964e-05);
++         ir->AddTetPoints12(81, 1.24450062776632008887e-01, 2.68040213994591769442e-01,
++                            8.86819612544235340128e-04);
++         ir->AddTetPoints12(93, 2.04325810970697151203e-02, 2.58308114426872181824e-01,
++                            1.92969809694827680807e-04);
++         ir->AddTetPoints12(105, 4.82828231821251577238e-02, 5.80396926577266859815e-03,
++                            6.19875387085781813859e-05);
++         ir->AddTetPoints12(117, 2.89149787325270579696e-01, 4.18198665244226719384e-01,
++                            1.83280536557154684341e-04);
++         ir->AddTetPoints12(129, 4.46052961749180063022e-02, 1.63432451082933805075e-01,
++                            3.33781952353497124181e-04);
++         ir->AddTetPoints12(141, 2.70746696244779198881e-03, 9.47473449934223443947e-01,
++                            9.76902870004440518315e-06);
++         ir->AddTetPoints12(153, 1.81599434536722420530e-01, 5.60679256816125279328e-02,
++                            7.16163033436471277611e-04);
++         ir->AddTetPoints12(165, 7.98241847716316815786e-02, 2.24797220241435974364e-01,
++                            6.67198343081130593700e-04);
++         ir->AddTetPoints12(177, 1.46751424308700709198e-02, 1.05996716787954137207e-01,
++                            1.02112432519835052438e-04);
++         ir->AddTetPoints12(189, 1.47988480867858707146e-01, 6.98383249617804735543e-01,
++                            1.65331156535892409174e-04);
++         ir->AddTetPoints12(201, 4.52421884404454466289e-01, 8.04532263339534647884e-02,
++                            3.75572061072812404120e-04);
++         ir->AddTetPoints12(213, 4.82988925439242506449e-03, 3.56408995372303527560e-01,
++                            3.70372995907359520520e-05);
++         ir->AddTetPoints12(225, 3.96075105354866952023e-01, 1.93643489260524576112e-01,
++                            4.20355142354041121707e-04);
++         ir->AddTetPoints12(237, 2.11473197416018027228e-01, 5.66485319771568907044e-01,
++                            3.20737452814357003415e-04);
++         ir->AddTetPoints12(249, 2.56953469781508958558e-01, 4.54914546979866607490e-01,
++                            5.44699807144207316482e-04);
++         ir->AddTetPoints12(261, 3.63762446007509787638e-01, 7.07704644682126682298e-02,
++                            8.71724711789707298014e-04);
++         ir->AddTetPoints12(273, 4.34084693566413395982e-02, 5.61826818484091217165e-01,
++                            4.66716966955154613419e-04);
++         ir->AddTetPoints12(285, 1.26096840063810999855e-01, 4.15826803180920218095e-02,
++                            4.05374493922667372432e-04);
++         ir->AddTetPoints24(297, 3.19830665436792060952e-01, 4.36032333155288651105e-02,
++                            5.02224495466759290885e-01, 7.04309544520175120040e-04);
++         ir->AddTetPoints24(321, 6.93104765295092647634e-04, 7.61588104432530443866e-01,
++                            4.88127934219945436300e-02, 6.22004488693067759562e-05);
++         ir->AddTetPoints24(345, 3.01059509765821443905e-03, 4.68001522754562040984e-02,
++                            5.92923309754523120141e-01, 1.11516752988944994277e-04);
++         ir->AddTetPoints24(369, 8.05046770737637640281e-01, 1.15008082676269607347e-01,
++                            6.71446262920421810261e-02, 1.37946350993749402127e-04);
++         ir->AddTetPoints24(393, 3.27837344763098725853e-01, 1.41845459805815643506e-01,
++                            3.73982332962941683638e-03, 1.75610557117139728066e-04);
++         ir->AddTetPoints24(417, 1.69431918115453827856e-02, 9.30651836894259287813e-02,
++                            6.46141331822991715761e-01, 3.76832469454361519961e-04);
++         ir->SetOrder(20);
++         TetrahedronIntRules[20] = ir;
++         return ir;
++
++      default:
++         // Grundmann-Moller fallback for orders beyond tabulated rules
++         int i = (Order / 2) * 2 + 1;   // closest odd >= Order
+          AllocIntRule(TetrahedronIntRules, i);
+          ir = new IntegrationRule;
+-         ir->GrundmannMollerSimplexRule(i/2,3);
+-         TetrahedronIntRules[i-1] = TetrahedronIntRules[i] = ir;
++         ir->GrundmannMollerSimplexRule(i/2, 3);
++         if (!TetrahedronIntRules[i-1]) { TetrahedronIntRules[i-1] = ir; }
++         TetrahedronIntRules[i] = ir;
+          return ir;
+    }
+ }
+diff --git a/fem/intrules.hpp b/fem/intrules.hpp
+index d3fac20e93..b7f44f97b4 100644
+--- a/fem/intrules.hpp
++++ b/fem/intrules.hpp
+@@ -125,18 +125,6 @@ private:
+    void AddTriPoints3b(const int off, const real_t b, const real_t weight)
+    { AddTriPoints3(off, (1. - b)/2., b, weight); }
+ 
+-   void AddTriPoints3R(const int off, const real_t a, const real_t b,
+-                       const real_t c, const real_t weight)
+-   {
+-      IntPoint(off + 0).Set2w(a, b, weight);
+-      IntPoint(off + 1).Set2w(c, a, weight);
+-      IntPoint(off + 2).Set2w(b, c, weight);
+-   }
+-
+-   void AddTriPoints3R(const int off, const real_t a, const real_t b,
+-                       const real_t weight)
+-   { AddTriPoints3R(off, a, b, 1. - a - b, weight); }
+-
+    void AddTriPoints6(const int off, const real_t a, const real_t b,
+                       const real_t c, const real_t weight)
+    {
+@@ -183,14 +171,6 @@ private:
+       AddTetPoints3(off + 1, a, 1. - 3.*a, weight);
+    }
+ 
+-   // given b, add the permutations of (a,a,a,b), where 3*a + b = 1
+-   void AddTetPoints4b(const int off, const real_t b, const real_t weight)
+-   {
+-      const real_t a = (1. - b)/3.;
+-      IntPoint(off).Set(a, a, a, weight);
+-      AddTetPoints3(off + 1, a, b, weight);
+-   }
+-
+    // add the permutations of (a,a,b,b), 2*(a + b) = 1
+    void AddTetPoints6(const int off, const real_t a, const real_t weight)
+    {
+@@ -209,14 +189,37 @@ private:
+       AddTetPoints6(off + 6, a, bc, cb, weight);
+    }
+ 
+-   // given (b,c), add the permutations of (a,a,b,c), 2*a + b + c = 1
+-   void AddTetPoints12bc(const int off, const real_t b, const real_t c,
+-                         const real_t weight)
++   // add all 24 permutations of (a,b,c,d) where a+b+c+d = 1, all distinct
++   void AddTetPoints24(const int off, const real_t a, const real_t b,
++                       const real_t c, const real_t weight)
+    {
+-      const real_t a = (1. - b - c)/2.;
+-      AddTetPoints3(off,     a, b, weight);
+-      AddTetPoints3(off + 3, a, c, weight);
+-      AddTetPoints6(off + 6, a, b, c, weight);
++      const real_t d = 1. - a - b - c;
++      // all 24 permutations of 4 distinct barycentric coordinates
++      // permuting which coordinate goes to x, y, z (4th is 1-x-y-z)
++      IntPoint(off +  0).Set(a, b, c, weight);
++      IntPoint(off +  1).Set(a, b, d, weight);
++      IntPoint(off +  2).Set(a, c, b, weight);
++      IntPoint(off +  3).Set(a, c, d, weight);
++      IntPoint(off +  4).Set(a, d, b, weight);
++      IntPoint(off +  5).Set(a, d, c, weight);
++      IntPoint(off +  6).Set(b, a, c, weight);
++      IntPoint(off +  7).Set(b, a, d, weight);
++      IntPoint(off +  8).Set(b, c, a, weight);
++      IntPoint(off +  9).Set(b, c, d, weight);
++      IntPoint(off + 10).Set(b, d, a, weight);
++      IntPoint(off + 11).Set(b, d, c, weight);
++      IntPoint(off + 12).Set(c, a, b, weight);
++      IntPoint(off + 13).Set(c, a, d, weight);
++      IntPoint(off + 14).Set(c, b, a, weight);
++      IntPoint(off + 15).Set(c, b, d, weight);
++      IntPoint(off + 16).Set(c, d, a, weight);
++      IntPoint(off + 17).Set(c, d, b, weight);
++      IntPoint(off + 18).Set(d, a, b, weight);
++      IntPoint(off + 19).Set(d, a, c, weight);
++      IntPoint(off + 20).Set(d, b, a, weight);
++      IntPoint(off + 21).Set(d, b, c, weight);
++      IntPoint(off + 22).Set(d, c, a, weight);
++      IntPoint(off + 23).Set(d, c, b, weight);
+    }
+ 
+ public:
diff --git a/palace/linalg/cudss.cpp b/palace/linalg/cudss.cpp
index 800ef0c0c6..c56fbee0ed 100644
--- a/palace/linalg/cudss.cpp
+++ b/palace/linalg/cudss.cpp
@@ -19,13 +19,13 @@ mfem::CuDSSSolver::MatType GetCuDSSMatType(MatrixSymmetry sym)
   switch (sym)
   {
     case MatrixSymmetry::SPD:
-      return mfem::CuDSSSolver::MatType::SYMMETRIC_POSITIVE_DEFINITE;
+      return mfem::CuDSSSolver::SYMMETRIC_POSITIVE_DEFINITE;
     case MatrixSymmetry::SYMMETRIC:
-      return mfem::CuDSSSolver::MatType::SYMMETRIC_INDEFINITE;
+      return mfem::CuDSSSolver::SYMMETRIC_INDEFINITE;
     case MatrixSymmetry::UNSYMMETRIC:
-      return mfem::CuDSSSolver::MatType::NONSYMMETRIC;
+      return mfem::CuDSSSolver::NONSYMMETRIC;
   }
-  return mfem::CuDSSSolver::MatType::NONSYMMETRIC;
+  return mfem::CuDSSSolver::NONSYMMETRIC;
 }
 
 }  // namespace
diff --git a/spack_repo/local/packages/palace/mfem_pr5124_cudss.diff b/spack_repo/local/packages/palace/mfem_pr5124_cudss.diff
new file mode 120000
index 0000000000..93ac76686b
--- /dev/null
+++ b/spack_repo/local/packages/palace/mfem_pr5124_cudss.diff
@@ -0,0 +1 @@
+../../../../extern/patch/mfem/mfem_pr5124_cudss.diff
\ No newline at end of file
diff --git a/spack_repo/local/packages/palace/package.py b/spack_repo/local/packages/palace/package.py
index cbe4652b5e..647ada9a84 100644
--- a/spack_repo/local/packages/palace/package.py
+++ b/spack_repo/local/packages/palace/package.py
@@ -45,6 +45,12 @@ class Palace(CMakePackage, CudaPackage, ROCmPackage):
         when="@0.14:",
     )
     variant("mumps", default=False, description="Build with MUMPS sparse direct solver")
+    variant(
+        "cudss",
+        default=False,
+        description="Build with cuDSS sparse direct solver",
+        when="@0.17:",
+    )
     variant("slepc", default=True, description="Build with SLEPc eigenvalue solver")
     variant("arpack", default=False, description="Build with ARPACK eigenvalue solver")
     variant("libxsmm", default=True, description="Build with libxsmm backend for libCEED")
@@ -87,7 +93,16 @@ class Palace(CMakePackage, CudaPackage, ROCmPackage):
     depends_on("eigen", type="build")
     depends_on("lcov@1.15:", when="+coverage@0.16:", type="run")
 
-    conflicts("~superlu-dist~strumpack~mumps", msg="Need at least one sparse direct solver")
+    conflicts(
+        "~superlu-dist~strumpack~mumps",
+        when="@:0.16",
+        msg="Need at least one sparse direct solver",
+    )
+    conflicts(
+        "~superlu-dist~strumpack~mumps~cudss",
+        when="@0.17:",
+        msg="Need at least one sparse direct solver",
+    )
     conflicts(
         "+asan",
         when="platform=darwin %gcc",
@@ -180,6 +195,7 @@ class Palace(CMakePackage, CudaPackage, ROCmPackage):
                 "patch_gmsh_parser_performance.diff",
                 "mfem_pr5280.diff",
                 patch("mfem_pr5246.diff", when="@:4.9"),
+                patch("mfem_pr5124_cudss.diff", when="@:4.9 +cudss"),
             ],
         )
         depends_on("mfem+shared", when="+shared")
@@ -204,6 +220,8 @@ class Palace(CMakePackage, CudaPackage, ROCmPackage):
         # depends_on("mfem+exceptions", type="test")
 
         depends_on("mfem+libunwind", when="build_type=Debug")
+        depends_on("mfem+cudss", when="+cudss")
+        depends_on("cudss", when="+cudss")
         depends_on("eigen@3.5:", type="build")
 
     with when("+libxsmm"):
@@ -232,6 +250,7 @@ class Palace(CMakePackage, CudaPackage, ROCmPackage):
     conflicts("+cuda", when="@:0.13", msg="CUDA is only supported for Palace versions after 0.13")
     conflicts("+rocm", when="@:0.13", msg="ROCm is only supported for Palace versions after 0.13")
     conflicts("+cuda+rocm", msg="PALACE_WITH_CUDA is not compatible with PALACE_WITH_HIP")
+    conflicts("+cudss", when="~cuda", msg="PALACE_WITH_CUDSS requires PALACE_WITH_CUDA")
     conflicts(
         "cuda_arch=none", when="+cuda", msg="palace: Please specify a CUDA arch value / values"
     )
@@ -307,6 +326,7 @@ def cmake_args(self):
             self.define_from_variant("PALACE_WITH_HIP", "rocm"),
             self.define_from_variant("PALACE_WITH_LIBXSMM", "libxsmm"),
             self.define_from_variant("PALACE_WITH_MUMPS", "mumps"),
+            self.define_from_variant("PALACE_WITH_CUDSS", "cudss"),
             self.define_from_variant("PALACE_WITH_OPENMP", "openmp"),
             self.define_from_variant("PALACE_WITH_SLEPC", "slepc"),
             self.define_from_variant("PALACE_WITH_STRUMPACK", "strumpack"),
@@ -322,6 +342,8 @@ def cmake_args(self):
             args.append(self.define("MFEM_DIR", self.spec["mfem"].prefix))
             if self.spec.satisfies("+mumps"):
                 args.append(self.define("MUMPS_DIR", self.spec["mumps"].prefix))
+            if self.spec.satisfies("+cudss"):
+                args.append(self.define("CUDSS_DIR", self.spec["cudss"].prefix))
             if self.spec.satisfies("+strumpack"):
                 args.append(self.define("STRUMPACK_DIR", self.spec["strumpack"].prefix))
             if self.spec.satisfies("+mumps") or self.spec.satisfies("+strumpack"):
diff --git a/spack_repo/patches/pr5124_mfem_cudss.diff b/spack_repo/patches/pr5124_mfem_cudss.diff
new file mode 100644
index 0000000000..22946443cb
--- /dev/null
+++ b/spack_repo/patches/pr5124_mfem_cudss.diff
@@ -0,0 +1,81 @@
+diff --git a/repos/spack_repo/builtin/packages/mfem/package.py b/repos/spack_repo/builtin/packages/mfem/package.py
+index e0e7d762..d760729d 100644
+--- a/repos/spack_repo/builtin/packages/mfem/package.py
++++ b/repos/spack_repo/builtin/packages/mfem/package.py
+@@ -188,6 +188,12 @@ class Mfem(Package, CudaPackage, ROCmPackage):
+     variant("libceed", default=False, description="Enable libCEED backend")
+     variant("umpire", default=False, description="Enable Umpire support")
+     variant("amgx", default=False, description="Enable NVIDIA AmgX solver support")
++    variant(
++        "cudss",
++        default=False,
++        when="@4.9.0:",
++        description="Enable NVIDIA cuDSS solver support",
++    )
+ 
+     variant(
+         "threadsafe",
+@@ -261,6 +267,7 @@ class Mfem(Package, CudaPackage, ROCmPackage):
+ 
+     conflicts("+cuda+rocm")
+     conflicts("+amgx", when="~cuda")
++    conflicts("+cudss", when="~cuda")
+     conflicts("+mpi~cuda ^hypre+cuda")
+     conflicts("+mpi~rocm ^hypre+rocm")
+ 
+@@ -530,6 +537,7 @@ class Mfem(Package, CudaPackage, ROCmPackage):
+     depends_on("enzyme@0.0.176:", when="+enzyme")
+     requires("%cxx=llvm", when="+enzyme~rocm")
+     depends_on("cuda+allow-unsupported-compilers", when="+enzyme+cuda")
++    depends_on("cudss", when="+cudss")
+     depends_on("enzyme %libllvm=llvm-amdgpu", when="+enzyme+rocm")
+     requires("%cxx=llvm-amdgpu", when="+enzyme+rocm")
+ 
+@@ -693,6 +701,7 @@ class Mfem(Package, CudaPackage, ROCmPackage):
+             "MFEM_MPIEXEC_NP=%s" % mfem_mpiexec_np,
+             "MFEM_USE_EXCEPTIONS=%s" % yes_no("+exceptions"),
+             "MFEM_USE_MUMPS=%s" % yes_no("+mumps"),
++            "MFEM_USE_CUDSS=%s" % yes_no("+cudss"),
+         ]
+         if spec.satisfies("@4.7.0:"):
+             options += ["MFEM_PRECISION=%s" % spec.variants["precision"].value]
+@@ -1054,6 +1063,39 @@ class Mfem(Package, CudaPackage, ROCmPackage):
+                     raise InstallError("Required CUDA libraries not found: %s" % culibs)
+                 options += ["CUDA_LIB=%s" % ld_flags_from_library_list(cuda_libs)]
+ 
++        if "+cudss" in spec:
++            cudss = spec["cudss"]
++            cudss_libs = find_libraries(
++                "libcudss", cudss.prefix.lib, shared=True, recursive=True
++            )
++            if not cudss_libs:
++                raise InstallError("Required cuDSS library not found")
++            options += [
++                "CUDSS_OPT=-I%s" % cudss.prefix.include,
++                "CUDSS_LIB=%s" % ld_flags_from_library_list(cudss_libs),
++            ]
++
++            # cuDSS loads communication/threading layers dynamically. Passing absolute
++            # paths through config.hpp avoids relying on user runtime environment vars.
++            if "+mpi" in spec:
++                comm_candidates = ["libcudss_commlayer_openmpi"]
++                if "^mpich" in spec or "^mvapich" in spec:
++                    comm_candidates = [
++                        "libcudss_commlayer_mpich",
++                        "libcudss_commlayer_openmpi",
++                    ]
++                comm_libs = find_libraries(
++                    comm_candidates, cudss.prefix.lib, shared=True, recursive=True
++                )
++                if comm_libs:
++                    options += ["MFEM_CUDSS_COMM_LIB=%s" % next(iter(comm_libs))]
++            if "+openmp" in spec:
++                thread_libs = find_libraries(
++                    "libcudss_mtlayer_gomp", cudss.prefix.lib, shared=True, recursive=True
++                )
++                if thread_libs:
++                    options += ["MFEM_CUDSS_THREADING_LIB=%s" % next(iter(thread_libs))]
++
+         if "+rocm" in spec:
+             amdgpu_target = ",".join(spec.variants["amdgpu_target"].value)
+             options += ["HIP_CXX=%s" % spec["hip"].hipcc, "HIP_ARCH=%s" % amdgpu_target]