From 51d7c2f52258a475ecd4914b76e4545860128d00 Mon Sep 17 00:00:00 2001 From: David Sanftenberg Date: Tue, 7 Oct 2025 10:10:35 +0000 Subject: [PATCH 1/2] Fix validation tolerance: use relative error instead of absolute The validation was using absolute error tolerance (1e-8) which fails for large matrix multiplication results (magnitude ~1e4). This caused false negatives where COSMA computed correct results but failed validation. Changes: - Switch from absolute error to relative error for validation - Use 1e-5 tolerance for float32 (appropriate for single precision) - Use 1e-8 tolerance for float64 (appropriate for double precision) - Handle small values near zero with absolute error fallback This fixes issue #153 where K-split strategy was incorrectly reported as producing 93.6% errors when actual relative errors were < 1e-6. Tested with: - 32x896x896 float32: now passes (was 93.8% false errors) - 32x10000x896 float32: now passes (was 93.6% false errors) - 32x32x32 float64: still passes (regression test) --- utils/cosma_utils.hpp | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/utils/cosma_utils.hpp b/utils/cosma_utils.hpp index 8b0d26dd..3eb73a96 100644 --- a/utils/cosma_utils.hpp +++ b/utils/cosma_utils.hpp @@ -333,23 +333,38 @@ bool test_cosma(Strategy s, // Now Check result isOK = globCcheck.size() == globC.size(); for (int i = 0; i < globC.size(); ++i) { - isOK = isOK && (std::abs(globC[i] - globCcheck[i]) < epsilon); + // Use relative error for large values, absolute error for small values + double abs_error = std::abs(globC[i] - globCcheck[i]); + double scale = std::max(std::abs(globC[i]), std::abs(globCcheck[i])); + double rel_error = (scale > 1e-10) ? abs_error / scale : abs_error; + // For float32, relative error tolerance should be ~1e-6 + // For float64, relative error tolerance should be ~1e-12 + double tolerance = (sizeof(Scalar) == 4) ? 1e-5 : epsilon; + isOK = isOK && (rel_error < tolerance); } if (!isOK) { std::cout << "Result is NOT OK" << std::endl; + int error_count = 0; + const int MAX_ERRORS_TO_PRINT = 20; for (int i = 0; i < m * n; i++) { if (globCcheck[i] != globC[i]) { - int x = i % m; - int y = i / m; - int locidx, rank; - std::tie(locidx, rank) = C.local_coordinates(x, y); - std::cout << "global(" << x << ", " << y - << ") = (loc = " << locidx << ", rank = " << rank - << ") = " << globC.at(i) << " and should be " - << globCcheck.at(i) << std::endl; + error_count++; + if (error_count <= MAX_ERRORS_TO_PRINT) { + int x = i % m; + int y = i / m; + int locidx, rank; + std::tie(locidx, rank) = C.local_coordinates(x, y); + std::cout << "global(" << x << ", " << y + << ") = (loc = " << locidx << ", rank = " << rank + << ") = " << globC.at(i) << " and should be " + << globCcheck.at(i) << " (diff = " + << std::abs(globC.at(i) - globCcheck.at(i)) << ")" << std::endl; + } } } + std::cout << "Total errors: " << error_count << " out of " << (m * n) << " elements (" + << (100.0 * error_count / (m * n)) << "%)" << std::endl; } else { std::cout <<"Result is OK"< 0 || isOK; } From af75c77c225ac197c38c5e97d2fc3b9007d25329 Mon Sep 17 00:00:00 2001 From: Simon Pintarelli Date: Mon, 13 Oct 2025 15:15:13 +0200 Subject: [PATCH 2/2] format --- utils/cosma_utils.hpp | 185 +++++++++++++++++++++++++----------------- 1 file changed, 110 insertions(+), 75 deletions(-) diff --git a/utils/cosma_utils.hpp b/utils/cosma_utils.hpp index 3eb73a96..081fa3c2 100644 --- a/utils/cosma_utils.hpp +++ b/utils/cosma_utils.hpp @@ -1,21 +1,21 @@ #include #include +#include +#include +#include #include #include #include +#include #include #include -#include -#include -#include -#include using namespace cosma; template -void fill_matrix(T* ptr, size_t size) { - static std::random_device dev; // seed - static std::mt19937 rng(dev()); // generator +void fill_matrix(T *ptr, size_t size) { + static std::random_device dev; // seed + static std::mt19937 rng(dev()); // generator static std::uniform_real_distribution dist(10.0); // distribution for (unsigned i = 0; i < size; ++i) { @@ -24,9 +24,9 @@ void fill_matrix(T* ptr, size_t size) { } template -void fill_matrix(std::complex* ptr, size_t size) { - static std::random_device dev; // seed - static std::mt19937 rng(dev()); // generator +void fill_matrix(std::complex *ptr, size_t size) { + static std::random_device dev; // seed + static std::mt19937 rng(dev()); // generator static std::uniform_real_distribution dist(10.0); // distribution for (unsigned i = 0; i < size; ++i) { @@ -36,10 +36,10 @@ void fill_matrix(std::complex* ptr, size_t size) { template bool test_cosma(Strategy s, - context& ctx, - MPI_Comm comm = MPI_COMM_WORLD, - double epsilon = 1e-8, - int tag = 0) { + context &ctx, + MPI_Comm comm = MPI_COMM_WORLD, + double epsilon = 1e-8, + int tag = 0) { auto alpha = Scalar{1}; auto beta = Scalar{1}; @@ -103,12 +103,15 @@ bool test_cosma(Strategy s, std::vector As, Bs, Cs; if (rank == 0) { As = std::vector(m * k); - std::memcpy(As.data(), A.matrix_pointer(), A.matrix_size()*sizeof(Scalar)); + std::memcpy( + As.data(), A.matrix_pointer(), A.matrix_size() * sizeof(Scalar)); Bs = std::vector(k * n); - std::memcpy(Bs.data(), B.matrix_pointer(), B.matrix_size()*sizeof(Scalar)); + std::memcpy( + Bs.data(), B.matrix_pointer(), B.matrix_size() * sizeof(Scalar)); // copy C in case beta > 0 Cs = std::vector(m * n); - std::memcpy(Cs.data(), C.matrix_pointer(), C.matrix_size()*sizeof(Scalar)); + std::memcpy( + Cs.data(), C.matrix_pointer(), C.matrix_size() * sizeof(Scalar)); int offsetA = sizeA; int offsetB = sizeB; @@ -124,53 +127,62 @@ bool test_cosma(Strategy s, // Rank 0 receive data int info = MPI_Recv(As.data() + offsetA, - receive_size_A, - mpi_type, - i, - tag*n_comm_rounds, - comm, - &status); + receive_size_A, + mpi_type, + i, + tag * n_comm_rounds, + comm, + &status); if (info != MPI_SUCCESS) { // check if we received the right amount MPI_Get_elements(&status, mpi_type, &amount); if (amount != receive_size_A) { - std::cout << "Error: Did not receive all data for matrix A!" << std::endl; - std::cout << "Received " << amount << ", instead of " << receive_size_A << std::endl; - std::cout << "Message source: " << status.MPI_SOURCE << ", tag = " << status.MPI_TAG << std::endl; + std::cout << "Error: Did not receive all data for matrix A!" + << std::endl; + std::cout << "Received " << amount << ", instead of " + << receive_size_A << std::endl; + std::cout << "Message source: " << status.MPI_SOURCE + << ", tag = " << status.MPI_TAG << std::endl; } } info = MPI_Recv(Bs.data() + offsetB, - receive_size_B, - mpi_type, - i, - tag*n_comm_rounds + 1, - comm, - &status); + receive_size_B, + mpi_type, + i, + tag * n_comm_rounds + 1, + comm, + &status); if (info != MPI_SUCCESS) { // check if we received the right amount MPI_Get_elements(&status, mpi_type, &amount); if (amount != receive_size_B) { - std::cout << "Error: Did not receive all data for matrix B!" << std::endl; - std::cout << "Received " << amount << ", instead of " << receive_size_B << std::endl; - std::cout << "Message source: " << status.MPI_SOURCE << ", tag = " << status.MPI_TAG << std::endl; + std::cout << "Error: Did not receive all data for matrix B!" + << std::endl; + std::cout << "Received " << amount << ", instead of " + << receive_size_B << std::endl; + std::cout << "Message source: " << status.MPI_SOURCE + << ", tag = " << status.MPI_TAG << std::endl; } } info = MPI_Recv(Cs.data() + offsetC, - receive_size_C, - mpi_type, - i, - tag*n_comm_rounds + 2, - comm, - &status); + receive_size_C, + mpi_type, + i, + tag * n_comm_rounds + 2, + comm, + &status); if (info != MPI_SUCCESS) { // check if we received the right amount MPI_Get_elements(&status, mpi_type, &amount); if (amount != receive_size_C) { - std::cout << "Error: Did not receive all data for matrix C!" << std::endl; - std::cout << "Received " << amount << ", instead of " << receive_size_C << std::endl; - std::cout << "Message source: " << status.MPI_SOURCE << ", tag = " << status.MPI_TAG << std::endl; + std::cout << "Error: Did not receive all data for matrix C!" + << std::endl; + std::cout << "Received " << amount << ", instead of " + << receive_size_C << std::endl; + std::cout << "Message source: " << status.MPI_SOURCE + << ", tag = " << status.MPI_TAG << std::endl; } } @@ -181,17 +193,31 @@ bool test_cosma(Strategy s, } // Rank i send data if (rank > 0 && rank < s.P) { - int info = MPI_Ssend(A.matrix_pointer(), sizeA, mpi_type, 0, tag*n_comm_rounds, comm); + int info = MPI_Ssend( + A.matrix_pointer(), sizeA, mpi_type, 0, tag * n_comm_rounds, comm); if (info != MPI_SUCCESS) { - std::cout << "MPI_Send was not successful on rank: " << rank << ", for matrix A" << std::endl; + std::cout << "MPI_Send was not successful on rank: " << rank + << ", for matrix A" << std::endl; } - info = MPI_Ssend(B.matrix_pointer(), sizeB, mpi_type, 0, tag*n_comm_rounds+1, comm); + info = MPI_Ssend(B.matrix_pointer(), + sizeB, + mpi_type, + 0, + tag * n_comm_rounds + 1, + comm); if (info != MPI_SUCCESS) { - std::cout << "MPI_Send was not successful on rank: " << rank << ", for matrix B" << std::endl; + std::cout << "MPI_Send was not successful on rank: " << rank + << ", for matrix B" << std::endl; } - info = MPI_Ssend(C.matrix_pointer(), sizeC, mpi_type, 0, tag*n_comm_rounds+2, comm); + info = MPI_Ssend(C.matrix_pointer(), + sizeC, + mpi_type, + 0, + tag * n_comm_rounds + 2, + comm); if (info != MPI_SUCCESS) { - std::cout << "MPI_Send was not successful on rank: " << rank << ", for matrix C" << std::endl; + std::cout << "MPI_Send was not successful on rank: " << rank + << ", for matrix C" << std::endl; } } @@ -247,15 +273,14 @@ bool test_cosma(Strategy s, offsetC += local_size_C; } // Now compute the result - cosma::local_multiply_cpu( - globA.data(), - globB.data(), - globCcheck.data(), - m, - n, - k, - alpha, - beta); + cosma::local_multiply_cpu(globA.data(), + globB.data(), + globCcheck.data(), + m, + n, + k, + alpha, + beta); #ifdef DEBUG std::cout << "Complete matrix A: " << std::endl; for (int i = 0; i < m; i++) { @@ -289,7 +314,8 @@ bool test_cosma(Strategy s, // Then rank0 asks for other ranks data if (rank == 0) { - std::memcpy(Cs.data(), C.matrix_pointer(), C.matrix_size()*sizeof(Scalar)); + std::memcpy( + Cs.data(), C.matrix_pointer(), C.matrix_size() * sizeof(Scalar)); int offsetC = sizeC; @@ -300,7 +326,7 @@ bool test_cosma(Strategy s, receive_size_C, mpi_type, i, - tag*n_comm_rounds + 4, + tag * n_comm_rounds + 4, comm, MPI_STATUSES_IGNORE); offsetC += receive_size_C; @@ -308,7 +334,12 @@ bool test_cosma(Strategy s, } // Rank i sends data if (rank > 0 && rank < s.P) { - MPI_Ssend(C.matrix_pointer(), sizeC, mpi_type, 0, tag*n_comm_rounds+4, comm); + MPI_Ssend(C.matrix_pointer(), + sizeC, + mpi_type, + 0, + tag * n_comm_rounds + 4, + comm); } // Then rank 0 must reorder data locally @@ -333,9 +364,11 @@ bool test_cosma(Strategy s, // Now Check result isOK = globCcheck.size() == globC.size(); for (int i = 0; i < globC.size(); ++i) { - // Use relative error for large values, absolute error for small values + // Use relative error for large values, absolute error for small + // values double abs_error = std::abs(globC[i] - globCcheck[i]); - double scale = std::max(std::abs(globC[i]), std::abs(globCcheck[i])); + double scale = + std::max(std::abs(globC[i]), std::abs(globCcheck[i])); double rel_error = (scale > 1e-10) ? abs_error / scale : abs_error; // For float32, relative error tolerance should be ~1e-6 // For float64, relative error tolerance should be ~1e-12 @@ -355,19 +388,21 @@ bool test_cosma(Strategy s, int y = i / m; int locidx, rank; std::tie(locidx, rank) = C.local_coordinates(x, y); - std::cout << "global(" << x << ", " << y - << ") = (loc = " << locidx << ", rank = " << rank - << ") = " << globC.at(i) << " and should be " - << globCcheck.at(i) << " (diff = " - << std::abs(globC.at(i) - globCcheck.at(i)) << ")" << std::endl; + std::cout + << "global(" << x << ", " << y + << ") = (loc = " << locidx << ", rank = " << rank + << ") = " << globC.at(i) << " and should be " + << globCcheck.at(i) << " (diff = " + << std::abs(globC.at(i) - globCcheck.at(i)) << ")" + << std::endl; } } } - std::cout << "Total errors: " << error_count << " out of " << (m * n) << " elements (" + std::cout << "Total errors: " << error_count << " out of " + << (m * n) << " elements (" << (100.0 * error_count / (m * n)) << "%)" << std::endl; - } - else { - std::cout <<"Result is OK"< 0 || isOK; }