From 5a9849293c1f1a2dcc4d5d4ba1a8c75777f2be38 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sat, 14 Feb 2026 13:14:45 +0100 Subject: [PATCH 01/36] cmakelists eigen3 --- samples/CMakeLists.txt | 6 +++--- tests/CMakeLists.txt | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 3bbe11f..a7e73d7 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -1,10 +1,10 @@ add_executable(extract extract.cpp) -target_link_libraries(extract ${PNG_LIBRARIES} Threads::Threads) +target_link_libraries(extract ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen) add_executable(train train.cpp) -target_link_libraries(train ${PNG_LIBRARIES} Threads::Threads) +target_link_libraries(train ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen) add_executable(sparsematch sparsematch.cpp) -target_link_libraries(sparsematch ${PNG_LIBRARIES} Threads::Threads) +target_link_libraries(sparsematch ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index acf86db..a211cac 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -12,7 +12,7 @@ FetchContent_MakeAvailable(approvaltests) find_package(GTest REQUIRED) add_executable(test_single_matching test_single_matching.cpp) -target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main) +target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main Eigen3::Eigen) add_test(NAME single_matching COMMAND test_single_matching) From a4436e7660dade09c94c50e5cc2bdb5bb2213d18 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sat, 14 Feb 2026 14:09:18 +0100 Subject: [PATCH 02/36] decouple forest.hpp --- CMakeLists.txt | 10 + lib/gpc/buffer.hpp | 4 +- lib/gpc/filter.hpp | 24 +- lib/gpc/forest.cpp | 374 ++++++++++++++++++++++++ lib/gpc/forest.hpp | 283 ++++++++++++++++++ lib/gpc/inference.hpp | 505 
--------------------------------- samples/CMakeLists.txt | 6 +- samples/sparsematch.cpp | 10 +- tests/CMakeLists.txt | 8 +- tests/test_single_matching.cpp | 8 +- 10 files changed, 700 insertions(+), 532 deletions(-) create mode 100644 lib/gpc/forest.cpp create mode 100644 lib/gpc/forest.hpp delete mode 100644 lib/gpc/inference.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 55e3851..535b559 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,16 @@ if(SSE) endif() endif() +add_library(gpc_core + lib/gpc/forest.cpp +) +target_link_libraries(gpc_core + PUBLIC + Eigen3::Eigen + ${PNG_LIBRARIES} + Threads::Threads +) +target_include_directories(gpc_core PUBLIC lib) enable_testing() add_subdirectory(samples) add_subdirectory(tests) diff --git a/lib/gpc/buffer.hpp b/lib/gpc/buffer.hpp index dd6dce6..453ecaa 100644 --- a/lib/gpc/buffer.hpp +++ b/lib/gpc/buffer.hpp @@ -896,7 +896,7 @@ class RGBBuffer : public Buffer { free(rowPointers); } }; -Buffer getDisparityVisualization( +inline Buffer getDisparityVisualization( ndb::Buffer& srcImg, std::vector& validEstimateIndices, ndb::Buffer& disparity) { @@ -969,7 +969,7 @@ Buffer getDisparityVisualization( } return dispVis; } -Buffer getDisparityVisualization(ndb::Buffer& srcImg, +inline Buffer getDisparityVisualization(ndb::Buffer& srcImg, std::vector& support) { float min_disparity = 0; float max_disparity = 128; diff --git a/lib/gpc/filter.hpp b/lib/gpc/filter.hpp index 49384f0..2caecc2 100644 --- a/lib/gpc/filter.hpp +++ b/lib/gpc/filter.hpp @@ -58,7 +58,7 @@ namespace ndb { * @param ind output array (indices into n of nonzero elements) * @param m number of elements in output */ -__attribute__((noinline)) void arr2ind(const unsigned char* a, +inline void arr2ind(const unsigned char* a, int n, int* ind, int* m) { @@ -132,7 +132,7 @@ void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) { * @param end end of the range * @param nThreads number of threads to use */ -void parFor(std::function const& f, 
+inline void parFor(std::function const& f, int start, int end, int nThreads) { @@ -165,7 +165,7 @@ void parFor(std::function const& f, * @param[in] numThreads number of threads to use * @param threshold threshold to binarize sobel filter output */ -void sobelNaive( +inline void sobelNaive( uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) { assert(width % 16 == 0 && "width must be multiple of 16!"); int thresholdSq = threshold * threshold; @@ -217,7 +217,7 @@ void sobelNaive( * @param[in] height The height * @param[in] numThreads number of threads to use */ -void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { +inline void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { assert(width % 16 == 0 && "width must be multiple of 16!"); // allocate space for result uint8_t* ptr = in; @@ -269,7 +269,7 @@ void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { * @param width The width of the image at pointer *in * @param height The height of the image at pointer *in */ -void gpcFilterNaive(uint8_t* in, +inline void gpcFilterNaive(uint8_t* in, const uint8_t* grad, uint32_t* gpc, std::vector fastmask, @@ -303,7 +303,7 @@ void gpcFilterNaive(uint8_t* in, * @param width The width of the image at pointer *in * @param height The height of the image at pointer *in */ -void gpcFilterTauNaive(uint8_t* in, +inline void gpcFilterTauNaive(uint8_t* in, const uint8_t* grad, uint32_t* gpc, std::vector fastmask, @@ -336,7 +336,7 @@ void gpcFilterTauNaive(uint8_t* in, * @param[in] height The height * @param[in] numThreads number of threads to use */ -void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { +inline void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { assert(width % 16 == 0 && "width must be multiple of 16!"); #ifndef _INTRINSICS_SSE boxNaive(in, blurred, width, height); @@ -464,7 +464,7 @@ void box(uint8_t* in, uint8_t* blurred, int width, int height, 
int numThreads) { * @param[in] numThreads number of threads to use */ -void sobel(uint8_t* in, +inline void sobel(uint8_t* in, uint8_t* blurred, int width, int height, @@ -645,7 +645,7 @@ inline bool isAllZeros(__m128i xmm) { * @param height The height of the image at pointer *in * @param numThreadsNumber of threads to use */ -void gpcFilter(uint8_t* in, +inline void gpcFilter(uint8_t* in, const uint8_t* grad, uint32_t* gpc, std::vector fastmask, @@ -731,7 +731,7 @@ void gpcFilter(uint8_t* in, * @param height The height of the image at pointer *in * @param numThreads Number of threads to use */ -void gpcFilterTau(uint8_t* in, +inline void gpcFilterTau(uint8_t* in, const uint8_t* grad, uint32_t* gpc, std::vector fastmask, @@ -816,7 +816,7 @@ void gpcFilterTau(uint8_t* in, * @param width Width of the image at *in pointer * @param height Heiht of the image at *in pointer */ -void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) { +inline void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) { uint32_t val; uint32_t* dst; for (int y = 2; y < height - 3; y++) { @@ -850,7 +850,7 @@ void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) { * @param width * @param height */ -void census5x5(uint8_t* in, uint32_t* census, int width, int height) { +inline void census5x5(uint8_t* in, uint32_t* census, int width, int height) { assert(width % 16 == 0 && "width must be multiple of 16!"); #ifndef _INTRINSICS_SSE census5x5Naive(in, census, width, height); diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp new file mode 100644 index 0000000..0809951 --- /dev/null +++ b/lib/gpc/forest.cpp @@ -0,0 +1,374 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. 
Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// +// Implements and extends the method proposed in +// The Global Patch Collider +// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet +// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch) +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// GPC includes +#include "gpc/Feature.hpp" +#include "gpc/SintelOpticalFlow.hpp" +#include "gpc/SintelStereo.hpp" +#include "gpc/buffer.hpp" +#include "gpc/filter.hpp" +#include "gpc/hashmatch.hpp" +#include "gpc/forest.hpp" + + +namespace gpc { +namespace inference { + + /** + * @brief Computes sparse matches on a pair of rectified and smoothed + * images. Here the src and tar images refer to the left and right images, + * respectively. + * + * @param src Preprocessed source(left) image + * @param tar Preprocessed target(right) image + * @param fastmask forest mask of relative integer offsets. + * + * @return + */ +std::vector Forest::depthPriorFast( + PreprocessedImage& src, + PreprocessedImage& tar, + FilterMask& fastmask, + InferenceSettings& settings) { + std::vector statesSrc = evalFastMaskOnSubsetSSE( + src.smooth, src.grad, src.mask, fastmask, settings); + std::vector statesTar = evalFastMaskOnSubsetSSE( + tar.smooth, tar.grad, tar.mask, fastmask, settings); + // Epipolar mode. 
Use upper 32bit of 64bit descriptor to store y + // coordinate + if (settings.epipolarMode_) { + for (auto& el : statesSrc) el.state |= uint64_t(el.point.y) << 32; + for (auto& el : statesTar) el.state |= uint64_t(el.point.y) << 32; + } + // Use sort method for matching + if (settings.useHashtable_ == false) { + std::vector corr = + findCorrespondences(statesSrc, statesTar); + return corr; + } + // Use hashtable matching + else { + for (auto& q : statesSrc) q.srcDescr = true; + for (auto& q : statesTar) q.srcDescr = false; + + ndb::Hashmatch hm( + 214673, // statesSrc.size() + statesTar.size() , + statesSrc.size() + statesTar.size()); + std::vector> corr; + for (auto& q : statesSrc) hm.insert(q); + for (auto& q : statesTar) hm.insert(q); + hm.getDuplicates(corr); + // Store vertices in a format that is more convenient for us: + std::vector corr2; + for (auto& e : corr) { + corr2.push_back( + ndb::Correspondence(e.first.point, e.second.point)); + } + + return corr2; + } +} +std::vector Forest::findCorrespondences( + std::vector& srcStates, + std::vector& tarStates) { + int numStates = std::min(srcStates.size(), tarStates.size()); + // Limit search to rectified epipolar case. 
+ std::sort(srcStates.begin(), srcStates.end()); + + std::sort(tarStates.begin(), tarStates.end()); + std::vector corr; + uint32_t j = 0; + for (uint32_t i = 0; i < srcStates.size(); ++i) { + bool unique = true; + while (i + 1 < srcStates.size() && srcStates[i] == srcStates[i + 1]) + ++i, unique = false; + + if (unique) { + // emulates std::lowerbound behavior for arrays + for (; j < tarStates.size() - 1; ++j) { + if (!(tarStates[j] < srcStates[i])) break; + } + + if (j != tarStates.size() - 1 && tarStates[j] == srcStates[i] && + ((j + 1) == tarStates.size() - 1 || + !(tarStates[j] == tarStates[j + 1]))) + corr.push_back(ndb::Correspondence(srcStates[i].point, + tarStates[j].point)); + } + } + return corr; +} + +/** + * @brief Evaluates a given forest mask on an image and returns the + * descriptors + * + * @param img The image + * @param grad gradient image + * @param idx offsets with high gradient pixels within the grad image + * @param fastmask the forest mask + * + * @return + */ +std::vector Forest::evalFastMaskOnSubsetSSE( + ndb::Buffer& img, + ndb::Buffer& grad, + std::vector& idx, + FilterMask& fastmask, + InferenceSettings& settings) { + std::chrono::high_resolution_clock::time_point t0, t1; + + // output buffer of same size + ndb::Buffer gpcstates(img.rows(), img.cols(), 0); + if (fastmask.type == 0) { + ndb::gpcFilter(img.data(), + grad.data(), + gpcstates.data(), + fastmask.mask, + idx, + img.cols(), + img.rows(), + settings.numThreads_); + } else { + ndb::gpcFilterTau(img.data(), + grad.data(), + gpcstates.data(), + fastmask.mask, + fastmask.tau, + idx, + img.cols(), + img.rows(), + settings.numThreads_); + } + std::vector out(idx.size()); + int j = 0; + + for (auto k : idx) { + int x = k % img.cols(); + int y = k / img.cols(); + out[j] = ndb::Descriptor(ndb::Point(x, y), gpcstates.data()[k]); + j++; + } + return out; +} + +/** + * @brief Preprocesses an image. 
(smooth, binary sobel image and gradient + * pixel indices) + * + * @param img The raw input image to be preprocessed + * @param InferenceSettings inference settings struct + * + * @return the preprocessed image + */ +PreprocessedImage Forest::preprocessImage(ndb::Buffer& img, + InferenceSettings settings) { + assert((settings.gradientThreshold_ >= 0 && + settings.gradientThreshold_ <= 255) && + "gradientThreshold needs to be within 0...255"); + + ndb::Buffer smooth(img.rows(), img.cols()); + smooth.width = img.width; + ndb::box(img.data(), + smooth.data(), + img.cols(), + img.rows(), + settings.numThreads_); + smooth.clearBoundary(); + ndb::Buffer grad(img.rows(), img.cols()); + grad.width = img.width; + ndb::Buffer maskTmp; + ndb::sobel(img.data(), + grad.data(), + img.cols(), + img.rows(), + settings.gradientThreshold_, + settings.numThreads_); + + ndb::Buffer idx; + idx.resize(grad.rows(), grad.cols()); + auto ff = [&](ndb::Buffer& in, std::vector& out, int m) { + for (int i = 0; i < m; i++) { + int x = in.data()[i] % grad.cols(); + int y = in.data()[i] / grad.cols(); + if (y >= 13 && y < grad.rows() - 13 && x >= 13 && + x < grad.cols() - 13) + out.push_back(in.data()[i]); + } + }; + int m; + // mask indexing gradient pixels + std::vector mask; + ndb::arr2ind(grad.data(), grad.cols() * grad.rows(), idx.data(), &m); + ff(idx, mask, m); + // Our outputs are: smooth, grad, mask; + return PreprocessedImage(smooth, grad, mask); +} +/** + * @brief Finds matches between two stereo images based on a given forest + * mask. + * + * @param simg source image (assumed to be the left image) + * @param timg target image (assumed to be the right image) + * @param forestmask forest mask, provided by readForest method + * @param InferenceSettings inference settings struct + * @return Set of correspondences (ptSrc, ptTar) where + * ptSrc and ptTar are points in the source and target images, respectively. 
+ */ +std::vector Forest::stereoMatch(PreprocessedImage& simg, + PreprocessedImage& timg, + FilterMask& forestmask, + InferenceSettings settings) { + // make sure the delivered mask matches the image dimensions + assert( + (forestmask.width == simg.smooth.cols() && + forestmask.height == simg.smooth.rows()) && + "Source Image: dimension does not fit dimension of supplied forest " + "mask"); + assert( + (forestmask.width == timg.smooth.cols() && + forestmask.height == simg.smooth.rows()) && + "Targe Image: dimension does not fit dimension of supplied forest " + "mask"); + bool m_debug = false; + std::chrono::high_resolution_clock::time_point t0, t1; + // Match + std::vector corr = + depthPriorFast(simg, timg, forestmask, settings); + t1 = sysTick(); + + return corr; +} + +/** + * @brief Returns support (set of x,y coordinates and + * disparity) of a pair of images that have been rectified. + * + * @@param simg source image (assumed to be the left image) + * @param timg target image (assumed to be the right image) + * @param forestmask forest mask, provided by readForest method + * @param InferenceSettings inference settings struct + * In practice, values between 5...20 produce good + * results. + * + * @return Set of supports (x,y,d) with x,y the coordinate + * of a point in the left image and d the disparity. 
+ */ +std::vector Forest::rectifiedMatch(PreprocessedImage& simg, + PreprocessedImage& timg, + FilterMask& forestmask, + InferenceSettings settings) { + // Do matching + std::vector corr = + stereoMatch(simg, timg, forestmask, settings); + // Filter epipolar matches + std::vector supp; + for (auto& e : corr) { + // epipolar constraint + if (std::abs(e.srcPt.y - e.tarPt.y) <= settings.verticalTolerance_ + // disparity filter + && std::abs(e.srcPt.x - e.tarPt.x) <= settings.dispHigh_) + supp.push_back( + ndb::Support(e.srcPt.x, e.srcPt.y, e.srcPt.x - e.tarPt.x)); + } + return supp; +} + +/** + * @brief Reads text-based forest format and returns a mask for a given + * image size. + * + * @param path Path to the file that contains the forest. + * @param width 16-Byte aligned width of the image in pixels + * @param height height of the image in pixels + * + * @return + */ +FilterMask Forest::readForest(std::string path, int width, int height) { + std::ifstream ff(path); + + std::vector fastmask; + std::vector taus; + if (ff.fail()) { + cout << "Error opening forest file" << endl; + return FilterMask(fastmask, width, height, 0); + } + int numNonZeroTau = 0; + int numFerns; + int type; + ff >> numFerns; + cout << "number of ferns:" << numFerns << endl; + for (int i = 0; i < numFerns; i++) { + int fernID, numTests; + std::string fernScale; + ff >> fernID >> fernScale >> numTests; + for (int j = 0; j < numTests; j++) { + int levelID, ix, iy, jx, jy, tau; + ff >> levelID >> ix >> iy >> jx >> jy >> tau; + // Limit mask size to 32 binary tests + if (fastmask.size() < 64 && taus.size() < 32) { + fastmask.push_back(ix + iy * width); + fastmask.push_back(jx + jy * width); + taus.push_back(tau); + } else { + cout << "Note: A maximum of 32 fern features are allowed, " + "discarding " + "remainder of forest." 
+ << endl; + } + if (tau != 0) numNonZeroTau++; + } + } + if (numNonZeroTau == 0) { + type = 0; // We have a zero forest (all tau=0) + FilterMask fm(fastmask, width, height, type); + return fm; + } else { + type = 1; // We have a tau forest (some tau!=0) + FilterMask fm(fastmask, taus, width, height, type); + return fm; + } +} + +} // namespace inference +} diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp new file mode 100644 index 0000000..87939c1 --- /dev/null +++ b/lib/gpc/forest.hpp @@ -0,0 +1,283 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Implements and extends the method proposed in +// The Global Patch Collider +// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet +// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch) +#ifndef _GPC_inference +#define _GPC_inference +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// GPC includes +#include "gpc/Feature.hpp" +#include "gpc/SintelOpticalFlow.hpp" +#include "gpc/SintelStereo.hpp" +#include "gpc/buffer.hpp" +#include "gpc/filter.hpp" +#include "gpc/hashmatch.hpp" + +/** + * @brief The inference class of the GPC forest + * + */ +namespace gpc { +namespace inference { +typedef typename std::chrono::high_resolution_clock::time_point time_point; +inline std::chrono::high_resolution_clock::time_point sysTick() { + return std::chrono::high_resolution_clock::now(); +} +inline float tickToMs(std::chrono::high_resolution_clock::time_point t0, + std::chrono::high_resolution_clock::time_point t1) { + return std::abs( + 1000. * + std::chrono::duration_cast>(t1 - t0) + .count()); +} +struct InferenceSettings { + // Threshold to be used for edge detection. Can be 0...255. + // In practice, values between 5...20 produce good results. uint8_t + // gradientThreshold; + uint8_t gradientThreshold_ = 10; + // upper absolute limit for disparity in pixels. 
The lower (implied) limit + // is + // 0 + int dispHigh_ = 128; + // vertical deviation tolerance in pixels for corresponding features in + // rectified stereo images. + int verticalTolerance_ = 1; + // Whether to use epipolar mode on matching or not. + bool epipolarMode_ = false; + // Use hashtable to match extracted descriptors. Usually only faster with a + // large number of descriptors (> 100k) or when using multiple threads. Note + // that the hashtable method does not return a slightly reduced amount of + // matches as a result of the hash table implementation (small bucket size) + // if false, the descriptors are sorted and matched by iterating + // alternatingly through both sets. + bool useHashtable_ = false; + + // Number of threads to use for inference + int numThreads_ = 1; + + // Default contructor defaults to using a single thread + InferenceSettings(uint8_t gradientThreshold, + int dispHigh, + int verticalTolerance, + bool epipolarMode, + bool useHashtable, + int numThreads) + : gradientThreshold_(gradientThreshold), + dispHigh_(dispHigh), + verticalTolerance_(verticalTolerance), + epipolarMode_(epipolarMode), + useHashtable_(useHashtable), + numThreads_(numThreads) {} + + InferenceSettings() {} + InferenceSettings& builder(void) { return *this; } + InferenceSettings& gradientThreshold(uint8_t gradientThreshold) { + this->gradientThreshold_ = gradientThreshold; + return *this; + } + InferenceSettings& dispHigh(int dispHigh) { + this->dispHigh_ = dispHigh; + return *this; + } + InferenceSettings& verticalTolerance(int verticalTolerance) { + this->verticalTolerance_ = verticalTolerance; + return *this; + } + InferenceSettings& epipolarMode(bool epipolarMode) { + this->epipolarMode_ = epipolarMode; + return *this; + } + InferenceSettings& useHashtable(bool useHashtable) { + this->useHashtable_ = useHashtable; + return *this; + } + InferenceSettings& numThreads(int numThreads) { + if (numThreads > std::thread::hardware_concurrency()) + this->numThreads_ 
= std::thread::hardware_concurrency(); + else + this->numThreads_ = numThreads; + return *this; + } +}; +/** + * @brief FilterMask object that is returned by the forest reader + */ +struct FilterMask { + std::vector mask; + std::vector tau; + int width; + int height; + int type; + FilterMask(std::vector mask, int width, int height, int type) { + this->mask = mask; + this->width = width; + this->height = height; + this->type = type; + } + FilterMask(std::vector mask, + std::vector tau, + int width, + int height, + int type) { + this->mask = mask; + this->tau = tau; + this->width = width; + this->height = height; + this->type = type; + } +}; +struct PreprocessedImage { + ndb::Buffer smooth; + ndb::Buffer grad; + std::vector mask; + PreprocessedImage(ndb::Buffer& smooth, + ndb::Buffer& grad, + std::vector& mask) + : smooth(smooth), grad(grad), mask(mask) {}; +}; + +enum CorrMethod { sorting = 's', hashtable = 'h' }; +struct MatchStats { + double prec, rec, timeProp, timeMatch; + int numInlier, numStates, numMatches; +}; + + +class Forest { + public: + /** + * @brief Computes sparse matches on a pair of rectified and smoothed + * images. Here the src and tar images refer to the left and right images, + * respectively. + * + * @param src Preprocessed source(left) image + * @param tar Preprocessed target(right) image + * @param fastmask forest mask of relative integer offsets. 
+ * + * @return + */ + std::vector depthPriorFast( + PreprocessedImage& src, + PreprocessedImage& tar, + FilterMask& fastmask, + InferenceSettings& settings); + std::vector findCorrespondences( + std::vector& srcStates, + std::vector& tarStates); + /** + * @brief Evaluates a given forest mask on an image and returns the + * descriptors + * + * @param img The image + * @param grad gradient image + * @param idx offsets with high gradient pixels within the grad image + * @param fastmask the forest mask + * + * @return + */ + std::vector evalFastMaskOnSubsetSSE( + ndb::Buffer& img, + ndb::Buffer& grad, + std::vector& idx, + FilterMask& fastmask, + InferenceSettings& settings); + + /** + * @brief Preprocesses an image. (smooth, binary sobel image and gradient + * pixel indices) + * + * @param img The raw input image to be preprocessed + * @param InferenceSettings inference settings struct + * + * @return the preprocessed image + */ + PreprocessedImage preprocessImage(ndb::Buffer& img, + InferenceSettings settings); + /** + * @brief Finds matches between two stereo images based on a given forest + * mask. + * + * @param simg source image (assumed to be the left image) + * @param timg target image (assumed to be the right image) + * @param forestmask forest mask, provided by readForest method + * @param InferenceSettings inference settings struct + * @return Set of correspondences (ptSrc, ptTar) where + * ptSrc and ptTar are points in the source and target images, respectively. + */ + std::vector stereoMatch(PreprocessedImage& simg, + PreprocessedImage& timg, + FilterMask& forestmask, + InferenceSettings settings); + /** + * @brief Returns support (set of x,y coordinates and + * disparity) of a pair of images that have been rectified. 
+ * + * @@param simg source image (assumed to be the left image) + * @param timg target image (assumed to be the right image) + * @param forestmask forest mask, provided by readForest method + * @param InferenceSettings inference settings struct + * In practice, values between 5...20 produce good + * results. + * + * @return Set of supports (x,y,d) with x,y the coordinate + * of a point in the left image and d the disparity. + */ + std::vector rectifiedMatch(PreprocessedImage& simg, + PreprocessedImage& timg, + FilterMask& forestmask, + InferenceSettings settings); + + /** + * @brief Reads text-based forest format and returns a mask for a given + * image size. + * + * @param path Path to the file that contains the forest. + * @param width 16-Byte aligned width of the image in pixels + * @param height height of the image in pixels + * + * @return + */ + FilterMask readForest(std::string path, int width, int height); +}; // forest class +} // namespace inference +} // namespace gpc + +#endif diff --git a/lib/gpc/inference.hpp b/lib/gpc/inference.hpp deleted file mode 100644 index e1a887a..0000000 --- a/lib/gpc/inference.hpp +++ /dev/null @@ -1,505 +0,0 @@ -// Copyright (c) 2018, ETH Zurich -// All rights reserved. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are met: -// -// 1. Redistributions of source code must retain the above copyright notice, -// this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright notice, -// this list of conditions and the following disclaimer in the documentation -// and/or other materials provided with the distribution. -// -// 3. Neither the name of the copyright holder nor the names of its contributors -// may be used to endorse or promote products derived from this software without -// specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE -// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR -// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF -// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS -// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN -// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE -// POSSIBILITY OF SUCH DAMAGE. -// -// Implements and extends the method proposed in -// The Global Patch Collider -// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet -// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch) -#ifndef _GPC_inference -#define _GPC_inference -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// GPC includes -#include "gpc/Feature.hpp" -#include "gpc/SintelOpticalFlow.hpp" -#include "gpc/SintelStereo.hpp" -#include "gpc/buffer.hpp" -#include "gpc/filter.hpp" -#include "gpc/hashmatch.hpp" - -/** - * @brief The inference class of the GPC forest - * - */ -namespace gpc { -namespace inference { -typedef typename std::chrono::high_resolution_clock::time_point time_point; -std::chrono::high_resolution_clock::time_point sysTick() { - return std::chrono::high_resolution_clock::now(); -} -float tickToMs(std::chrono::high_resolution_clock::time_point t0, - std::chrono::high_resolution_clock::time_point t1) { - return std::abs( - 1000. * - std::chrono::duration_cast>(t1 - t0) - .count()); -} -struct InferenceSettings { - // Threshold to be used for edge detection. Can be 0...255. 
- // In practice, values between 5...20 produce good results. uint8_t - // gradientThreshold; - uint8_t gradientThreshold_ = 10; - // upper absolute limit for disparity in pixels. The lower (implied) limit - // is - // 0 - int dispHigh_ = 128; - // vertical deviation tolerance in pixels for corresponding features in - // rectified stereo images. - int verticalTolerance_ = 1; - // Whether to use epipolar mode on matching or not. - bool epipolarMode_ = false; - // Use hashtable to match extracted descriptors. Usually only faster with a - // large number of descriptors (> 100k) or when using multiple threads. Note - // that the hashtable method does not return a slightly reduced amount of - // matches as a result of the hash table implementation (small bucket size) - // if false, the descriptors are sorted and matched by iterating - // alternatingly through both sets. - bool useHashtable_ = false; - - // Number of threads to use for inference - int numThreads_ = 1; - - // Default contructor defaults to using a single thread - InferenceSettings(uint8_t gradientThreshold, - int dispHigh, - int verticalTolerance, - bool epipolarMode, - bool useHashtable, - int numThreads) - : gradientThreshold_(gradientThreshold), - dispHigh_(dispHigh), - verticalTolerance_(verticalTolerance), - epipolarMode_(epipolarMode), - useHashtable_(useHashtable), - numThreads_(numThreads) {} - - InferenceSettings() {} - InferenceSettings& builder(void) { return *this; } - InferenceSettings& gradientThreshold(uint8_t gradientThreshold) { - this->gradientThreshold_ = gradientThreshold; - return *this; - } - InferenceSettings& dispHigh(int dispHigh) { - this->dispHigh_ = dispHigh; - return *this; - } - InferenceSettings& verticalTolerance(int verticalTolerance) { - this->verticalTolerance_ = verticalTolerance; - return *this; - } - InferenceSettings& epipolarMode(bool epipolarMode) { - this->epipolarMode_ = epipolarMode; - return *this; - } - InferenceSettings& useHashtable(bool useHashtable) { - 
this->useHashtable_ = useHashtable; - return *this; - } - InferenceSettings& numThreads(int numThreads) { - if (numThreads > std::thread::hardware_concurrency()) - this->numThreads_ = std::thread::hardware_concurrency(); - else - this->numThreads_ = numThreads; - return *this; - } -}; -class Forest { - public: - /** - * @brief FilterMask object that is returned by the forest reader - */ - struct FilterMask { - std::vector mask; - std::vector tau; - int width; - int height; - int type; - FilterMask(std::vector mask, int width, int height, int type) { - this->mask = mask; - this->width = width; - this->height = height; - this->type = type; - } - FilterMask(std::vector mask, - std::vector tau, - int width, - int height, - int type) { - this->mask = mask; - this->tau = tau; - this->width = width; - this->height = height; - this->type = type; - } - }; - struct PreprocessedImage { - ndb::Buffer smooth; - ndb::Buffer grad; - std::vector mask; - PreprocessedImage(ndb::Buffer& smooth, - ndb::Buffer& grad, - std::vector& mask) - : smooth(smooth), grad(grad), mask(mask) {}; - }; - - enum CorrMethod { sorting = 's', hashtable = 'h' }; - struct MatchStats { - double prec, rec, timeProp, timeMatch; - int numInlier, numStates, numMatches; - }; - - /** - * @brief Computes sparse matches on a pair of rectified and smoothed - * images. Here the src and tar images refer to the left and right images, - * respectively. - * - * @param src Preprocessed source(left) image - * @param tar Preprocessed target(right) image - * @param fastmask forest mask of relative integer offsets. - * - * @return - */ - std::vector depthPriorFast( - PreprocessedImage& src, - PreprocessedImage& tar, - FilterMask& fastmask, - InferenceSettings& settings) { - std::vector statesSrc = evalFastMaskOnSubsetSSE( - src.smooth, src.grad, src.mask, fastmask, settings); - std::vector statesTar = evalFastMaskOnSubsetSSE( - tar.smooth, tar.grad, tar.mask, fastmask, settings); - // Epipolar mode. 
Use upper 32bit of 64bit descriptor to store y - // coordinate - if (settings.epipolarMode_) { - for (auto& el : statesSrc) el.state |= uint64_t(el.point.y) << 32; - for (auto& el : statesTar) el.state |= uint64_t(el.point.y) << 32; - } - // Use sort method for matching - if (settings.useHashtable_ == false) { - std::vector corr = - findCorrespondences(statesSrc, statesTar); - return corr; - } - // Use hashtable matching - else { - for (auto& q : statesSrc) q.srcDescr = true; - for (auto& q : statesTar) q.srcDescr = false; - - ndb::Hashmatch hm( - 214673, // statesSrc.size() + statesTar.size() , - statesSrc.size() + statesTar.size()); - std::vector> corr; - for (auto& q : statesSrc) hm.insert(q); - for (auto& q : statesTar) hm.insert(q); - hm.getDuplicates(corr); - // Store vertices in a format that is more convenient for us: - std::vector corr2; - for (auto& e : corr) { - corr2.push_back( - ndb::Correspondence(e.first.point, e.second.point)); - } - - return corr2; - } - } - std::vector findCorrespondences( - std::vector& srcStates, - std::vector& tarStates) { - int numStates = std::min(srcStates.size(), tarStates.size()); - // Limit search to rectified epipolar case. 
- std::sort(srcStates.begin(), srcStates.end()); - - std::sort(tarStates.begin(), tarStates.end()); - std::vector corr; - uint32_t j = 0; - for (uint32_t i = 0; i < srcStates.size(); ++i) { - bool unique = true; - while (i + 1 < srcStates.size() && srcStates[i] == srcStates[i + 1]) - ++i, unique = false; - - if (unique) { - // emulates std::lowerbound behavior for arrays - for (; j < tarStates.size() - 1; ++j) { - if (!(tarStates[j] < srcStates[i])) break; - } - - if (j != tarStates.size() - 1 && tarStates[j] == srcStates[i] && - ((j + 1) == tarStates.size() - 1 || - !(tarStates[j] == tarStates[j + 1]))) - corr.push_back(ndb::Correspondence(srcStates[i].point, - tarStates[j].point)); - } - } - return corr; - } - - /** - * @brief Evaluates a given forest mask on an image and returns the - * descriptors - * - * @param img The image - * @param grad gradient image - * @param idx offsets with high gradient pixels within the grad image - * @param fastmask the forest mask - * - * @return - */ - std::vector evalFastMaskOnSubsetSSE( - ndb::Buffer& img, - ndb::Buffer& grad, - std::vector& idx, - FilterMask& fastmask, - InferenceSettings& settings) { - std::chrono::high_resolution_clock::time_point t0, t1; - - // output buffer of same size - ndb::Buffer gpcstates(img.rows(), img.cols(), 0); - if (fastmask.type == 0) { - ndb::gpcFilter(img.data(), - grad.data(), - gpcstates.data(), - fastmask.mask, - idx, - img.cols(), - img.rows(), - settings.numThreads_); - } else { - ndb::gpcFilterTau(img.data(), - grad.data(), - gpcstates.data(), - fastmask.mask, - fastmask.tau, - idx, - img.cols(), - img.rows(), - settings.numThreads_); - } - std::vector out(idx.size()); - int j = 0; - - for (auto k : idx) { - int x = k % img.cols(); - int y = k / img.cols(); - out[j] = ndb::Descriptor(ndb::Point(x, y), gpcstates.data()[k]); - j++; - } - return out; - } - - /** - * @brief Preprocesses an image. 
(smooth, binary sobel image and gradient - * pixel indices) - * - * @param img The raw input image to be preprocessed - * @param InferenceSettings inference settings struct - * - * @return the preprocessed image - */ - PreprocessedImage preprocessImage(ndb::Buffer& img, - InferenceSettings settings) { - assert((settings.gradientThreshold_ >= 0 && - settings.gradientThreshold_ <= 255) && - "gradientThreshold needs to be within 0...255"); - - ndb::Buffer smooth(img.rows(), img.cols()); - smooth.width = img.width; - ndb::box(img.data(), - smooth.data(), - img.cols(), - img.rows(), - settings.numThreads_); - smooth.clearBoundary(); - ndb::Buffer grad(img.rows(), img.cols()); - grad.width = img.width; - ndb::Buffer maskTmp; - ndb::sobel(img.data(), - grad.data(), - img.cols(), - img.rows(), - settings.gradientThreshold_, - settings.numThreads_); - - ndb::Buffer idx; - idx.resize(grad.rows(), grad.cols()); - auto ff = [&](ndb::Buffer& in, std::vector& out, int m) { - for (int i = 0; i < m; i++) { - int x = in.data()[i] % grad.cols(); - int y = in.data()[i] / grad.cols(); - if (y >= 13 && y < grad.rows() - 13 && x >= 13 && - x < grad.cols() - 13) - out.push_back(in.data()[i]); - } - }; - int m; - // mask indexing gradient pixels - std::vector mask; - ndb::arr2ind(grad.data(), grad.cols() * grad.rows(), idx.data(), &m); - ff(idx, mask, m); - // Our outputs are: smooth, grad, mask; - return PreprocessedImage(smooth, grad, mask); - } - /** - * @brief Finds matches between two stereo images based on a given forest - * mask. - * - * @param simg source image (assumed to be the left image) - * @param timg target image (assumed to be the right image) - * @param forestmask forest mask, provided by readForest method - * @param InferenceSettings inference settings struct - * @return Set of correspondences (ptSrc, ptTar) where - * ptSrc and ptTar are points in the source and target images, respectively. 
- */ - std::vector stereoMatch(PreprocessedImage& simg, - PreprocessedImage& timg, - FilterMask& forestmask, - InferenceSettings settings) { - // make sure the delivered mask matches the image dimensions - assert( - (forestmask.width == simg.smooth.cols() && - forestmask.height == simg.smooth.rows()) && - "Source Image: dimension does not fit dimension of supplied forest " - "mask"); - assert( - (forestmask.width == timg.smooth.cols() && - forestmask.height == simg.smooth.rows()) && - "Targe Image: dimension does not fit dimension of supplied forest " - "mask"); - bool m_debug = false; - std::chrono::high_resolution_clock::time_point t0, t1; - // Match - std::vector corr = - depthPriorFast(simg, timg, forestmask, settings); - t1 = sysTick(); - - return corr; - } - - /** - * @brief Returns support (set of x,y coordinates and - * disparity) of a pair of images that have been rectified. - * - * @@param simg source image (assumed to be the left image) - * @param timg target image (assumed to be the right image) - * @param forestmask forest mask, provided by readForest method - * @param InferenceSettings inference settings struct - * In practice, values between 5...20 produce good - * results. - * - * @return Set of supports (x,y,d) with x,y the coordinate - * of a point in the left image and d the disparity. 
- */ - std::vector rectifiedMatch(PreprocessedImage& simg, - PreprocessedImage& timg, - FilterMask& forestmask, - InferenceSettings settings) { - // Do matching - std::vector corr = - stereoMatch(simg, timg, forestmask, settings); - // Filter epipolar matches - std::vector supp; - for (auto& e : corr) { - // epipolar constraint - if (std::abs(e.srcPt.y - e.tarPt.y) <= settings.verticalTolerance_ - // disparity filter - && std::abs(e.srcPt.x - e.tarPt.x) <= settings.dispHigh_) - supp.push_back( - ndb::Support(e.srcPt.x, e.srcPt.y, e.srcPt.x - e.tarPt.x)); - } - return supp; - } - - /** - * @brief Reads text-based forest format and returns a mask for a given - * image size. - * - * @param path Path to the file that contains the forest. - * @param width 16-Byte aligned width of the image in pixels - * @param height height of the image in pixels - * - * @return - */ - FilterMask readForest(std::string path, int width, int height) { - std::ifstream ff(path); - - std::vector fastmask; - std::vector taus; - if (ff.fail()) { - cout << "Error opening forest file" << endl; - return FilterMask(fastmask, width, height, 0); - } - int numNonZeroTau = 0; - int numFerns; - int type; - ff >> numFerns; - cout << "number of ferns:" << numFerns << endl; - for (int i = 0; i < numFerns; i++) { - int fernID, numTests; - std::string fernScale; - ff >> fernID >> fernScale >> numTests; - for (int j = 0; j < numTests; j++) { - int levelID, ix, iy, jx, jy, tau; - ff >> levelID >> ix >> iy >> jx >> jy >> tau; - // Limit mask size to 32 binary tests - if (fastmask.size() < 64 && taus.size() < 32) { - fastmask.push_back(ix + iy * width); - fastmask.push_back(jx + jy * width); - taus.push_back(tau); - } else { - cout << "Note: A maximum of 32 fern features are allowed, " - "discarding " - "remainder of forest." 
- << endl; - } - if (tau != 0) numNonZeroTau++; - } - } - if (numNonZeroTau == 0) { - type = 0; // We have a zero forest (all tau=0) - FilterMask fm(fastmask, width, height, type); - return fm; - } else { - type = 1; // We have a tau forest (some tau!=0) - FilterMask fm(fastmask, taus, width, height, type); - return fm; - } - } - -}; // forest class -} // namespace inference -} // namespace gpc - -#endif diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index a7e73d7..a00cef3 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -1,10 +1,10 @@ add_executable(extract extract.cpp) -target_link_libraries(extract ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen) +target_link_libraries(extract gpc_core) add_executable(train train.cpp) -target_link_libraries(train ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen) +target_link_libraries(train gpc_core) add_executable(sparsematch sparsematch.cpp) -target_link_libraries(sparsematch ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen) +target_link_libraries(sparsematch gpc_core) diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp index ed43016..6834f61 100644 --- a/samples/sparsematch.cpp +++ b/samples/sparsematch.cpp @@ -1,6 +1,6 @@ #include -#include "gpc/inference.hpp" +#include "gpc/forest.hpp" using namespace std; int main(int argc, char** argv) { std::string forestPath = "../../forests/defaultZeroForest.txt"; @@ -46,15 +46,15 @@ int main(int argc, char** argv) { timg.readPNG(rightImgPath); // Get learned filter for the given image dimensions. 
- GPCForest_t::FilterMask fm = + gpc::inference::FilterMask fm = forest.readForest(forestPath, simg.cols(), simg.rows()); // Preprocess images (box filter, sobel filter, indices of high gradient // pixels) gpc::inference::time_point t0 = gpc::inference::sysTick(); - GPCForest_t::PreprocessedImage simgP = + gpc::inference::PreprocessedImage simgP = forest.preprocessImage(simg, inferencesettings); - GPCForest_t::PreprocessedImage timgP = + gpc::inference::PreprocessedImage timgP = forest.preprocessImage(timg, inferencesettings); gpc::inference::time_point t1 = gpc::inference::sysTick(); @@ -66,7 +66,7 @@ int main(int argc, char** argv) { << ", #candidatesL:" << simgP.mask.size() << ", #candidatesR:" << timgP.mask.size() << ", tMatch: " << gpc::inference::tickToMs(t2, t1) << " ms" - << ", num matches:" << supp.size() << endl; + << ", num matches:" << supp.size() << std::endl; // Output sparse disparities overlayed on left input image ndb::Buffer renderDisp; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a211cac..564c125 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -12,7 +12,13 @@ FetchContent_MakeAvailable(approvaltests) find_package(GTest REQUIRED) add_executable(test_single_matching test_single_matching.cpp) -target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main Eigen3::Eigen) +#target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main Eigen3::Eigen) +target_link_libraries(test_single_matching + PRIVATE + gpc_core + ApprovalTests::ApprovalTests + GTest::gtest_main +) add_test(NAME single_matching COMMAND test_single_matching) diff --git a/tests/test_single_matching.cpp b/tests/test_single_matching.cpp index e9e6a75..fdff603 100644 --- a/tests/test_single_matching.cpp +++ b/tests/test_single_matching.cpp @@ -1,7 +1,7 @@ #define APPROVALS_GOOGLETEST #include #include -#include "gpc/inference.hpp" +#include 
"gpc/forest.hpp" TEST(Approval, Inference) @@ -34,12 +34,12 @@ TEST(Approval, Inference) simg.readPNG(leftImgPath); timg.readPNG(rightImgPath); // Get learned filter for the given image dimensions. - GPCForest_t::FilterMask fm = + gpc::inference::FilterMask fm = forest.readForest(forestPath, simg.cols(), simg.rows()); - GPCForest_t::PreprocessedImage simgP = + gpc::inference::PreprocessedImage simgP = forest.preprocessImage(simg, inferencesettings); - GPCForest_t::PreprocessedImage timgP = + gpc::inference::PreprocessedImage timgP = forest.preprocessImage(timg, inferencesettings); // Match rectified stereo images From 7d00a91072d06528299dece81291d5bfa4c35cc8 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sat, 14 Feb 2026 17:38:00 +0100 Subject: [PATCH 03/36] decouple fern Please enter the commit message for your changes. Lines starting --- CMakeLists.txt | 1 + lib/gpc/Fern.hpp | 157 +++-------------------------------- lib/gpc/fern.cpp | 208 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 221 insertions(+), 145 deletions(-) create mode 100644 lib/gpc/fern.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 535b559..d123aaf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,6 +39,7 @@ endif() add_library(gpc_core lib/gpc/forest.cpp + lib/gpc/fern.cpp ) target_link_libraries(gpc_core PUBLIC diff --git a/lib/gpc/Fern.hpp b/lib/gpc/Fern.hpp index 5f5f533..d9554fd 100644 --- a/lib/gpc/Fern.hpp +++ b/lib/gpc/Fern.hpp @@ -167,10 +167,7 @@ OptimizerSettings TauOptimizer(int taulo, int tauhi, int numResamples, bool onlyScoreNonSplitSamples, - double w1) { - return OptimizerSettings( - taulo, tauhi, numResamples, onlyScoreNonSplitSamples, w1); -} + double w1); /** * @brief Optimzer setting factory for a zero fern * @@ -184,9 +181,7 @@ OptimizerSettings TauOptimizer(int taulo, */ OptimizerSettings ZeroOptimizer(int numResamples, bool onlyScoreNonSplitSamples, - double w1) { - return OptimizerSettings(0, 1, numResamples, onlyScoreNonSplitSamples, w1); 
-} + double w1) ; struct FernSettings { const int maxDepth; const int scale; @@ -231,63 +226,8 @@ class Fern { FernSettings fernsetting, OptimizerSettings optsetting, int scoreUntilLevel, - splitStats& s) { - s.tp = 0; - s.fn = 0; - s.fp = 0; - s.prec = 0.; - s.rec = 0.; - s.hmean = 0.; - s.convcomb = 0.; - s.tot = 0; - for (auto& triplet : data) { - uint64_t ref = 0, pos = 0, neg = 0; - // Score the first scoreUntilLevel levels of a given fern - for (int i = 0; i < scoreUntilLevel + 1; i++) { - ref <<= 1; - pos <<= 1; - neg <<= 1; - bool refDec, posDec, negDec; - - // Decisions need to be added into a codeword - Feature.getDecisions( - refDec, posDec, negDec, params[i], triplet); - if (refDec) ref++; - if (posDec) pos++; - if (negDec) neg++; - } - // Only count those that haven't been true positives yet - // Ignore samples previously classified as True positive - if (!(triplet.pos.split == true && triplet.neg.split == true)) { - s.tot++; - // Decide which are equal (i.e. set the split indicators) - if (ref == pos) { // 110(TP), 111, 001(TP), 000 - if (ref != neg) { // 110 (TP), 001(TP) - s.tp++; - } else { // 111(FN), 000(FN) - s.fn++; - } - } else { // 100, 101, 011, 010 - if (ref != neg) { // 100(FN), 011(FN) FN - s.fn++; - } else { // 101(FP), 010(FP) - s.fp++; - } - } - } - } - - // Compute statistics of this split - double w2 = 1. - optsetting.w1_; - s.prec = ((s.tp + s.fp) == 0) ? 0. : double(s.tp) / (s.tp + s.fp); - s.rec = ((s.tp + s.fn) == 0) ? 0. : double(s.tp) / (s.tp + s.fn); - - s.hmean = (s.prec + s.rec == 0.) - ? 0. - : s.prec * s.rec / ((1. - w2) * s.prec + w2 * s.rec); - s.convcomb = (1. 
- w2) * s.prec + w2 * s.rec; - } - /** + splitStats& s); + /** * @brief Mark those samples in the set as "split" if they have been * correctly classified(ref=pos and pos!=neg) with the parameter * set in params @@ -298,26 +238,7 @@ class Fern { */ void markSplitSamples(std::vector& data, std::vector& params, - int numParams) { - for (auto& triplet : data) { - // Evaluate triplet on all given parameters - uint64_t ref = 0, pos = 0, neg = 0; - for (int i = 0; i < numParams; i++) { - ref <<= 1; // shift by one - pos <<= 1; // shift by one - neg <<= 1; // shift by one - bool refDec, posDec, negDec; - - Feature.getDecisions( - refDec, posDec, negDec, params[i], triplet); - if (refDec) ref++; - if (posDec) pos++; - if (negDec) neg++; - } - if (ref == pos) triplet.pos.split = true; - if (ref != neg) triplet.neg.split = true; - } - } + int numParams) ; /** * @brief Reset the mark on the training samples on whether they have been * split correctly or not Since we do not operate on copies of the training @@ -325,13 +246,8 @@ class Fern { * * @param data */ - void resetMarkOnSamples(std::vector& data) { - for (auto& triplet : data) { - triplet.pos.split = false; - triplet.neg.split = false; - } - } - + void resetMarkOnSamples(std::vector& data); + /** * @brief Train a fern given a set of training data and some optimizer * settings @@ -340,70 +256,21 @@ class Fern { * @param optsetting the optimizer settings */ void train(std::vector& trainingSamples, - OptimizerSettings optsetting) { - splitStats stats; - float maxScore = 0.f; - SplitParams_t bestParams; - - fernparams.resize(fernsettings.maxDepth); - - cout << setw(7) << "Level" << setw(10) << "Prec" << setw(10) << "Rec" - << setw(10) << "Har" << setw(8) << "Tot" << setw(8) << "TP" - << setw(8) << "FP" << setw(8) << "FN" << setw(6) << "scale" - << setw(5) << "tau" << setw(5) << "i" << setw(5) << "j" << endl; - if (optsetting.onlyScoreNonSplitSamples_) - resetMarkOnSamples(trainingSamples); - for (int level = 0; level < 
fernsettings.maxDepth; level++) { - maxScore = 0.f; - for (int k = 0; k < optsetting.numResamples_; k++) { - // Samples a hyperplane in the requested scale - Feature.sampleHyperplane(fernsettings.scale, fernparams[level]); - // Iterates over a small range of tau (intercept) - for (int tau = optsetting.taulo_; tau < optsetting.tauhi_; - tau++) { - fernparams[level].tau = tau; - // Score hyperplane set we have so far - evalSplit(trainingSamples, - fernparams, - fernsettings, - optsetting, - level, - stats); - // If score exceeds previously best, replace paramset - if (stats.hmean > maxScore) { - bestParams = fernparams[level]; - maxScore = stats.hmean; - } - } // tau loop - } // k loop - // Store best performing parameters - fernparams[level] = bestParams; - - // Mark samples as split if they were labeled true positive - if (optsetting.onlyScoreNonSplitSamples_) - markSplitSamples(trainingSamples, fernparams, level); - cout << setw(7) << level << setw(10) << stats.prec << setw(10) - << stats.rec << setw(10) << stats.hmean << setw(8) << stats.tot - << setw(8) << stats.tp << setw(8) << stats.fp << setw(8) - << stats.fn << setw(6) << fernsettings.scale << setw(5) - << fernparams[level].tau << setw(5) << fernparams[level].i - << setw(5) << fernparams[level].j << endl; - } // level loop - } // train - + OptimizerSettings optsetting) ; + /** * @brief Returns the decision of the first five levels of the ferns * * @return The parameters. 
*/ - std::vector getParameters() { return fernparams; } + std::vector getParameters(); /** * @brief Return the scale that this fern uses * * @return The scale */ - int getScale() { return fernsettings.scale; } + int getScale(); }; // Fern @@ -417,7 +284,7 @@ class Fern { * * @return */ -std::vector FernFactory(int num_S, int num_M, int num_L, int maxDepth) { +inline std::vector FernFactory(int num_S, int num_M, int num_L, int maxDepth) { std::vector ferns; for (int i = 0; i < num_S; i++) ferns.push_back(Fern(FernSettings(maxDepth, 2))); diff --git a/lib/gpc/fern.cpp b/lib/gpc/fern.cpp new file mode 100644 index 0000000..a171218 --- /dev/null +++ b/lib/gpc/fern.cpp @@ -0,0 +1,208 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Implements and extends the method proposed in +// The Global Patch Collider +// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet +// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch) +#include +#include +#include +#include +#include + +#include "gpc/Feature.hpp" +#include "gpc/Fern.hpp" + +using namespace std; +namespace gpc { +namespace training { +OptimizerSettings TauOptimizer(int taulo, + int tauhi, + int numResamples, + bool onlyScoreNonSplitSamples, + double w1) { + return OptimizerSettings( + taulo, tauhi, numResamples, onlyScoreNonSplitSamples, w1); +} +OptimizerSettings ZeroOptimizer(int numResamples, + bool onlyScoreNonSplitSamples, + double w1) { + return OptimizerSettings(0, 1, numResamples, onlyScoreNonSplitSamples, w1); +} +void Fern::evalSplit(std::vector& data, + std::vector& params, + FernSettings fernsetting, + OptimizerSettings optsetting, + int scoreUntilLevel, + splitStats& s) { + s.tp = 0; + s.fn = 0; + s.fp = 0; + s.prec = 0.; + s.rec = 0.; + s.hmean = 0.; + s.convcomb = 0.; + s.tot = 0; + for (auto& triplet : data) { + uint64_t ref = 0, pos = 0, neg = 0; + // Score the first scoreUntilLevel levels of a given fern + for (int i = 0; i < scoreUntilLevel + 1; i++) { + ref <<= 1; + pos <<= 1; + neg <<= 1; + bool refDec, posDec, negDec; + + // Decisions need to be added into a codeword + Feature.getDecisions( + refDec, posDec, negDec, params[i], triplet); + if 
(refDec) ref++; + if (posDec) pos++; + if (negDec) neg++; + } + // Only count those that haven't been true positives yet + // Ignore samples previously classified as True positive + if (!(triplet.pos.split == true && triplet.neg.split == true)) { + s.tot++; + // Decide which are equal (i.e. set the split indicators) + if (ref == pos) { // 110(TP), 111, 001(TP), 000 + if (ref != neg) { // 110 (TP), 001(TP) + s.tp++; + } else { // 111(FN), 000(FN) + s.fn++; + } + } else { // 100, 101, 011, 010 + if (ref != neg) { // 100(FN), 011(FN) FN + s.fn++; + } else { // 101(FP), 010(FP) + s.fp++; + } + } + } + } + + // Compute statistics of this split + double w2 = 1. - optsetting.w1_; + s.prec = ((s.tp + s.fp) == 0) ? 0. : double(s.tp) / (s.tp + s.fp); + s.rec = ((s.tp + s.fn) == 0) ? 0. : double(s.tp) / (s.tp + s.fn); + + s.hmean = (s.prec + s.rec == 0.) + ? 0. + : s.prec * s.rec / ((1. - w2) * s.prec + w2 * s.rec); + s.convcomb = (1. - w2) * s.prec + w2 * s.rec; +} +void Fern::markSplitSamples(std::vector& data, + std::vector& params, + int numParams) { + for (auto& triplet : data) { + // Evaluate triplet on all given parameters + uint64_t ref = 0, pos = 0, neg = 0; + for (int i = 0; i < numParams; i++) { + ref <<= 1; // shift by one + pos <<= 1; // shift by one + neg <<= 1; // shift by one + bool refDec, posDec, negDec; + + Feature.getDecisions( + refDec, posDec, negDec, params[i], triplet); + if (refDec) ref++; + if (posDec) pos++; + if (negDec) neg++; + } + if (ref == pos) triplet.pos.split = true; + if (ref != neg) triplet.neg.split = true; + } +} +void Fern::resetMarkOnSamples(std::vector& data) { + for (auto& triplet : data) { + triplet.pos.split = false; + triplet.neg.split = false; + } +} + +void Fern::train(std::vector& trainingSamples, + OptimizerSettings optsetting) { + splitStats stats; + float maxScore = 0.f; + SplitParams_t bestParams; + + fernparams.resize(fernsettings.maxDepth); + + cout << setw(7) << "Level" << setw(10) << "Prec" << setw(10) << "Rec" + << 
setw(10) << "Har" << setw(8) << "Tot" << setw(8) << "TP" + << setw(8) << "FP" << setw(8) << "FN" << setw(6) << "scale" + << setw(5) << "tau" << setw(5) << "i" << setw(5) << "j" << endl; + if (optsetting.onlyScoreNonSplitSamples_) + resetMarkOnSamples(trainingSamples); + for (int level = 0; level < fernsettings.maxDepth; level++) { + maxScore = 0.f; + for (int k = 0; k < optsetting.numResamples_; k++) { + // Samples a hyperplane in the requested scale + Feature.sampleHyperplane(fernsettings.scale, fernparams[level]); + // Iterates over a small range of tau (intercept) + for (int tau = optsetting.taulo_; tau < optsetting.tauhi_; + tau++) { + fernparams[level].tau = tau; + // Score hyperplane set we have so far + evalSplit(trainingSamples, + fernparams, + fernsettings, + optsetting, + level, + stats); + // If score exceeds previously best, replace paramset + if (stats.hmean > maxScore) { + bestParams = fernparams[level]; + maxScore = stats.hmean; + } + } // tau loop + } // k loop + // Store best performing parameters + fernparams[level] = bestParams; + + // Mark samples as split if they were labeled true positive + if (optsetting.onlyScoreNonSplitSamples_) + markSplitSamples(trainingSamples, fernparams, level); + cout << setw(7) << level << setw(10) << stats.prec << setw(10) + << stats.rec << setw(10) << stats.hmean << setw(8) << stats.tot + << setw(8) << stats.tp << setw(8) << stats.fp << setw(8) + << stats.fn << setw(6) << fernsettings.scale << setw(5) + << fernparams[level].tau << setw(5) << fernparams[level].i + << setw(5) << fernparams[level].j << endl; + } // level loop +} // train + +std::vector Fern::getParameters() { return fernparams; } + +int Fern::getScale() { return fernsettings.scale; } + + + +} // namespace training +} // namespace gpc From 5d43602b0e4e3574b3539a8c949bcfb029579977 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sat, 14 Feb 2026 20:27:42 +0100 Subject: [PATCH 04/36] decouple feature --- CMakeLists.txt | 1 + lib/gpc/Feature.hpp | 160 
++------------------------------- lib/gpc/feature.cpp | 215 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 223 insertions(+), 153 deletions(-) create mode 100644 lib/gpc/feature.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index d123aaf..b0f34ec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,7 @@ endif() add_library(gpc_core lib/gpc/forest.cpp lib/gpc/fern.cpp + lib/gpc/feature.cpp ) target_link_libraries(gpc_core PUBLIC diff --git a/lib/gpc/Feature.hpp b/lib/gpc/Feature.hpp index f0f7072..82aff06 100644 --- a/lib/gpc/Feature.hpp +++ b/lib/gpc/Feature.hpp @@ -96,32 +96,13 @@ class Feature { * @param params The parameters for this split * @param[in] trip The triplet */ - inline void getDecisions(bool& ref, + void getDecisions(bool& ref, bool& pos, bool& neg, params& params, - const GPCPatchTriplet& trip) { - ref = - ((int)trip.ref.feature(params.i) - (int)trip.ref.feature(params.j) < - params.tau); - pos = - ((int)trip.pos.feature(params.i) - (int)trip.pos.feature(params.j) < - params.tau); - neg = - ((int)trip.neg.feature(params.i) - (int)trip.neg.feature(params.j) < - params.tau); - } - - Feature() { - std::random_device rd2; - rng = std::mt19937(rd2()); - randIJ7 = std::uniform_int_distribution(0, 48); - randIJ17 = std::uniform_int_distribution(0, 17 * 17 - 1); - randIJ27 = std::uniform_int_distribution(0, 27 * 27 - 1); - - randTAU = std::uniform_int_distribution(-15, 15); - } + const GPCPatchTriplet& trip); + Feature(); /** * @brief Returns a random hyperplane within a 27 x 27 * pixel-sized patch. depending on the scale @@ -132,50 +113,7 @@ class Feature { * @param scale Determines which patch size is used * @param params returns the parameters */ - void inline sampleHyperplane(int scale, params& params) { - if (scale == 2) { - params.i = params.j; // s.t. 
they regenerate each iteration - while (params.i == params.j) { // i and j need to be distinct - int i = randIJ7(rng); - int j = randIJ7(rng); - params.ix = i % 7 - 3; - params.iy = i / 7 - 3; - params.jx = j % 7 - 3; - params.jy = j / 7 - 3; - - params.i = 280 + (params.ix + 3) + 27 * (params.iy + 3); - params.j = 280 + (params.jx + 3) + 27 * (params.jy + 3); - } - } else if (scale == 1) { - params.i = params.j; // s.t. they regenerate each iteration - while (params.i == params.j) { // i and j need to be distinct - int i = randIJ17(rng); - int j = randIJ17(rng); - params.ix = i % 17 - 8; - params.iy = i / 17 - 8; - params.jx = j % 17 - 8; - params.jy = j / 17 - 8; - - params.i = 140 + (params.ix + 8) + 27 * (params.iy + 8); - params.j = 140 + (params.jx + 8) + 27 * (params.jy + 8); - } - } else if (scale == 0) { - params.i = params.j; // s.t. they regenerate each iteration - while (params.i == params.j) { // i and j need to be distinct - params.i = randIJ27(rng); - params.j = randIJ27(rng); - params.ix = params.i % 27 - 13; - params.iy = params.i / 27 - 13; - params.jx = params.j % 27 - 13; - params.jy = params.j / 27 - 13; - - params.i = (params.ix + 13) + 27 * (params.iy + 13); - params.j = (params.jx + 13) + 27 * (params.jy + 13); - } - } - params.tau = randTAU(rng); - } - + void sampleHyperplane(int scale, params& params); /** * @brief Gets all descriptors (triplets) for an image pair for * training given the three keypoint vectors. 
@@ -193,56 +131,7 @@ class Feature { std::vector& ref, std::vector& pos, std::vector& neg, - std::vector& triplets) { - ndb::Buffer LL(bwL.rows(), bwL.cols()); - LL.width = bwL.width; - ndb::box(bwL.data(), LL.data(), bwL.cols(), bwL.rows(), 1); - LL.clearBoundary(); - - ndb::Buffer RR(bwL.rows(), bwL.cols()); - RR.width = bwR.width; - ndb::box(bwR.data(), RR.data(), bwR.cols(), bwR.rows(), 1); - RR.clearBoundary(); - - auto f = [=](ndb::Point& kp) { - if (kp.x > 20 && kp.y > 20 && kp.x < bwL.cols() - 20 && - kp.y < bwL.rows() - 20) - return false; - else - return true; - }; - - for (std::vector::size_type i = 0; i != ref.size(); i++) { - if (!f(ref[i]) && !f(pos[i]) && !f(neg[i])) { - // Get all descriptors: - GPCPatchTriplet newPatch; - - // Reference patch - //==================================== - newPatch.ref.x = ref[i].x; - newPatch.ref.y = ref[i].y; - - LL.getPatch(newPatch.ref.feature, ref[i].x, ref[i].y, 27); - - // Extract a positive match in the right image - //==================================== - newPatch.pos.x = pos[i].x; - newPatch.pos.y = pos[i].y; - - RR.getPatch(newPatch.pos.feature, pos[i].x, pos[i].y, 27); - - // Extract negative patch - //==================================== - newPatch.neg.x = neg[i].x; - newPatch.neg.y = neg[i].y; - - RR.getPatch(newPatch.neg.feature, neg[i].x, neg[i].y, 27); - - triplets.push_back(std::move(newPatch)); - } - } - } - + std::vector& triplets); /** * @brief Store a vector of triplets of training data to file * @@ -251,16 +140,7 @@ class Feature { * in binary form. 
*/ void storeAllTriplets(std::vector& data, - std::string path) { - ofstream fout; - fout.open(path, ios::binary | ios::out); - for (auto& triplet : data) { - fout.write((char*)triplet.ref.feature.data(), 27 * 27); - fout.write((char*)triplet.pos.feature.data(), 27 * 27); - fout.write((char*)triplet.neg.feature.data(), 27 * 27); - } - fout.close(); - } + std::string path); /** * @brief Read triplets of training data from a binary file * written by the storeAllTriplets method. @@ -269,33 +149,7 @@ class Feature { * * @return The training set */ - std::vector loadAllTriplets(std::string path) { - std::vector data; - std::ifstream in(path, std::ifstream::ate | std::ifstream::binary); - uint32_t filesize = in.tellg(); - if (filesize % ((27 * 27) * 3)) { - cout << "ERR: File is not a training set of this feature type" - << endl; - cout << "FS: " << filesize << endl; - return data; - } - int numSamples = filesize / ((27 * 27) * 3); - data.resize(numSamples); - ifstream fin; - fin.open(path, ios::binary | ios::in); - for (auto& datum : data) { - datum.ref.feature.resize(27, 27); - datum.pos.feature.resize(27, 27); - datum.neg.feature.resize(27, 27); - - fin.read((char*)datum.ref.feature.data(), 27 * 27); - fin.read((char*)datum.pos.feature.data(), 27 * 27); - fin.read((char*)datum.neg.feature.data(), 27 * 27); - } - fin.close(); - return data; - } - + std::vector loadAllTriplets(std::string path); }; // Feature } // namespace training } // namespace gpc diff --git a/lib/gpc/feature.cpp b/lib/gpc/feature.cpp new file mode 100644 index 0000000..f68d440 --- /dev/null +++ b/lib/gpc/feature.cpp @@ -0,0 +1,215 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// +// Implements and extends the method proposed in +// The Global Patch Collider +// Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet +// Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch) + +#include +#include +#include //for log2 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +namespace gpc { +namespace training { +void Feature::getDecisions(bool& ref, + bool& pos, + bool& neg, + params& params, + const GPCPatchTriplet& trip) { + ref = + ((int)trip.ref.feature(params.i) - (int)trip.ref.feature(params.j) < + params.tau); + pos = + ((int)trip.pos.feature(params.i) - (int)trip.pos.feature(params.j) < + params.tau); + neg = + ((int)trip.neg.feature(params.i) - (int)trip.neg.feature(params.j) < + params.tau); +} + +Feature::Feature() { + std::random_device rd2; + rng = std::mt19937(rd2()); + randIJ7 = std::uniform_int_distribution(0, 48); + randIJ17 = std::uniform_int_distribution(0, 17 * 17 - 1); + randIJ27 = std::uniform_int_distribution(0, 27 * 27 - 1); + + randTAU = std::uniform_int_distribution(-15, 15); +} +void Feature::sampleHyperplane(int scale, params& params) { + if (scale == 2) { + params.i = params.j; // s.t. they regenerate each iteration + while (params.i == params.j) { // i and j need to be distinct + int i = randIJ7(rng); + int j = randIJ7(rng); + params.ix = i % 7 - 3; + params.iy = i / 7 - 3; + params.jx = j % 7 - 3; + params.jy = j / 7 - 3; + + params.i = 280 + (params.ix + 3) + 27 * (params.iy + 3); + params.j = 280 + (params.jx + 3) + 27 * (params.jy + 3); + } + } else if (scale == 1) { + params.i = params.j; // s.t. 
they regenerate each iteration + while (params.i == params.j) { // i and j need to be distinct + int i = randIJ17(rng); + int j = randIJ17(rng); + params.ix = i % 17 - 8; + params.iy = i / 17 - 8; + params.jx = j % 17 - 8; + params.jy = j / 17 - 8; + + params.i = 140 + (params.ix + 8) + 27 * (params.iy + 8); + params.j = 140 + (params.jx + 8) + 27 * (params.jy + 8); + } + } else if (scale == 0) { + params.i = params.j; // s.t. they regenerate each iteration + while (params.i == params.j) { // i and j need to be distinct + params.i = randIJ27(rng); + params.j = randIJ27(rng); + params.ix = params.i % 27 - 13; + params.iy = params.i / 27 - 13; + params.jx = params.j % 27 - 13; + params.jy = params.j / 27 - 13; + + params.i = (params.ix + 13) + 27 * (params.iy + 13); + params.j = (params.jx + 13) + 27 * (params.jy + 13); + } + } + params.tau = randTAU(rng); +} +void Feature::extractAllTriplets(ndb::Buffer& bwL, + ndb::Buffer& bwR, + std::vector& ref, + std::vector& pos, + std::vector& neg, + std::vector& triplets) { + ndb::Buffer LL(bwL.rows(), bwL.cols()); + LL.width = bwL.width; + ndb::box(bwL.data(), LL.data(), bwL.cols(), bwL.rows(), 1); + LL.clearBoundary(); + + ndb::Buffer RR(bwL.rows(), bwL.cols()); + RR.width = bwR.width; + ndb::box(bwR.data(), RR.data(), bwR.cols(), bwR.rows(), 1); + RR.clearBoundary(); + + auto f = [=](ndb::Point& kp) { + if (kp.x > 20 && kp.y > 20 && kp.x < bwL.cols() - 20 && + kp.y < bwL.rows() - 20) + return false; + else + return true; + }; + + for (std::vector::size_type i = 0; i != ref.size(); i++) { + if (!f(ref[i]) && !f(pos[i]) && !f(neg[i])) { + // Get all descriptors: + GPCPatchTriplet newPatch; + + // Reference patch + //==================================== + newPatch.ref.x = ref[i].x; + newPatch.ref.y = ref[i].y; + + LL.getPatch(newPatch.ref.feature, ref[i].x, ref[i].y, 27); + + // Extract a positive match in the right image + //==================================== + newPatch.pos.x = pos[i].x; + newPatch.pos.y = pos[i].y; + + 
RR.getPatch(newPatch.pos.feature, pos[i].x, pos[i].y, 27); + + // Extract negative patch + //==================================== + newPatch.neg.x = neg[i].x; + newPatch.neg.y = neg[i].y; + + RR.getPatch(newPatch.neg.feature, neg[i].x, neg[i].y, 27); + + triplets.push_back(std::move(newPatch)); + } + } +} + +void Feature::storeAllTriplets(std::vector& data, + std::string path) { + ofstream fout; + fout.open(path, ios::binary | ios::out); + for (auto& triplet : data) { + fout.write((char*)triplet.ref.feature.data(), 27 * 27); + fout.write((char*)triplet.pos.feature.data(), 27 * 27); + fout.write((char*)triplet.neg.feature.data(), 27 * 27); + } + fout.close(); +} +std::vector Feature::loadAllTriplets(std::string path) { + std::vector data; + std::ifstream in(path, std::ifstream::ate | std::ifstream::binary); + uint32_t filesize = in.tellg(); + if (filesize % ((27 * 27) * 3)) { + cout << "ERR: File is not a training set of this feature type" + << endl; + cout << "FS: " << filesize << endl; + return data; + } + int numSamples = filesize / ((27 * 27) * 3); + data.resize(numSamples); + ifstream fin; + fin.open(path, ios::binary | ios::in); + for (auto& datum : data) { + datum.ref.feature.resize(27, 27); + datum.pos.feature.resize(27, 27); + datum.neg.feature.resize(27, 27); + + fin.read((char*)datum.ref.feature.data(), 27 * 27); + fin.read((char*)datum.pos.feature.data(), 27 * 27); + fin.read((char*)datum.neg.feature.data(), 27 * 27); + } + fin.close(); + return data; +} + +} // namespace training +} // namespace gpc From af850bb7436a5ff22099bbd68abd1b5682449c2b Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 15 Feb 2026 08:50:30 +0100 Subject: [PATCH 05/36] decouple filter --- CMakeLists.txt | 1 + lib/gpc/filter.hpp | 775 ++-------------------------------------- samples/sparsematch.cpp | 2 +- 3 files changed, 29 insertions(+), 749 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b0f34ec..60f919e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ 
-41,6 +41,7 @@ add_library(gpc_core lib/gpc/forest.cpp lib/gpc/fern.cpp lib/gpc/feature.cpp + lib/gpc/filter.cpp ) target_link_libraries(gpc_core PUBLIC diff --git a/lib/gpc/filter.hpp b/lib/gpc/filter.hpp index 2caecc2..3576650 100644 --- a/lib/gpc/filter.hpp +++ b/lib/gpc/filter.hpp @@ -58,40 +58,12 @@ namespace ndb { * @param ind output array (indices into n of nonzero elements) * @param m number of elements in output */ -inline void arr2ind(const unsigned char* a, +void arr2ind(const unsigned char* a, int n, int* ind, - int* m) { + int* m); + #ifdef _INTRINSICS_SSE - int i, m0, k; - __m256i msk; - m0 = 0; - for (i = 0; i < n; i = i + 32) { /* Load 32 bytes and compare with zero: */ - msk = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i*)&a[i]), - _mm256_setzero_si256()); - k = _mm256_movemask_epi8(msk); - k = ~k; /* Search for nonzero bits instead of zero bits. */ - while (k) { - ind[m0] = - i + _tzcnt_u32( - k); /* Count the number of trailing zero bits in k. */ - m0++; - k = _blsr_u32(k); /* Clear the lowest set bit in k. 
*/ - } - } - *m = m0; -#else - int nnz = 0; - for (int i = 0; i < n; i++) { - if (a[i] != 0) { - nnz++; - *ind = i; - ind++; - } - } - *m = nnz; -#endif -} /** * @brief Unpacks 16x8bit from a 128bit simd var into 2x128bit vars * (8x16bit) @@ -100,12 +72,8 @@ inline void arr2ind(const unsigned char* a, * @param y0 The y 0 * @param y1 The y 1 */ -#ifdef _INTRINSICS_SSE -void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) { - __m128i zero = _mm_setzero_si128(); - y0 = _mm_unpacklo_epi8(x, zero); - y1 = _mm_unpackhi_epi8(x, zero); -} +void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1); + /** * @brief Packs 2x128bit vars with 16bit values(where 8 upper bits are * zero) into 1x128bit with 8bit values @@ -114,10 +82,7 @@ void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) { * @param[in] x1 The x 1 * @param y the packed vector */ -void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) { - y = _mm_packus_epi16(x0, x1); -} - +void pack16to8(const __m128i x0, const __m128i x1, __m128i& y); #endif /** * @brief Calls a given functional f with subranges based on the given start @@ -132,29 +97,10 @@ void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) { * @param end end of the range * @param nThreads number of threads to use */ -inline void parFor(std::function const& f, +void parFor(std::function const& f, int start, int end, - int nThreads) { - // Range definition - // quantities derived from range - int segSize = (end - start) / nThreads; - int lastSeg = (end - start) % nThreads; - - std::vector threads; - threads.reserve(nThreads); - - // Spawn threads - for (int t = 0; t < nThreads - 1; t++) { - threads.emplace_back(f, start + t * segSize, start + (t + 1) * segSize); - } - threads.emplace_back(f, - start + (nThreads - 1) * segSize, - start + (nThreads)*segSize + lastSeg); - // Join - for (auto& t : threads) t.join(); -} - + int nThreads); /** * @brief Naive 3x3 sobel filter implementation * @@ -165,49 +111,9 @@ inline void 
parFor(std::function const& f, * @param[in] numThreads number of threads to use * @param threshold threshold to binarize sobel filter output */ -inline void sobelNaive( - uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) { - assert(width % 16 == 0 && "width must be multiple of 16!"); - int thresholdSq = threshold * threshold; - uint8_t* ptr = in; - - uint8_t* p11 = ptr + 0 * width; - uint8_t* p12 = ptr + 0 * width + 1; - uint8_t* p13 = ptr + 0 * width + 2; - - uint8_t* p21 = ptr + 1 * width; - uint8_t* p22 = ptr + 1 * width + 1; - uint8_t* p23 = ptr + 1 * width + 2; +void sobelNaive( + uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold); - uint8_t* p31 = ptr + 2 * width; - uint8_t* p32 = ptr + 2 * width + 1; - uint8_t* p33 = ptr + 2 * width + 2; - - // output pointer - uint8_t* optr = gradient + 1 * width + 1; - // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating - // boundary) (unoptimized) - for (int iy = 1; iy < height - 1; iy++) { - for (int ix = 0; ix < width; ix++) { - int sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9; - int sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9; - - int val = sx * sx + sy * sy; - - *optr = val > thresholdSq ? 
255 : 0; - p11++; - p12++; - p13++; - p21++; - p22++; - p23++; - p31++; - p32++; - p33++; - optr++; - } - } -} /** * @brief Naive 3x3 box filter implementation * @@ -217,44 +123,8 @@ inline void sobelNaive( * @param[in] height The height * @param[in] numThreads number of threads to use */ -inline void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { - assert(width % 16 == 0 && "width must be multiple of 16!"); - // allocate space for result - uint8_t* ptr = in; - uint8_t* p11 = ptr + 0 * width; - uint8_t* p12 = ptr + 0 * width + 1; - uint8_t* p13 = ptr + 0 * width + 2; - - uint8_t* p21 = ptr + 1 * width; - uint8_t* p22 = ptr + 1 * width + 1; - uint8_t* p23 = ptr + 1 * width + 2; - - uint8_t* p31 = ptr + 2 * width; - uint8_t* p32 = ptr + 2 * width + 1; - uint8_t* p33 = ptr + 2 * width + 2; - uint8_t* optr = blurred + 1 * width + 1; +void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height); - // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating - // boundary) (unoptimized) - for (int iy = 1; iy < height - 1; iy++) { - for (int ix = 0; ix < width; ix++) { - int res = - (*p11 + *p12 + *p13 + *p21 + *p22 + *p23 + *p31 + *p32 + *p33) / - 9; - *optr = res; - p11++; - p12++; - p13++; - p21++; - p22++; - p23++; - p31++; - p32++; - p33++; - optr++; - } - } -} /** * @brief Applies a gpc filter defined by the pixel-difference tests in * fastmask. 
Naive implementation @@ -269,28 +139,14 @@ inline void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { * @param width The width of the image at pointer *in * @param height The height of the image at pointer *in */ -inline void gpcFilterNaive(uint8_t* in, +void gpcFilterNaive(uint8_t* in, const uint8_t* grad, uint32_t* gpc, std::vector fastmask, std::vector& idx, int width, - int height) { - // output buffer of same size - uint32_t tmp; + int height); - int j = 0; - for (auto k : idx) { - tmp = 0; - for (uint8_t i = 0; i < fastmask.size(); i += 2) { - tmp <<= 1; // shift by one - if (*(in + k + fastmask[i]) > *(in + k + fastmask[i + 1])) - tmp++; // set this test's result to 1 - } - gpc[k] = tmp; - j++; - } -} /** * @brief Applies a gpc filter defined by the pixel-difference tests in * fastmask. Additionally uses a threshold vector (tau) Naive implementation. @@ -303,29 +159,15 @@ inline void gpcFilterNaive(uint8_t* in, * @param width The width of the image at pointer *in * @param height The height of the image at pointer *in */ -inline void gpcFilterTauNaive(uint8_t* in, +void gpcFilterTauNaive(uint8_t* in, const uint8_t* grad, uint32_t* gpc, std::vector fastmask, std::vector tau, std::vector& idx, int width, - int height) { - uint32_t tmp; - - int j = 0; - for (auto k : idx) { - tmp = 0; - for (uint8_t i = 0; i < fastmask.size(); i += 2) { - tmp <<= 1; // shift by one - if (*(in + k + fastmask[i]) > - *(in + k + fastmask[i + 1]) - tau[i / 2]) - tmp++; // set this test's result to 1 - } - gpc[k] = tmp; - j++; - } -} /** + int height); +/** * @brief boxfilter using SSE2 instructions. Loosely based on * https://www.ignorantus.com/box_sse2/, published under * the https://creativecommons.org/publicdomain/zero/1.0/ licence. 
@@ -336,123 +178,8 @@ inline void gpcFilterTauNaive(uint8_t* in, * @param[in] height The height * @param[in] numThreads number of threads to use */ -inline void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { - assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - boxNaive(in, blurred, width, height); -#else - auto boxFilterSegment = [&](int start, int end) { - int x, y; - __m128i one_third; - __m128i *dst0, *dst1; - __m128i zero = _mm_setzero_si128(); - - one_third = _mm_set1_epi16( - 21846); // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit - dst0 = (__m128i*)(blurred + width * (start)); - dst1 = (__m128i*)(blurred + width * (start + 1)); - for (y = start; y < end; - y += 2) { // We compute results for two rows in one iteration - const uint8_t *row0, *row1, *row2, *row3; - - row1 = in + y * width; - row0 = row1 - width; - row2 = row1 + width; - row3 = row2 + width; - - for (x = 0; x < width; x += 16) { - __m128i s00, s01, s02; - __m128i r00, r01, r02; - __m128i ra00, ra01, ra02; - __m128i rb00, rb01, rb02; +void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads); - __m128i a00, a01, a02, b00, b01, b02; - - __m128i tmp0, tmp1, res; - - s00 = _mm_loadu_si128((__m128i*)(row0 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row0 + 1)); - s02 = _mm_load_si128((__m128i*)(row0)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - - ra00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - s00 = _mm_loadu_si128((__m128i*)(row1 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row1 + 1)); - s02 = _mm_load_si128((__m128i*)(row1)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - - ra01 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb01 = _mm_mulhi_epi16( - 
_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - s00 = _mm_loadu_si128((__m128i*)(row2 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row2 + 1)); - s02 = _mm_load_si128((__m128i*)(row2)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - - ra02 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb02 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - tmp0 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), - one_third); - tmp1 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), - one_third); - - pack16to8(tmp0, tmp1, res); - _mm_store_si128(dst0++, res); - - s00 = _mm_loadu_si128((__m128i*)(row3 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row3 + 1)); - s02 = _mm_load_si128((__m128i*)(row3)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - ra00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - tmp0 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), - one_third); - tmp1 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), - one_third); - - pack16to8(tmp0, tmp1, res); - _mm_store_si128(dst1++, res); - - row0 += 16; - row1 += 16; - row2 += 16; - row3 += 16; - } - // still storing 128bit, but now in 16 x 8bit format, so /16 instead - // of /8 - dst0 += width / 16; - dst1 += width / 16; - } - }; // lambda - - boxFilterSegment(1, height - 3); - // parFor(boxFilterSegment,1,height-3,4); -#endif -} /** * @brief 3x3 Sobel filter. 
Input dimension must be multiple of 16 * @@ -464,158 +191,12 @@ inline void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThr * @param[in] numThreads number of threads to use */ -inline void sobel(uint8_t* in, +void sobel(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold, - int numThreads) { - assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - sobelNaive(in, blurred, width, height, threshold); -#else - auto sobelSSESegment = [&](int start, int end) { - __m128i one_third, one_ninth, one, two, mone, mtwo, binThres; - __m128i *dst0, *dst1; - __m128i zero = _mm_setzero_si128(); - - int x, y; - one_third = _mm_set1_epi16( - 21846); // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit - one_ninth = _mm_set1_epi16(7282); // 2^16/9+1. For 16bit ints. - - binThres = _mm_set1_epi16(threshold * threshold); - - dst0 = (__m128i*)(blurred + width * 1); - // dst1 = (__m128i *)(blurred + width * 2); - for (y = start; y < end; - y++) { // We compute results for two rows in one iteration - const uint8_t *row0, *row1, *row2; - - row1 = in + y * width; - row0 = row1 - width; - row2 = row1 + width; - - for (x = 0; x < width; x += 16) { - // Note: Center element not used in sobel kernels!! 
- // Kernel indices: - // 00 01 02 - // 10 11 12 - // 20 21 22 - - __m128i a00, a01, a02, a10, a12, a20, a21, a22; - __m128i b00, b01, b02, b10, b12, b20, b21, b22; - - __m128i raA, raB, rbA, rbB; - __m128i tmpa, tmpb, sya, syb, sxa, sxb, res; - - unpack8to16(_mm_loadu_si128((__m128i*)(row0 - 1)), a00, b00); - unpack8to16(_mm_load_si128((__m128i*)(row0)), a01, b01); - unpack8to16(_mm_loadu_si128((__m128i*)(row0 + 1)), a02, b02); - - unpack8to16(_mm_loadu_si128((__m128i*)(row1 - 1)), a10, b10); - unpack8to16(_mm_loadu_si128((__m128i*)(row1 + 1)), a12, b12); - - unpack8to16(_mm_loadu_si128((__m128i*)(row2 - 1)), a20, b20); - unpack8to16(_mm_load_si128((__m128i*)(row2)), a21, b21); - unpack8to16(_mm_loadu_si128((__m128i*)(row2 + 1)), a22, b22); - - // Sobel kernels for x and y direction. - // 1 0 -1 1 2 1 - // sx = 2 0 -2 sy = 0 0 0 - // 1 0 -1 -1-2-1 - // Note that neither kernel uses the center element) - - // In the following, mullo is used to multiply intermediate - // results with -1 To divide by 3, 16bit overflow divide by - // multiply is used, which thus uses the upper 16bit(_mm_mulhi) - // of the 32bit temporary result. 
- - // sx column kernel vectors (1,2,1) - // Two chained add/sub are used for 2 and -2 - raA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), a10), - a10), - one_ninth); - rbA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), b10), - b10), - one_ninth); - - // sx column kernel vector (-1 -2 -1) - raB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), a12), - a12), - one_ninth); - rbB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), b12), - b12), - one_ninth); - - // Square of sx: Add squares of above temporaries into final sum - tmpa = _mm_sub_epi16(raA, raB); - tmpb = _mm_sub_epi16(rbA, rbB); - - sxa = _mm_mullo_epi16(tmpa, tmpa); - sxb = _mm_mullo_epi16(tmpb, tmpb); - - // sy row kernel vector (1,2,1) - // Two chained add are used for 2 and -2 - raA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), a01), - a01), - one_ninth); - rbA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), b01), - b01), - one_ninth); - - // sy row kernel vector (-1 -2 -1) - raB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), a21), - a21), - one_ninth); - rbB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), b21), - b21), - one_ninth); - - // Square of sx: Add squares of above temporaries into final sum - tmpa = _mm_sub_epi16(raA, raB); - tmpb = _mm_sub_epi16(rbA, rbB); - - // watch out, can't overwrite this - sya = _mm_mullo_epi16(tmpa, tmpa); - syb = _mm_mullo_epi16(tmpb, tmpb); - - __m128i zero = _mm_setzero_si128(); - - // The unpacklo is necessary because _mm_cmput_epi16 sets the - // output to 0xFFFF if the comparison is true. When packing - // 16bit to 8bit however, 0xFFFF will be interpreted (in a - // signed environment) as being negative, and hence set to 0, - // resulting in a 0 output everywhere. 
using unpacklo in between - // we get 0xFFFF->0xFF - pack16to8( - _mm_unpacklo_epi8( - _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), - zero), - _mm_unpacklo_epi8( - _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), - zero), - res); - - _mm_store_si128(dst0++, res); - - row0 += 16; - row1 += 16; - row2 += 16; - } // cols - } // rows - }; // Lambda - sobelSSESegment(1, height - 3); -#endif -} + int numThreads); /** * @brief Checks if the 128bits in xmm are all zero @@ -625,10 +206,7 @@ inline void sobel(uint8_t* in, * @return true if all zeros, false otherwise */ #ifdef _INTRINSICS_SSE -inline bool isAllZeros(__m128i xmm) { - return _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, _mm_setzero_si128())) == - 0xFFFF; -} +bool isAllZeros(__m128i xmm); #endif /** * @brief Applies a gpc filter defined by the pixel-difference tests in @@ -645,79 +223,15 @@ inline bool isAllZeros(__m128i xmm) { * @param height The height of the image at pointer *in * @param numThreadsNumber of threads to use */ -inline void gpcFilter(uint8_t* in, +void gpcFilter(uint8_t* in, const uint8_t* grad, uint32_t* gpc, std::vector fastmask, std::vector& idx, int width, int height, - int numThreads) { - assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height); -#else - auto gpcFilterSegment = [&](int start, int end) { - __m128i zero = _mm_set1_epi8(0); - __m128i one = _mm_set1_epi8(1); - for (int y = start; y < end; y++) { - for (int x = 0; x < width; x += 16) { - uint8_t* rowPtr; - rowPtr = in + (y - 2) * width + x; - __m128i out[4]; // temporary output vector of 4 128bit words - - const uint8_t* center = (in + y * width + x); - const uint8_t* centerGrad = (grad + y * width + x); - // We only process the current segment if there are any non-zero - // values (high gradient pixels) - if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) { - __m128i* dst = - (__m128i*)(gpc + y * width + - x); // Set starting 
point to pixel (2,2) - out[0] = zero; - out[1] = zero; - out[2] = zero; - out[3] = zero; - uint8_t k = 0; - __m128i bitMask = one; - for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { - out[k] |= _mm_and_si128( - _mm_cmpgt_epu8( - _mm_lddqu_si128( - (__m128i*)(center + fastmask[i])), - _mm_lddqu_si128( - (__m128i*)(center + fastmask[i + 1]))), - bitMask); - // Keeps index into output vector and updates bit mask - if (i % 16 == 0 && i != 0) { - bitMask = one; - k++; - } else { - bitMask += bitMask; - } - } - // 8bit to 16bit - __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]); - __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]); - __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]); - __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]); - - // 16bit to 32bit ints - _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); - _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); - _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); - _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); - } - } // col iteration - } // row iteration - }; + int numThreads); - if (numThreads == 1) - gpcFilterSegment(13, height - 15); - else - parFor(gpcFilterSegment, 13, height - 15, 4); -#endif -} /** * @brief Applies a gpc filter defined by the pixel-difference tests in * fastmask. 
Additionally uses a threshold vector (tau) @@ -731,7 +245,7 @@ inline void gpcFilter(uint8_t* in, * @param height The height of the image at pointer *in * @param numThreads Number of threads to use */ -inline void gpcFilterTau(uint8_t* in, +void gpcFilterTau(uint8_t* in, const uint8_t* grad, uint32_t* gpc, std::vector fastmask, @@ -739,75 +253,7 @@ inline void gpcFilterTau(uint8_t* in, std::vector& idx, int width, int height, - int numThreads) { - assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height); -#else - auto gpcFilterSegment = [&](int start, int end) { - __m128i zero = _mm_set1_epi8(0); - __m128i one = _mm_set1_epi8(1); - for (int y = start; y < end; y++) { - for (int x = 0; x < width; x += 16) { - uint8_t* rowPtr; - rowPtr = in + (y - 2) * width + x; - __m128i out[4]; // temporary output vector of 4 128bit words - - const uint8_t* center = (in + y * width + x); - const uint8_t* centerGrad = (grad + y * width + x); - // We only process the current segment if there are any non-zero - // values (high gradient pixels) - if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) { - __m128i* dst = - (__m128i*)(gpc + y * width + - x); // Set starting point to pixel (2,2) - out[0] = zero; - out[1] = zero; - out[2] = zero; - out[3] = zero; - uint8_t k = 0; - __m128i bitMask = one; - for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { - out[k] |= _mm_and_si128( - _mm_cmpgt_epu8( - _mm_lddqu_si128( - (__m128i*)(center + fastmask[i])), - _mm_subs_epi8( - _mm_lddqu_si128( - (__m128i*)(center + fastmask[i + 1])), - _mm_set1_epi8(tau[i / 2])) // deduct tau - ), - bitMask); - // Keeps index into output vector and updates bit mask - if (i % 16 == 0 && i != 0) { - bitMask = one; - k++; - } else { - bitMask += bitMask; - } - } - // 8bit to 16bit - __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]); - __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]); - __m128i low1 = 
_mm_unpacklo_epi8(out[0], out[1]); - __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]); - - // 16bit to 32bit ints - _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); - _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); - _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); - _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); - } - } // col iteration - } // row iteration - }; - - if (numThreads == 1) - gpcFilterSegment(13, height - 15); - else - parFor(gpcFilterSegment, 13, height - 15, 4); -#endif -} + int numThreads); /** * @brief Naive version of 5x5 census transoform * @@ -816,30 +262,8 @@ inline void gpcFilterTau(uint8_t* in, * @param width Width of the image at *in pointer * @param height Heiht of the image at *in pointer */ -inline void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) { - uint32_t val; - uint32_t* dst; - for (int y = 2; y < height - 3; y++) { - for (int x = 0; x < width; x++) { - val = 0; - dst = census + y * width + x; - int i = 0; - // patch loops - for (int px = -2; px <= 2; px++) { - for (int py = -2; py <= 2; py++) { - if (!(px == 0 && py == 0)) { - val |= (in[(y + py) * width + (x + px)] > - in[y * width + x]) - ? (1 << i) - : 0; - i++; - } - } - } // End patch loops - *dst = val; - } - } // End pixel loops -} +void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height); + /** * @brief 5x5 dense census transform of input image. 
binary codes are returned @@ -850,151 +274,6 @@ inline void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) * @param width * @param height */ -inline void census5x5(uint8_t* in, uint32_t* census, int width, int height) { - assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - census5x5Naive(in, census, width, height); -#else - __m128i zero = _mm_set1_epi8(0); - __m128i one = _mm_set1_epi8(1); - - for (int y = 2; y < height - 3; y++) { - for (int x = 0; x < width; x += 16) { - uint8_t* rowPtr; - rowPtr = in + (y - 2) * width + x; - __m128i center = _mm_lddqu_si128((__m128i*)(in + y * width + x)); - __m128i* dst = (__m128i*)(census + y * width + - x); // Set starting point to pixel (2,2) - // row 0 - __m128i bitMask = one; - __m128i byte1 = _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 2 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask += bitMask; // 4 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), - bitMask); - bitMask += bitMask; // 8 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 16 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - - // row 1 - rowPtr += width; - bitMask += bitMask; // 32 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 64 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask += bitMask; // 128 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), - bitMask); - bitMask = one; // 1 - __m128i byte2 = _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += 
bitMask; // 2 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - - // row 2 - rowPtr += width; - bitMask += bitMask; // 4 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 8 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask += bitMask; // 16 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 32 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - - // row 3 - rowPtr += width; - bitMask += bitMask; // 64 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 128 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask = one; // 1 - __m128i byte3 = _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), - bitMask); - bitMask += bitMask; // 2 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 4 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - - // row 4 - rowPtr += width; - bitMask += bitMask; // 8 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 16 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask += bitMask; // 32 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), - bitMask); - bitMask += bitMask; // 64 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 128 - byte3 |= _mm_and_si128( - 
_mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - - // 8bit to 16bit - __m128i high1 = _mm_unpacklo_epi8(byte3, zero); - __m128i high2 = _mm_unpackhi_epi8(byte3, zero); - __m128i low1 = _mm_unpacklo_epi8(byte1, byte2); - __m128i low2 = _mm_unpackhi_epi8(byte1, byte2); - - // 16bit to 32bit ints - _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); - _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); - _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); - _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); - - } // col iteration - } // row iteration - // if(numThreads == 1) - // gpcFilterSegment(13,height-15); - // else - // parFor(gpcFilterSegment,13,height-15,4); - -#endif -} // census5x5 +void census5x5(uint8_t* in, uint32_t* census, int width, int height); } // namespace ndb #endif diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp index 6834f61..3d5f19b 100644 --- a/samples/sparsematch.cpp +++ b/samples/sparsematch.cpp @@ -32,7 +32,7 @@ int main(int argc, char** argv) { gpc::inference::InferenceSettings inferencesettings = gpc::inference::InferenceSettings() .builder() - .gradientThreshold(5) + .gradientThreshold(20) .verticalTolerance( 0) // 0px tolerance for rectified epipolar matches .dispHigh(128) // limit disparities to 128 From d4dcdf6327f931d2b695becc5f7c692abaf7e30e Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 15 Feb 2026 09:03:49 +0100 Subject: [PATCH 06/36] add filter cpp --- lib/gpc/filter.cpp | 833 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 833 insertions(+) create mode 100644 lib/gpc/filter.cpp diff --git a/lib/gpc/filter.cpp b/lib/gpc/filter.cpp new file mode 100644 index 0000000..d03c0af --- /dev/null +++ b/lib/gpc/filter.cpp @@ -0,0 +1,833 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. 
+// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) +#ifndef __NDB__FILTER +#define __NDB__FILTER + +#include +#include + +#include "gpc/filter.hpp" +using namespace std; + +namespace ndb { +void arr2ind(const unsigned char* a, + int n, + int* ind, + int* m) { +#ifdef _INTRINSICS_SSE + int i, m0, k; + __m256i msk; + m0 = 0; + for (i = 0; i < n; i = i + 32) { /* Load 32 bytes and compare with zero: */ + msk = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i*)&a[i]), + _mm256_setzero_si256()); + k = _mm256_movemask_epi8(msk); + k = ~k; /* Search for nonzero bits instead of zero bits. */ + while (k) { + ind[m0] = + i + _tzcnt_u32( + k); /* Count the number of trailing zero bits in k. */ + m0++; + k = _blsr_u32(k); /* Clear the lowest set bit in k. */ + } + } + *m = m0; +#else + int nnz = 0; + for (int i = 0; i < n; i++) { + if (a[i] != 0) { + nnz++; + *ind = i; + ind++; + } + } + *m = nnz; +#endif +} +#ifdef _INTRINSICS_SSE +void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) { + __m128i zero = _mm_setzero_si128(); + y0 = _mm_unpacklo_epi8(x, zero); + y1 = _mm_unpackhi_epi8(x, zero); +} +void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) { + y = _mm_packus_epi16(x0, x1); +} + +#endif +void parFor(std::function const& f, + int start, + int end, + int nThreads) { + // Range definition + // quantities derived from range + int segSize = (end - start) / nThreads; + int lastSeg = (end - start) % nThreads; + + std::vector threads; + threads.reserve(nThreads); + + // Spawn threads + for (int t = 0; t < nThreads - 1; t++) { + threads.emplace_back(f, start + t * segSize, start + (t + 1) * segSize); + } + threads.emplace_back(f, + start + (nThreads - 1) * segSize, + start + (nThreads)*segSize + lastSeg); + // Join + for (auto& t : threads) t.join(); +} + +void sobelNaive( + uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) { + assert(width % 16 == 0 && "width must be multiple of 16!"); + int thresholdSq = threshold * threshold; + 
uint8_t* ptr = in; + + uint8_t* p11 = ptr + 0 * width; + uint8_t* p12 = ptr + 0 * width + 1; + uint8_t* p13 = ptr + 0 * width + 2; + + uint8_t* p21 = ptr + 1 * width; + uint8_t* p22 = ptr + 1 * width + 1; + uint8_t* p23 = ptr + 1 * width + 2; + + uint8_t* p31 = ptr + 2 * width; + uint8_t* p32 = ptr + 2 * width + 1; + uint8_t* p33 = ptr + 2 * width + 2; + + // output pointer + uint8_t* optr = gradient + 1 * width + 1; + // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating + // boundary) (unoptimized) + for (int iy = 1; iy < height - 1; iy++) { + for (int ix = 0; ix < width; ix++) { + int sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9; + int sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9; + + int val = sx * sx + sy * sy; + + *optr = val > thresholdSq ? 255 : 0; + p11++; + p12++; + p13++; + p21++; + p22++; + p23++; + p31++; + p32++; + p33++; + optr++; + } + } +} +void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { + assert(width % 16 == 0 && "width must be multiple of 16!"); + // allocate space for result + uint8_t* ptr = in; + uint8_t* p11 = ptr + 0 * width; + uint8_t* p12 = ptr + 0 * width + 1; + uint8_t* p13 = ptr + 0 * width + 2; + + uint8_t* p21 = ptr + 1 * width; + uint8_t* p22 = ptr + 1 * width + 1; + uint8_t* p23 = ptr + 1 * width + 2; + + uint8_t* p31 = ptr + 2 * width; + uint8_t* p32 = ptr + 2 * width + 1; + uint8_t* p33 = ptr + 2 * width + 2; + uint8_t* optr = blurred + 1 * width + 1; + + // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating + // boundary) (unoptimized) + for (int iy = 1; iy < height - 1; iy++) { + for (int ix = 0; ix < width; ix++) { + int res = + (*p11 + *p12 + *p13 + *p21 + *p22 + *p23 + *p31 + *p32 + *p33) / + 9; + *optr = res; + p11++; + p12++; + p13++; + p21++; + p22++; + p23++; + p31++; + p32++; + p33++; + optr++; + } + } +} +void gpcFilterNaive(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector& idx, + int 
width, + int height) { + // output buffer of same size + uint32_t tmp; + + int j = 0; + for (auto k : idx) { + tmp = 0; + for (uint8_t i = 0; i < fastmask.size(); i += 2) { + tmp <<= 1; // shift by one + if (*(in + k + fastmask[i]) > *(in + k + fastmask[i + 1])) + tmp++; // set this test's result to 1 + } + gpc[k] = tmp; + j++; + } +} + +void gpcFilterTauNaive(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector tau, + std::vector& idx, + int width, + int height) { + uint32_t tmp; + + int j = 0; + for (auto k : idx) { + tmp = 0; + for (uint8_t i = 0; i < fastmask.size(); i += 2) { + tmp <<= 1; // shift by one + if (*(in + k + fastmask[i]) > + *(in + k + fastmask[i + 1]) - tau[i / 2]) + tmp++; // set this test's result to 1 + } + gpc[k] = tmp; + j++; + } +} +void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { + assert(width % 16 == 0 && "width must be multiple of 16!"); +#ifndef _INTRINSICS_SSE + boxNaive(in, blurred, width, height); +#else + auto boxFilterSegment = [&](int start, int end) { + int x, y; + __m128i one_third; + __m128i *dst0, *dst1; + __m128i zero = _mm_setzero_si128(); + + one_third = _mm_set1_epi16( + 21846); // 2^16/3+1. For 16bit ints. 
2^8/3+1=86.33 for 8bit + dst0 = (__m128i*)(blurred + width * (start)); + dst1 = (__m128i*)(blurred + width * (start + 1)); + for (y = start; y < end; + y += 2) { // We compute results for two rows in one iteration + const uint8_t *row0, *row1, *row2, *row3; + + row1 = in + y * width; + row0 = row1 - width; + row2 = row1 + width; + row3 = row2 + width; + + for (x = 0; x < width; x += 16) { + __m128i s00, s01, s02; + __m128i r00, r01, r02; + __m128i ra00, ra01, ra02; + __m128i rb00, rb01, rb02; + + __m128i a00, a01, a02, b00, b01, b02; + + __m128i tmp0, tmp1, res; + + s00 = _mm_loadu_si128((__m128i*)(row0 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row0 + 1)); + s02 = _mm_load_si128((__m128i*)(row0)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + + ra00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + s00 = _mm_loadu_si128((__m128i*)(row1 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row1 + 1)); + s02 = _mm_load_si128((__m128i*)(row1)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + + ra01 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb01 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + s00 = _mm_loadu_si128((__m128i*)(row2 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row2 + 1)); + s02 = _mm_load_si128((__m128i*)(row2)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + + ra02 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb02 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + tmp0 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), + one_third); + tmp1 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), + one_third); + + pack16to8(tmp0, tmp1, res); 
+ _mm_store_si128(dst0++, res); + + s00 = _mm_loadu_si128((__m128i*)(row3 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row3 + 1)); + s02 = _mm_load_si128((__m128i*)(row3)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + ra00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + tmp0 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), + one_third); + tmp1 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), + one_third); + + pack16to8(tmp0, tmp1, res); + _mm_store_si128(dst1++, res); + + row0 += 16; + row1 += 16; + row2 += 16; + row3 += 16; + } + // still storing 128bit, but now in 16 x 8bit format, so /16 instead + // of /8 + dst0 += width / 16; + dst1 += width / 16; + } + }; // lambda + + boxFilterSegment(1, height - 3); + // parFor(boxFilterSegment,1,height-3,4); +#endif +} +void sobel(uint8_t* in, + uint8_t* blurred, + int width, + int height, + uint8_t threshold, + int numThreads) { + assert(width % 16 == 0 && "width must be multiple of 16!"); +#ifndef _INTRINSICS_SSE + sobelNaive(in, blurred, width, height, threshold); +#else + auto sobelSSESegment = [&](int start, int end) { + __m128i one_third, one_ninth, one, two, mone, mtwo, binThres; + __m128i *dst0, *dst1; + __m128i zero = _mm_setzero_si128(); + + int x, y; + one_third = _mm_set1_epi16( + 21846); // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit + one_ninth = _mm_set1_epi16(7282); // 2^16/9+1. For 16bit ints. 
+ + binThres = _mm_set1_epi16(threshold * threshold); + + dst0 = (__m128i*)(blurred + width * 1); + // dst1 = (__m128i *)(blurred + width * 2); + for (y = start; y < end; + y++) { // We compute results for two rows in one iteration + const uint8_t *row0, *row1, *row2; + + row1 = in + y * width; + row0 = row1 - width; + row2 = row1 + width; + + for (x = 0; x < width; x += 16) { + // Note: Center element not used in sobel kernels!! + // Kernel indices: + // 00 01 02 + // 10 11 12 + // 20 21 22 + + __m128i a00, a01, a02, a10, a12, a20, a21, a22; + __m128i b00, b01, b02, b10, b12, b20, b21, b22; + + __m128i raA, raB, rbA, rbB; + __m128i tmpa, tmpb, sya, syb, sxa, sxb, res; + + unpack8to16(_mm_loadu_si128((__m128i*)(row0 - 1)), a00, b00); + unpack8to16(_mm_load_si128((__m128i*)(row0)), a01, b01); + unpack8to16(_mm_loadu_si128((__m128i*)(row0 + 1)), a02, b02); + + unpack8to16(_mm_loadu_si128((__m128i*)(row1 - 1)), a10, b10); + unpack8to16(_mm_loadu_si128((__m128i*)(row1 + 1)), a12, b12); + + unpack8to16(_mm_loadu_si128((__m128i*)(row2 - 1)), a20, b20); + unpack8to16(_mm_load_si128((__m128i*)(row2)), a21, b21); + unpack8to16(_mm_loadu_si128((__m128i*)(row2 + 1)), a22, b22); + + // Sobel kernels for x and y direction. + // 1 0 -1 1 2 1 + // sx = 2 0 -2 sy = 0 0 0 + // 1 0 -1 -1-2-1 + // Note that neither kernel uses the center element) + + // In the following, mullo is used to multiply intermediate + // results with -1 To divide by 3, 16bit overflow divide by + // multiply is used, which thus uses the upper 16bit(_mm_mulhi) + // of the 32bit temporary result. 
+ + // sx column kernel vectors (1,2,1) + // Two chained add/sub are used for 2 and -2 + raA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), a10), + a10), + one_ninth); + rbA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), b10), + b10), + one_ninth); + + // sx column kernel vector (-1 -2 -1) + raB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), a12), + a12), + one_ninth); + rbB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), b12), + b12), + one_ninth); + + // Square of sx: Add squares of above temporaries into final sum + tmpa = _mm_sub_epi16(raA, raB); + tmpb = _mm_sub_epi16(rbA, rbB); + + sxa = _mm_mullo_epi16(tmpa, tmpa); + sxb = _mm_mullo_epi16(tmpb, tmpb); + + // sy row kernel vector (1,2,1) + // Two chained add are used for 2 and -2 + raA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), a01), + a01), + one_ninth); + rbA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), b01), + b01), + one_ninth); + + // sy row kernel vector (-1 -2 -1) + raB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), a21), + a21), + one_ninth); + rbB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), b21), + b21), + one_ninth); + + // Square of sx: Add squares of above temporaries into final sum + tmpa = _mm_sub_epi16(raA, raB); + tmpb = _mm_sub_epi16(rbA, rbB); + + // watch out, can't overwrite this + sya = _mm_mullo_epi16(tmpa, tmpa); + syb = _mm_mullo_epi16(tmpb, tmpb); + + __m128i zero = _mm_setzero_si128(); + + // The unpacklo is necessary because _mm_cmput_epi16 sets the + // output to 0xFFFF if the comparison is true. When packing + // 16bit to 8bit however, 0xFFFF will be interpreted (in a + // signed environment) as being negative, and hence set to 0, + // resulting in a 0 output everywhere. 
using unpacklo in between + // we get 0xFFFF->0xFF + pack16to8( + _mm_unpacklo_epi8( + _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), + zero), + _mm_unpacklo_epi8( + _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), + zero), + res); + + _mm_store_si128(dst0++, res); + + row0 += 16; + row1 += 16; + row2 += 16; + } // cols + } // rows + }; // Lambda + sobelSSESegment(1, height - 3); +#endif +} + +#ifdef _INTRINSICS_SSE +bool isAllZeros(__m128i xmm) { + return _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, _mm_setzero_si128())) == + 0xFFFF; +} +#endif +void gpcFilter(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector& idx, + int width, + int height, + int numThreads) { + assert(width % 16 == 0 && "width must be multiple of 16!"); +#ifndef _INTRINSICS_SSE + gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height); +#else + auto gpcFilterSegment = [&](int start, int end) { + __m128i zero = _mm_set1_epi8(0); + __m128i one = _mm_set1_epi8(1); + for (int y = start; y < end; y++) { + for (int x = 0; x < width; x += 16) { + uint8_t* rowPtr; + rowPtr = in + (y - 2) * width + x; + __m128i out[4]; // temporary output vector of 4 128bit words + + const uint8_t* center = (in + y * width + x); + const uint8_t* centerGrad = (grad + y * width + x); + // We only process the current segment if there are any non-zero + // values (high gradient pixels) + if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) { + __m128i* dst = + (__m128i*)(gpc + y * width + + x); // Set starting point to pixel (2,2) + out[0] = zero; + out[1] = zero; + out[2] = zero; + out[3] = zero; + uint8_t k = 0; + __m128i bitMask = one; + for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { + out[k] |= _mm_and_si128( + _mm_cmpgt_epu8( + _mm_lddqu_si128( + (__m128i*)(center + fastmask[i])), + _mm_lddqu_si128( + (__m128i*)(center + fastmask[i + 1]))), + bitMask); + // Keeps index into output vector and updates bit mask + if (i % 16 == 0 && i != 0) { + bitMask = one; 
+ k++; + } else { + bitMask += bitMask; + } + } + // 8bit to 16bit + __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]); + __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]); + __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]); + __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]); + + // 16bit to 32bit ints + _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); + _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); + _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); + _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); + } + } // col iteration + } // row iteration + }; + + if (numThreads == 1) + gpcFilterSegment(13, height - 15); + else + parFor(gpcFilterSegment, 13, height - 15, 4); +#endif +} +void gpcFilterTau(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector tau, + std::vector& idx, + int width, + int height, + int numThreads) { + assert(width % 16 == 0 && "width must be multiple of 16!"); +#ifndef _INTRINSICS_SSE + gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height); +#else + auto gpcFilterSegment = [&](int start, int end) { + __m128i zero = _mm_set1_epi8(0); + __m128i one = _mm_set1_epi8(1); + for (int y = start; y < end; y++) { + for (int x = 0; x < width; x += 16) { + uint8_t* rowPtr; + rowPtr = in + (y - 2) * width + x; + __m128i out[4]; // temporary output vector of 4 128bit words + + const uint8_t* center = (in + y * width + x); + const uint8_t* centerGrad = (grad + y * width + x); + // We only process the current segment if there are any non-zero + // values (high gradient pixels) + if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) { + __m128i* dst = + (__m128i*)(gpc + y * width + + x); // Set starting point to pixel (2,2) + out[0] = zero; + out[1] = zero; + out[2] = zero; + out[3] = zero; + uint8_t k = 0; + __m128i bitMask = one; + for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { + out[k] |= _mm_and_si128( + _mm_cmpgt_epu8( + _mm_lddqu_si128( + 
(__m128i*)(center + fastmask[i])), + _mm_subs_epi8( + _mm_lddqu_si128( + (__m128i*)(center + fastmask[i + 1])), + _mm_set1_epi8(tau[i / 2])) // deduct tau + ), + bitMask); + // Keeps index into output vector and updates bit mask + if (i % 16 == 0 && i != 0) { + bitMask = one; + k++; + } else { + bitMask += bitMask; + } + } + // 8bit to 16bit + __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]); + __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]); + __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]); + __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]); + + // 16bit to 32bit ints + _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); + _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); + _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); + _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); + } + } // col iteration + } // row iteration + }; + + if (numThreads == 1) + gpcFilterSegment(13, height - 15); + else + parFor(gpcFilterSegment, 13, height - 15, 4); +#endif +} +void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) { + uint32_t val; + uint32_t* dst; + for (int y = 2; y < height - 3; y++) { + for (int x = 0; x < width; x++) { + val = 0; + dst = census + y * width + x; + int i = 0; + // patch loops + for (int px = -2; px <= 2; px++) { + for (int py = -2; py <= 2; py++) { + if (!(px == 0 && py == 0)) { + val |= (in[(y + py) * width + (x + px)] > + in[y * width + x]) + ? 
(1 << i) + : 0; + i++; + } + } + } // End patch loops + *dst = val; + } + } // End pixel loops +} +void census5x5(uint8_t* in, uint32_t* census, int width, int height) { + assert(width % 16 == 0 && "width must be multiple of 16!"); +#ifndef _INTRINSICS_SSE + census5x5Naive(in, census, width, height); +#else + __m128i zero = _mm_set1_epi8(0); + __m128i one = _mm_set1_epi8(1); + + for (int y = 2; y < height - 3; y++) { + for (int x = 0; x < width; x += 16) { + uint8_t* rowPtr; + rowPtr = in + (y - 2) * width + x; + __m128i center = _mm_lddqu_si128((__m128i*)(in + y * width + x)); + __m128i* dst = (__m128i*)(census + y * width + + x); // Set starting point to pixel (2,2) + // row 0 + __m128i bitMask = one; + __m128i byte1 = _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 2 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask += bitMask; // 4 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), + bitMask); + bitMask += bitMask; // 8 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 16 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), + bitMask); + + // row 1 + rowPtr += width; + bitMask += bitMask; // 32 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 64 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask += bitMask; // 128 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), + bitMask); + bitMask = one; // 1 + __m128i byte2 = _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 2 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, 
_mm_lddqu_si128((__m128i*)(rowPtr + 2))), + bitMask); + + // row 2 + rowPtr += width; + bitMask += bitMask; // 4 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 8 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask += bitMask; // 16 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 32 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), + bitMask); + + // row 3 + rowPtr += width; + bitMask += bitMask; // 64 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 128 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask = one; // 1 + __m128i byte3 = _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), + bitMask); + bitMask += bitMask; // 2 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 4 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), + bitMask); + + // row 4 + rowPtr += width; + bitMask += bitMask; // 8 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 16 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask += bitMask; // 32 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), + bitMask); + bitMask += bitMask; // 64 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 128 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), + 
bitMask); + + // 8bit to 16bit + __m128i high1 = _mm_unpacklo_epi8(byte3, zero); + __m128i high2 = _mm_unpackhi_epi8(byte3, zero); + __m128i low1 = _mm_unpacklo_epi8(byte1, byte2); + __m128i low2 = _mm_unpackhi_epi8(byte1, byte2); + + // 16bit to 32bit ints + _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); + _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); + _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); + _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); + + } // col iteration + } // row iteration + // if(numThreads == 1) + // gpcFilterSegment(13,height-15); + // else + // parFor(gpcFilterSegment,13,height-15,4); + +#endif +} // census5x5 +} // namespace ndb +#endif From b682d4415fb1e9968cd15712a1278bfc82cb66f5 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 15 Feb 2026 09:20:22 +0100 Subject: [PATCH 07/36] break out sobel kernel --- CMakeLists.txt | 1 + lib/gpc/filter.cpp | 193 -------------------------------- lib/gpc/filter.hpp | 31 ------ lib/gpc/forest.cpp | 1 + lib/gpc/kernels/sobel.cpp | 229 ++++++++++++++++++++++++++++++++++++++ lib/gpc/kernels/sobel.hpp | 70 ++++++++++++ 6 files changed, 301 insertions(+), 224 deletions(-) create mode 100644 lib/gpc/kernels/sobel.cpp create mode 100644 lib/gpc/kernels/sobel.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 60f919e..8f7182d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -42,6 +42,7 @@ add_library(gpc_core lib/gpc/fern.cpp lib/gpc/feature.cpp lib/gpc/filter.cpp + lib/gpc/kernels/sobel.cpp ) target_link_libraries(gpc_core PUBLIC diff --git a/lib/gpc/filter.cpp b/lib/gpc/filter.cpp index d03c0af..cbe18b9 100644 --- a/lib/gpc/filter.cpp +++ b/lib/gpc/filter.cpp @@ -106,49 +106,7 @@ void parFor(std::function const& f, for (auto& t : threads) t.join(); } -void sobelNaive( - uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) { - assert(width % 16 == 0 && "width must be multiple of 16!"); - int thresholdSq = threshold * threshold; - 
uint8_t* ptr = in; - - uint8_t* p11 = ptr + 0 * width; - uint8_t* p12 = ptr + 0 * width + 1; - uint8_t* p13 = ptr + 0 * width + 2; - uint8_t* p21 = ptr + 1 * width; - uint8_t* p22 = ptr + 1 * width + 1; - uint8_t* p23 = ptr + 1 * width + 2; - - uint8_t* p31 = ptr + 2 * width; - uint8_t* p32 = ptr + 2 * width + 1; - uint8_t* p33 = ptr + 2 * width + 2; - - // output pointer - uint8_t* optr = gradient + 1 * width + 1; - // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating - // boundary) (unoptimized) - for (int iy = 1; iy < height - 1; iy++) { - for (int ix = 0; ix < width; ix++) { - int sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9; - int sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9; - - int val = sx * sx + sy * sy; - - *optr = val > thresholdSq ? 255 : 0; - p11++; - p12++; - p13++; - p21++; - p22++; - p23++; - p31++; - p32++; - p33++; - optr++; - } - } -} void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { assert(width % 16 == 0 && "width must be multiple of 16!"); // allocate space for result @@ -350,158 +308,7 @@ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { // parFor(boxFilterSegment,1,height-3,4); #endif } -void sobel(uint8_t* in, - uint8_t* blurred, - int width, - int height, - uint8_t threshold, - int numThreads) { - assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - sobelNaive(in, blurred, width, height, threshold); -#else - auto sobelSSESegment = [&](int start, int end) { - __m128i one_third, one_ninth, one, two, mone, mtwo, binThres; - __m128i *dst0, *dst1; - __m128i zero = _mm_setzero_si128(); - int x, y; - one_third = _mm_set1_epi16( - 21846); // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit - one_ninth = _mm_set1_epi16(7282); // 2^16/9+1. For 16bit ints. 
- - binThres = _mm_set1_epi16(threshold * threshold); - - dst0 = (__m128i*)(blurred + width * 1); - // dst1 = (__m128i *)(blurred + width * 2); - for (y = start; y < end; - y++) { // We compute results for two rows in one iteration - const uint8_t *row0, *row1, *row2; - - row1 = in + y * width; - row0 = row1 - width; - row2 = row1 + width; - - for (x = 0; x < width; x += 16) { - // Note: Center element not used in sobel kernels!! - // Kernel indices: - // 00 01 02 - // 10 11 12 - // 20 21 22 - - __m128i a00, a01, a02, a10, a12, a20, a21, a22; - __m128i b00, b01, b02, b10, b12, b20, b21, b22; - - __m128i raA, raB, rbA, rbB; - __m128i tmpa, tmpb, sya, syb, sxa, sxb, res; - - unpack8to16(_mm_loadu_si128((__m128i*)(row0 - 1)), a00, b00); - unpack8to16(_mm_load_si128((__m128i*)(row0)), a01, b01); - unpack8to16(_mm_loadu_si128((__m128i*)(row0 + 1)), a02, b02); - - unpack8to16(_mm_loadu_si128((__m128i*)(row1 - 1)), a10, b10); - unpack8to16(_mm_loadu_si128((__m128i*)(row1 + 1)), a12, b12); - - unpack8to16(_mm_loadu_si128((__m128i*)(row2 - 1)), a20, b20); - unpack8to16(_mm_load_si128((__m128i*)(row2)), a21, b21); - unpack8to16(_mm_loadu_si128((__m128i*)(row2 + 1)), a22, b22); - - // Sobel kernels for x and y direction. - // 1 0 -1 1 2 1 - // sx = 2 0 -2 sy = 0 0 0 - // 1 0 -1 -1-2-1 - // Note that neither kernel uses the center element) - - // In the following, mullo is used to multiply intermediate - // results with -1 To divide by 3, 16bit overflow divide by - // multiply is used, which thus uses the upper 16bit(_mm_mulhi) - // of the 32bit temporary result. 
- - // sx column kernel vectors (1,2,1) - // Two chained add/sub are used for 2 and -2 - raA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), a10), - a10), - one_ninth); - rbA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), b10), - b10), - one_ninth); - - // sx column kernel vector (-1 -2 -1) - raB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), a12), - a12), - one_ninth); - rbB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), b12), - b12), - one_ninth); - - // Square of sx: Add squares of above temporaries into final sum - tmpa = _mm_sub_epi16(raA, raB); - tmpb = _mm_sub_epi16(rbA, rbB); - - sxa = _mm_mullo_epi16(tmpa, tmpa); - sxb = _mm_mullo_epi16(tmpb, tmpb); - - // sy row kernel vector (1,2,1) - // Two chained add are used for 2 and -2 - raA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), a01), - a01), - one_ninth); - rbA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), b01), - b01), - one_ninth); - - // sy row kernel vector (-1 -2 -1) - raB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), a21), - a21), - one_ninth); - rbB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), b21), - b21), - one_ninth); - - // Square of sx: Add squares of above temporaries into final sum - tmpa = _mm_sub_epi16(raA, raB); - tmpb = _mm_sub_epi16(rbA, rbB); - - // watch out, can't overwrite this - sya = _mm_mullo_epi16(tmpa, tmpa); - syb = _mm_mullo_epi16(tmpb, tmpb); - - __m128i zero = _mm_setzero_si128(); - - // The unpacklo is necessary because _mm_cmput_epi16 sets the - // output to 0xFFFF if the comparison is true. When packing - // 16bit to 8bit however, 0xFFFF will be interpreted (in a - // signed environment) as being negative, and hence set to 0, - // resulting in a 0 output everywhere. 
using unpacklo in between - // we get 0xFFFF->0xFF - pack16to8( - _mm_unpacklo_epi8( - _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), - zero), - _mm_unpacklo_epi8( - _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), - zero), - res); - - _mm_store_si128(dst0++, res); - - row0 += 16; - row1 += 16; - row2 += 16; - } // cols - } // rows - }; // Lambda - sobelSSESegment(1, height - 3); -#endif -} #ifdef _INTRINSICS_SSE bool isAllZeros(__m128i xmm) { diff --git a/lib/gpc/filter.hpp b/lib/gpc/filter.hpp index 3576650..2e6454a 100644 --- a/lib/gpc/filter.hpp +++ b/lib/gpc/filter.hpp @@ -101,19 +101,6 @@ void parFor(std::function const& f, int start, int end, int nThreads); -/** - * @brief Naive 3x3 sobel filter implementation - * - * @param in input image - * @param blurred The blurred output image - * @param[in] width The width - * @param[in] height The height - * @param[in] numThreads number of threads to use - * @param threshold threshold to binarize sobel filter output - */ -void sobelNaive( - uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold); - /** * @brief Naive 3x3 box filter implementation * @@ -180,24 +167,6 @@ void gpcFilterTauNaive(uint8_t* in, */ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads); -/** - * @brief 3x3 Sobel filter. 
Input dimension must be multiple of 16 - * - * @param in { parameter_description } - * @param blurred The blurred - * @param[in] width The width - * @param[in] height The height - * @param[in] threshold The threshold - * @param[in] numThreads number of threads to use - */ - -void sobel(uint8_t* in, - uint8_t* blurred, - int width, - int height, - uint8_t threshold, - int numThreads); - /** * @brief Checks if the 128bits in xmm are all zero * diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp index 0809951..8b9d2ef 100644 --- a/lib/gpc/forest.cpp +++ b/lib/gpc/forest.cpp @@ -47,6 +47,7 @@ #include "gpc/SintelStereo.hpp" #include "gpc/buffer.hpp" #include "gpc/filter.hpp" +#include "gpc/kernels/sobel.hpp" #include "gpc/hashmatch.hpp" #include "gpc/forest.hpp" diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp new file mode 100644 index 0000000..4d56716 --- /dev/null +++ b/lib/gpc/kernels/sobel.cpp @@ -0,0 +1,229 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) + +#include "gpc/kernels/sobel.hpp" +namespace ndb { +void sobelNaive( + uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) { + assert(width % 16 == 0 && "width must be multiple of 16!"); + int thresholdSq = threshold * threshold; + uint8_t* ptr = in; + + uint8_t* p11 = ptr + 0 * width; + uint8_t* p12 = ptr + 0 * width + 1; + uint8_t* p13 = ptr + 0 * width + 2; + + uint8_t* p21 = ptr + 1 * width; + uint8_t* p22 = ptr + 1 * width + 1; + uint8_t* p23 = ptr + 1 * width + 2; + + uint8_t* p31 = ptr + 2 * width; + uint8_t* p32 = ptr + 2 * width + 1; + uint8_t* p33 = ptr + 2 * width + 2; + + // output pointer + uint8_t* optr = gradient + 1 * width + 1; + // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating + // boundary) (unoptimized) + for (int iy = 1; iy < height - 1; iy++) { + for (int ix = 0; ix < width; ix++) { + int sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9; + int sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9; + + int val = sx * sx + sy * sy; + + *optr = val > thresholdSq ? 
255 : 0; + p11++; + p12++; + p13++; + p21++; + p22++; + p23++; + p31++; + p32++; + p33++; + optr++; + } + } +} +void sobel(uint8_t* in, + uint8_t* blurred, + int width, + int height, + uint8_t threshold, + int numThreads) { + assert(width % 16 == 0 && "width must be multiple of 16!"); +#ifndef _INTRINSICS_SSE + sobelNaive(in, blurred, width, height, threshold); +#else + auto sobelSSESegment = [&](int start, int end) { + __m128i one_third, one_ninth, one, two, mone, mtwo, binThres; + __m128i *dst0, *dst1; + __m128i zero = _mm_setzero_si128(); + + int x, y; + one_third = _mm_set1_epi16( + 21846); // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit + one_ninth = _mm_set1_epi16(7282); // 2^16/9+1. For 16bit ints. + + binThres = _mm_set1_epi16(threshold * threshold); + + dst0 = (__m128i*)(blurred + width * 1); + // dst1 = (__m128i *)(blurred + width * 2); + for (y = start; y < end; + y++) { // We compute results for two rows in one iteration + const uint8_t *row0, *row1, *row2; + + row1 = in + y * width; + row0 = row1 - width; + row2 = row1 + width; + + for (x = 0; x < width; x += 16) { + // Note: Center element not used in sobel kernels!! + // Kernel indices: + // 00 01 02 + // 10 11 12 + // 20 21 22 + + __m128i a00, a01, a02, a10, a12, a20, a21, a22; + __m128i b00, b01, b02, b10, b12, b20, b21, b22; + + __m128i raA, raB, rbA, rbB; + __m128i tmpa, tmpb, sya, syb, sxa, sxb, res; + + unpack8to16(_mm_loadu_si128((__m128i*)(row0 - 1)), a00, b00); + unpack8to16(_mm_load_si128((__m128i*)(row0)), a01, b01); + unpack8to16(_mm_loadu_si128((__m128i*)(row0 + 1)), a02, b02); + + unpack8to16(_mm_loadu_si128((__m128i*)(row1 - 1)), a10, b10); + unpack8to16(_mm_loadu_si128((__m128i*)(row1 + 1)), a12, b12); + + unpack8to16(_mm_loadu_si128((__m128i*)(row2 - 1)), a20, b20); + unpack8to16(_mm_load_si128((__m128i*)(row2)), a21, b21); + unpack8to16(_mm_loadu_si128((__m128i*)(row2 + 1)), a22, b22); + + // Sobel kernels for x and y direction. 
+ // 1 0 -1 1 2 1 + // sx = 2 0 -2 sy = 0 0 0 + // 1 0 -1 -1-2-1 + // Note that neither kernel uses the center element) + + // In the following, mullo is used to multiply intermediate + // results with -1 To divide by 3, 16bit overflow divide by + // multiply is used, which thus uses the upper 16bit(_mm_mulhi) + // of the 32bit temporary result. + + // sx column kernel vectors (1,2,1) + // Two chained add/sub are used for 2 and -2 + raA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), a10), + a10), + one_ninth); + rbA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), b10), + b10), + one_ninth); + + // sx column kernel vector (-1 -2 -1) + raB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), a12), + a12), + one_ninth); + rbB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), b12), + b12), + one_ninth); + + // Square of sx: Add squares of above temporaries into final sum + tmpa = _mm_sub_epi16(raA, raB); + tmpb = _mm_sub_epi16(rbA, rbB); + + sxa = _mm_mullo_epi16(tmpa, tmpa); + sxb = _mm_mullo_epi16(tmpb, tmpb); + + // sy row kernel vector (1,2,1) + // Two chained add are used for 2 and -2 + raA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), a01), + a01), + one_ninth); + rbA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), b01), + b01), + one_ninth); + + // sy row kernel vector (-1 -2 -1) + raB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), a21), + a21), + one_ninth); + rbB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), b21), + b21), + one_ninth); + + // Square of sx: Add squares of above temporaries into final sum + tmpa = _mm_sub_epi16(raA, raB); + tmpb = _mm_sub_epi16(rbA, rbB); + + // watch out, can't overwrite this + sya = _mm_mullo_epi16(tmpa, tmpa); + syb = _mm_mullo_epi16(tmpb, tmpb); + + __m128i zero = _mm_setzero_si128(); + + // The unpacklo is necessary 
because _mm_cmput_epi16 sets the + // output to 0xFFFF if the comparison is true. When packing + // 16bit to 8bit however, 0xFFFF will be interpreted (in a + // signed environment) as being negative, and hence set to 0, + // resulting in a 0 output everywhere. using unpacklo in between + // we get 0xFFFF->0xFF + pack16to8( + _mm_unpacklo_epi8( + _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), + zero), + _mm_unpacklo_epi8( + _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), + zero), + res); + + _mm_store_si128(dst0++, res); + + row0 += 16; + row1 += 16; + row2 += 16; + } // cols + } // rows + }; // Lambda + sobelSSESegment(1, height - 3); +#endif +} +} // namespace ndb diff --git a/lib/gpc/kernels/sobel.hpp b/lib/gpc/kernels/sobel.hpp new file mode 100644 index 0000000..038408a --- /dev/null +++ b/lib/gpc/kernels/sobel.hpp @@ -0,0 +1,70 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) + +#ifndef __NDB__KERNEL_SOBEL +#define __NDB__KERNEL_SOBEL +using namespace std; + +#include "gpc/buffer.hpp" + +namespace ndb { +/** + * @brief Naive 3x3 sobel filter implementation + * + * @param in input image + * @param blurred The blurred output image + * @param[in] width The width + * @param[in] height The height + * @param[in] numThreads number of threads to use + * @param threshold threshold to binarize sobel filter output + */ +void sobelNaive( + uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold); + +/** + * @brief 3x3 Sobel filter. 
Input dimension must be multiple of 16 + * + * @param in { parameter_description } + * @param blurred The blurred + * @param[in] width The width + * @param[in] height The height + * @param[in] threshold The threshold + * @param[in] numThreads number of threads to use + */ + +void sobel(uint8_t* in, + uint8_t* blurred, + int width, + int height, + uint8_t threshold, + int numThreads); +} +#endif From 6ed662a91796c4b2009c9b45b3e5269617f364a8 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 15 Feb 2026 09:26:27 +0100 Subject: [PATCH 08/36] break out box filter --- CMakeLists.txt | 1 + lib/gpc/feature.cpp | 1 + lib/gpc/filter.cpp | 154 -------------------------------- lib/gpc/filter.hpp | 23 ----- lib/gpc/forest.cpp | 1 + lib/gpc/kernels/box.cpp | 190 ++++++++++++++++++++++++++++++++++++++++ lib/gpc/kernels/box.hpp | 65 ++++++++++++++ 7 files changed, 258 insertions(+), 177 deletions(-) create mode 100644 lib/gpc/kernels/box.cpp create mode 100644 lib/gpc/kernels/box.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f7182d..d7f1d1a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,7 @@ add_library(gpc_core lib/gpc/feature.cpp lib/gpc/filter.cpp lib/gpc/kernels/sobel.cpp + lib/gpc/kernels/box.cpp ) target_link_libraries(gpc_core PUBLIC diff --git a/lib/gpc/feature.cpp b/lib/gpc/feature.cpp index f68d440..0ffc27b 100644 --- a/lib/gpc/feature.cpp +++ b/lib/gpc/feature.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/lib/gpc/filter.cpp b/lib/gpc/filter.cpp index cbe18b9..713d0d9 100644 --- a/lib/gpc/filter.cpp +++ b/lib/gpc/filter.cpp @@ -107,44 +107,7 @@ void parFor(std::function const& f, } -void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { - assert(width % 16 == 0 && "width must be multiple of 16!"); - // allocate space for result - uint8_t* ptr = in; - uint8_t* p11 = ptr + 0 * width; - uint8_t* p12 = ptr + 0 * width + 1; - uint8_t* p13 = ptr + 0 * width + 2; - - uint8_t* p21 = 
ptr + 1 * width; - uint8_t* p22 = ptr + 1 * width + 1; - uint8_t* p23 = ptr + 1 * width + 2; - uint8_t* p31 = ptr + 2 * width; - uint8_t* p32 = ptr + 2 * width + 1; - uint8_t* p33 = ptr + 2 * width + 2; - uint8_t* optr = blurred + 1 * width + 1; - - // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating - // boundary) (unoptimized) - for (int iy = 1; iy < height - 1; iy++) { - for (int ix = 0; ix < width; ix++) { - int res = - (*p11 + *p12 + *p13 + *p21 + *p22 + *p23 + *p31 + *p32 + *p33) / - 9; - *optr = res; - p11++; - p12++; - p13++; - p21++; - p22++; - p23++; - p31++; - p32++; - p33++; - optr++; - } - } -} void gpcFilterNaive(uint8_t* in, const uint8_t* grad, uint32_t* gpc, @@ -191,123 +154,6 @@ void gpcFilterTauNaive(uint8_t* in, j++; } } -void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { - assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - boxNaive(in, blurred, width, height); -#else - auto boxFilterSegment = [&](int start, int end) { - int x, y; - __m128i one_third; - __m128i *dst0, *dst1; - __m128i zero = _mm_setzero_si128(); - - one_third = _mm_set1_epi16( - 21846); // 2^16/3+1. For 16bit ints. 
2^8/3+1=86.33 for 8bit - dst0 = (__m128i*)(blurred + width * (start)); - dst1 = (__m128i*)(blurred + width * (start + 1)); - for (y = start; y < end; - y += 2) { // We compute results for two rows in one iteration - const uint8_t *row0, *row1, *row2, *row3; - - row1 = in + y * width; - row0 = row1 - width; - row2 = row1 + width; - row3 = row2 + width; - - for (x = 0; x < width; x += 16) { - __m128i s00, s01, s02; - __m128i r00, r01, r02; - __m128i ra00, ra01, ra02; - __m128i rb00, rb01, rb02; - - __m128i a00, a01, a02, b00, b01, b02; - - __m128i tmp0, tmp1, res; - - s00 = _mm_loadu_si128((__m128i*)(row0 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row0 + 1)); - s02 = _mm_load_si128((__m128i*)(row0)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - - ra00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - s00 = _mm_loadu_si128((__m128i*)(row1 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row1 + 1)); - s02 = _mm_load_si128((__m128i*)(row1)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - - ra01 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb01 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - s00 = _mm_loadu_si128((__m128i*)(row2 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row2 + 1)); - s02 = _mm_load_si128((__m128i*)(row2)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - - ra02 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb02 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - tmp0 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), - one_third); - tmp1 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), - one_third); - - pack16to8(tmp0, tmp1, res); 
- _mm_store_si128(dst0++, res); - - s00 = _mm_loadu_si128((__m128i*)(row3 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row3 + 1)); - s02 = _mm_load_si128((__m128i*)(row3)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - ra00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - tmp0 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), - one_third); - tmp1 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), - one_third); - - pack16to8(tmp0, tmp1, res); - _mm_store_si128(dst1++, res); - - row0 += 16; - row1 += 16; - row2 += 16; - row3 += 16; - } - // still storing 128bit, but now in 16 x 8bit format, so /16 instead - // of /8 - dst0 += width / 16; - dst1 += width / 16; - } - }; // lambda - - boxFilterSegment(1, height - 3); - // parFor(boxFilterSegment,1,height-3,4); -#endif -} #ifdef _INTRINSICS_SSE diff --git a/lib/gpc/filter.hpp b/lib/gpc/filter.hpp index 2e6454a..d48fd6f 100644 --- a/lib/gpc/filter.hpp +++ b/lib/gpc/filter.hpp @@ -101,16 +101,6 @@ void parFor(std::function const& f, int start, int end, int nThreads); -/** - * @brief Naive 3x3 box filter implementation - * - * @param in input image - * @param blurred The blurred output image - * @param[in] width The width - * @param[in] height The height - * @param[in] numThreads number of threads to use - */ -void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height); /** * @brief Applies a gpc filter defined by the pixel-difference tests in @@ -154,19 +144,6 @@ void gpcFilterTauNaive(uint8_t* in, std::vector& idx, int width, int height); -/** - * @brief boxfilter using SSE2 instructions. Loosely based on - * https://www.ignorantus.com/box_sse2/, published under - * the https://creativecommons.org/publicdomain/zero/1.0/ licence. 
- * - * @param in input image - * @param blurred The blurred - * @param[in] width The width - * @param[in] height The height - * @param[in] numThreads number of threads to use - */ -void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads); - /** * @brief Checks if the 128bits in xmm are all zero * diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp index 8b9d2ef..f8d4e4c 100644 --- a/lib/gpc/forest.cpp +++ b/lib/gpc/forest.cpp @@ -48,6 +48,7 @@ #include "gpc/buffer.hpp" #include "gpc/filter.hpp" #include "gpc/kernels/sobel.hpp" +#include "gpc/kernels/box.hpp" #include "gpc/hashmatch.hpp" #include "gpc/forest.hpp" diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp new file mode 100644 index 0000000..9e444d4 --- /dev/null +++ b/lib/gpc/kernels/box.cpp @@ -0,0 +1,190 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) + +#include "gpc/kernels/box.hpp" +namespace ndb { +void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { + assert(width % 16 == 0 && "width must be multiple of 16!"); + // allocate space for result + uint8_t* ptr = in; + uint8_t* p11 = ptr + 0 * width; + uint8_t* p12 = ptr + 0 * width + 1; + uint8_t* p13 = ptr + 0 * width + 2; + + uint8_t* p21 = ptr + 1 * width; + uint8_t* p22 = ptr + 1 * width + 1; + uint8_t* p23 = ptr + 1 * width + 2; + + uint8_t* p31 = ptr + 2 * width; + uint8_t* p32 = ptr + 2 * width + 1; + uint8_t* p33 = ptr + 2 * width + 2; + uint8_t* optr = blurred + 1 * width + 1; + + // Apply 3x3 box filter to image less pixel border of 1 (to avoid treating + // boundary) (unoptimized) + for (int iy = 1; iy < height - 1; iy++) { + for (int ix = 0; ix < width; ix++) { + int res = + (*p11 + *p12 + *p13 + *p21 + *p22 + *p23 + *p31 + *p32 + *p33) / + 9; + *optr = res; + p11++; + p12++; + p13++; + p21++; + p22++; + p23++; + p31++; + p32++; + p33++; + optr++; + } + } +} +void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { + assert(width % 16 == 0 && "width must be multiple of 16!"); +#ifndef _INTRINSICS_SSE + boxNaive(in, blurred, width, height); +#else + auto boxFilterSegment = [&](int start, int end) { + int x, y; + __m128i one_third; + __m128i *dst0, *dst1; + __m128i zero = _mm_setzero_si128(); + + one_third = _mm_set1_epi16( + 
21846); // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit + dst0 = (__m128i*)(blurred + width * (start)); + dst1 = (__m128i*)(blurred + width * (start + 1)); + for (y = start; y < end; + y += 2) { // We compute results for two rows in one iteration + const uint8_t *row0, *row1, *row2, *row3; + + row1 = in + y * width; + row0 = row1 - width; + row2 = row1 + width; + row3 = row2 + width; + + for (x = 0; x < width; x += 16) { + __m128i s00, s01, s02; + __m128i r00, r01, r02; + __m128i ra00, ra01, ra02; + __m128i rb00, rb01, rb02; + + __m128i a00, a01, a02, b00, b01, b02; + + __m128i tmp0, tmp1, res; + + s00 = _mm_loadu_si128((__m128i*)(row0 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row0 + 1)); + s02 = _mm_load_si128((__m128i*)(row0)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + + ra00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + s00 = _mm_loadu_si128((__m128i*)(row1 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row1 + 1)); + s02 = _mm_load_si128((__m128i*)(row1)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + + ra01 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb01 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + s00 = _mm_loadu_si128((__m128i*)(row2 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row2 + 1)); + s02 = _mm_load_si128((__m128i*)(row2)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + + ra02 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb02 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + tmp0 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), + one_third); + tmp1 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), + 
one_third); + + pack16to8(tmp0, tmp1, res); + _mm_store_si128(dst0++, res); + + s00 = _mm_loadu_si128((__m128i*)(row3 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row3 + 1)); + s02 = _mm_load_si128((__m128i*)(row3)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + ra00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + tmp0 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), + one_third); + tmp1 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), + one_third); + + pack16to8(tmp0, tmp1, res); + _mm_store_si128(dst1++, res); + + row0 += 16; + row1 += 16; + row2 += 16; + row3 += 16; + } + // still storing 128bit, but now in 16 x 8bit format, so /16 instead + // of /8 + dst0 += width / 16; + dst1 += width / 16; + } + }; // lambda + + boxFilterSegment(1, height - 3); + // parFor(boxFilterSegment,1,height-3,4); +#endif +} + +} diff --git a/lib/gpc/kernels/box.hpp b/lib/gpc/kernels/box.hpp new file mode 100644 index 0000000..bf9eea3 --- /dev/null +++ b/lib/gpc/kernels/box.hpp @@ -0,0 +1,65 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) + +#ifndef __NDB__KERNEL_BOX +#define __NDB__KERNEL_BOX +using namespace std; + +#include "gpc/buffer.hpp" + +namespace ndb { +/** + * @brief Naive 3x3 box filter implementation + * + * @param in input image + * @param blurred The blurred output image + * @param[in] width The width + * @param[in] height The height + * @param[in] numThreads number of threads to use + */ +void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height); + +/** + * @brief boxfilter using SSE2 instructions. Loosely based on + * https://www.ignorantus.com/box_sse2/, published under + * the https://creativecommons.org/publicdomain/zero/1.0/ licence. 
+ * + * @param in input image + * @param blurred The blurred + * @param[in] width The width + * @param[in] height The height + * @param[in] numThreads number of threads to use + */ +void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads); + + +} +#endif From fb4d6261580b40011c585c769ca116fe2fc8d8f2 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 15 Feb 2026 09:31:18 +0100 Subject: [PATCH 09/36] break out census filter --- CMakeLists.txt | 1 + lib/gpc/filter.cpp | 169 ------------------------------ lib/gpc/filter.hpp | 21 ---- lib/gpc/kernels/census.cpp | 204 +++++++++++++++++++++++++++++++++++++ lib/gpc/kernels/census.hpp | 61 +++++++++++ lib/gpc/kernels/sobel.hpp | 2 - 6 files changed, 266 insertions(+), 192 deletions(-) create mode 100644 lib/gpc/kernels/census.cpp create mode 100644 lib/gpc/kernels/census.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index d7f1d1a..e4af04f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -44,6 +44,7 @@ add_library(gpc_core lib/gpc/filter.cpp lib/gpc/kernels/sobel.cpp lib/gpc/kernels/box.cpp + lib/gpc/kernels/census.cpp ) target_link_libraries(gpc_core PUBLIC diff --git a/lib/gpc/filter.cpp b/lib/gpc/filter.cpp index 713d0d9..fa39340 100644 --- a/lib/gpc/filter.cpp +++ b/lib/gpc/filter.cpp @@ -312,175 +312,6 @@ void gpcFilterTau(uint8_t* in, parFor(gpcFilterSegment, 13, height - 15, 4); #endif } -void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) { - uint32_t val; - uint32_t* dst; - for (int y = 2; y < height - 3; y++) { - for (int x = 0; x < width; x++) { - val = 0; - dst = census + y * width + x; - int i = 0; - // patch loops - for (int px = -2; px <= 2; px++) { - for (int py = -2; py <= 2; py++) { - if (!(px == 0 && py == 0)) { - val |= (in[(y + py) * width + (x + px)] > - in[y * width + x]) - ? 
(1 << i) - : 0; - i++; - } - } - } // End patch loops - *dst = val; - } - } // End pixel loops -} -void census5x5(uint8_t* in, uint32_t* census, int width, int height) { - assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - census5x5Naive(in, census, width, height); -#else - __m128i zero = _mm_set1_epi8(0); - __m128i one = _mm_set1_epi8(1); - - for (int y = 2; y < height - 3; y++) { - for (int x = 0; x < width; x += 16) { - uint8_t* rowPtr; - rowPtr = in + (y - 2) * width + x; - __m128i center = _mm_lddqu_si128((__m128i*)(in + y * width + x)); - __m128i* dst = (__m128i*)(census + y * width + - x); // Set starting point to pixel (2,2) - // row 0 - __m128i bitMask = one; - __m128i byte1 = _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 2 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask += bitMask; // 4 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), - bitMask); - bitMask += bitMask; // 8 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 16 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - // row 1 - rowPtr += width; - bitMask += bitMask; // 32 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 64 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask += bitMask; // 128 - byte1 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), - bitMask); - bitMask = one; // 1 - __m128i byte2 = _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 2 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, 
_mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - - // row 2 - rowPtr += width; - bitMask += bitMask; // 4 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 8 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask += bitMask; // 16 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 32 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - - // row 3 - rowPtr += width; - bitMask += bitMask; // 64 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 128 - byte2 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask = one; // 1 - __m128i byte3 = _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), - bitMask); - bitMask += bitMask; // 2 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 4 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - bitMask); - - // row 4 - rowPtr += width; - bitMask += bitMask; // 8 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), - bitMask); - bitMask += bitMask; // 16 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), - bitMask); - bitMask += bitMask; // 32 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), - bitMask); - bitMask += bitMask; // 64 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), - bitMask); - bitMask += bitMask; // 128 - byte3 |= _mm_and_si128( - _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), - 
bitMask); - - // 8bit to 16bit - __m128i high1 = _mm_unpacklo_epi8(byte3, zero); - __m128i high2 = _mm_unpackhi_epi8(byte3, zero); - __m128i low1 = _mm_unpacklo_epi8(byte1, byte2); - __m128i low2 = _mm_unpackhi_epi8(byte1, byte2); - - // 16bit to 32bit ints - _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); - _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); - _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); - _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); - - } // col iteration - } // row iteration - // if(numThreads == 1) - // gpcFilterSegment(13,height-15); - // else - // parFor(gpcFilterSegment,13,height-15,4); - -#endif -} // census5x5 } // namespace ndb #endif diff --git a/lib/gpc/filter.hpp b/lib/gpc/filter.hpp index d48fd6f..35f1325 100644 --- a/lib/gpc/filter.hpp +++ b/lib/gpc/filter.hpp @@ -200,26 +200,5 @@ void gpcFilterTau(uint8_t* in, int width, int height, int numThreads); -/** - * @brief Naive version of 5x5 census transoform - * - * @param in Input image - * @param census 32bit census transform output - * @param width Width of the image at *in pointer - * @param height Heiht of the image at *in pointer - */ -void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height); - - -/** - * @brief 5x5 dense census transform of input image. binary codes are returned - * as a 32bit image - * - * @param in - * @param census - * @param width - * @param height - */ -void census5x5(uint8_t* in, uint32_t* census, int width, int height); } // namespace ndb #endif diff --git a/lib/gpc/kernels/census.cpp b/lib/gpc/kernels/census.cpp new file mode 100644 index 0000000..bd70613 --- /dev/null +++ b/lib/gpc/kernels/census.cpp @@ -0,0 +1,204 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. 
Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) + +#include "gpc/kernels/census.hpp" +void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) { + uint32_t val; + uint32_t* dst; + for (int y = 2; y < height - 3; y++) { + for (int x = 0; x < width; x++) { + val = 0; + dst = census + y * width + x; + int i = 0; + // patch loops + for (int px = -2; px <= 2; px++) { + for (int py = -2; py <= 2; py++) { + if (!(px == 0 && py == 0)) { + val |= (in[(y + py) * width + (x + px)] > + in[y * width + x]) + ? 
(1 << i) + : 0; + i++; + } + } + } // End patch loops + *dst = val; + } + } // End pixel loops +} +void census5x5(uint8_t* in, uint32_t* census, int width, int height) { + assert(width % 16 == 0 && "width must be multiple of 16!"); +#ifndef _INTRINSICS_SSE + census5x5Naive(in, census, width, height); +#else + __m128i zero = _mm_set1_epi8(0); + __m128i one = _mm_set1_epi8(1); + + for (int y = 2; y < height - 3; y++) { + for (int x = 0; x < width; x += 16) { + uint8_t* rowPtr; + rowPtr = in + (y - 2) * width + x; + __m128i center = _mm_lddqu_si128((__m128i*)(in + y * width + x)); + __m128i* dst = (__m128i*)(census + y * width + + x); // Set starting point to pixel (2,2) + // row 0 + __m128i bitMask = one; + __m128i byte1 = _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 2 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask += bitMask; // 4 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), + bitMask); + bitMask += bitMask; // 8 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 16 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), + bitMask); + + // row 1 + rowPtr += width; + bitMask += bitMask; // 32 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 64 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask += bitMask; // 128 + byte1 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), + bitMask); + bitMask = one; // 1 + __m128i byte2 = _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 2 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, 
_mm_lddqu_si128((__m128i*)(rowPtr + 2))), + bitMask); + + // row 2 + rowPtr += width; + bitMask += bitMask; // 4 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 8 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask += bitMask; // 16 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 32 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), + bitMask); + + // row 3 + rowPtr += width; + bitMask += bitMask; // 64 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 128 + byte2 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask = one; // 1 + __m128i byte3 = _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), + bitMask); + bitMask += bitMask; // 2 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 4 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), + bitMask); + + // row 4 + rowPtr += width; + bitMask += bitMask; // 8 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 2))), + bitMask); + bitMask += bitMask; // 16 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr - 1))), + bitMask); + bitMask += bitMask; // 32 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr))), + bitMask); + bitMask += bitMask; // 64 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 1))), + bitMask); + bitMask += bitMask; // 128 + byte3 |= _mm_and_si128( + _mm_cmplt_epu8(center, _mm_lddqu_si128((__m128i*)(rowPtr + 2))), + 
bitMask); + + // 8bit to 16bit + __m128i high1 = _mm_unpacklo_epi8(byte3, zero); + __m128i high2 = _mm_unpackhi_epi8(byte3, zero); + __m128i low1 = _mm_unpacklo_epi8(byte1, byte2); + __m128i low2 = _mm_unpackhi_epi8(byte1, byte2); + + // 16bit to 32bit ints + _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); + _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); + _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); + _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); + + } // col iteration + } // row iteration + // if(numThreads == 1) + // gpcFilterSegment(13,height-15); + // else + // parFor(gpcFilterSegment,13,height-15,4); + +#endif +} // census5x5 + + diff --git a/lib/gpc/kernels/census.hpp b/lib/gpc/kernels/census.hpp new file mode 100644 index 0000000..8353a4e --- /dev/null +++ b/lib/gpc/kernels/census.hpp @@ -0,0 +1,61 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) + +#ifndef __NDB__KERNEL_CENSUS +#define __NDB__KERNEL_CENSUS + +#include "gpc/buffer.hpp" + +namespace ndb { +/** + * @brief Naive version of 5x5 census transoform + * + * @param in Input image + * @param census 32bit census transform output + * @param width Width of the image at *in pointer + * @param height Heiht of the image at *in pointer + */ +void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height); + + +/** + * @brief 5x5 dense census transform of input image. 
binary codes are returned + * as a 32bit image + * + * @param in + * @param census + * @param width + * @param height + */ +void census5x5(uint8_t* in, uint32_t* census, int width, int height); + +} +#endif diff --git a/lib/gpc/kernels/sobel.hpp b/lib/gpc/kernels/sobel.hpp index 038408a..31749cb 100644 --- a/lib/gpc/kernels/sobel.hpp +++ b/lib/gpc/kernels/sobel.hpp @@ -31,8 +31,6 @@ #ifndef __NDB__KERNEL_SOBEL #define __NDB__KERNEL_SOBEL -using namespace std; - #include "gpc/buffer.hpp" namespace ndb { From 7d0ce8d504c36b97fa7b6f32b957447527aa9dd1 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 15 Feb 2026 09:51:13 +0100 Subject: [PATCH 10/36] extract gpc filter, move utils --- CMakeLists.txt | 3 +- lib/gpc/Feature.hpp | 1 - lib/gpc/feature.cpp | 1 - lib/gpc/forest.cpp | 3 +- lib/gpc/forest.hpp | 1 - lib/gpc/{filter.cpp => kernels/gpc.cpp} | 82 +------------ lib/gpc/{filter.hpp => kernels/gpc.hpp} | 157 +++++++----------------- lib/gpc/kernels/utils.cpp | 109 ++++++++++++++++ lib/gpc/kernels/utils.hpp | 106 ++++++++++++++++ lib/gpc/training.hpp | 1 - 10 files changed, 268 insertions(+), 196 deletions(-) rename lib/gpc/{filter.cpp => kernels/gpc.cpp} (81%) rename lib/gpc/{filter.hpp => kernels/gpc.hpp} (68%) create mode 100644 lib/gpc/kernels/utils.cpp create mode 100644 lib/gpc/kernels/utils.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e4af04f..c0fe739 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,10 +41,11 @@ add_library(gpc_core lib/gpc/forest.cpp lib/gpc/fern.cpp lib/gpc/feature.cpp - lib/gpc/filter.cpp lib/gpc/kernels/sobel.cpp lib/gpc/kernels/box.cpp lib/gpc/kernels/census.cpp + lib/gpc/kernels/gpc.cpp + lib/gpc/kernels/utils.cpp ) target_link_libraries(gpc_core PUBLIC diff --git a/lib/gpc/Feature.hpp b/lib/gpc/Feature.hpp index 82aff06..8a8b55c 100644 --- a/lib/gpc/Feature.hpp +++ b/lib/gpc/Feature.hpp @@ -39,7 +39,6 @@ #include //for log2 #include #include -#include #include #include #include diff --git a/lib/gpc/feature.cpp 
b/lib/gpc/feature.cpp index 0ffc27b..529970c 100644 --- a/lib/gpc/feature.cpp +++ b/lib/gpc/feature.cpp @@ -37,7 +37,6 @@ #include //for log2 #include #include -#include #include #include #include diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp index f8d4e4c..52046f2 100644 --- a/lib/gpc/forest.cpp +++ b/lib/gpc/forest.cpp @@ -46,9 +46,10 @@ #include "gpc/SintelOpticalFlow.hpp" #include "gpc/SintelStereo.hpp" #include "gpc/buffer.hpp" -#include "gpc/filter.hpp" #include "gpc/kernels/sobel.hpp" #include "gpc/kernels/box.hpp" +#include "gpc/kernels/gpc.hpp" +#include "gpc/kernels/utils.hpp" #include "gpc/hashmatch.hpp" #include "gpc/forest.hpp" diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp index 87939c1..d0b5c32 100644 --- a/lib/gpc/forest.hpp +++ b/lib/gpc/forest.hpp @@ -48,7 +48,6 @@ #include "gpc/SintelOpticalFlow.hpp" #include "gpc/SintelStereo.hpp" #include "gpc/buffer.hpp" -#include "gpc/filter.hpp" #include "gpc/hashmatch.hpp" /** diff --git a/lib/gpc/filter.cpp b/lib/gpc/kernels/gpc.cpp similarity index 81% rename from lib/gpc/filter.cpp rename to lib/gpc/kernels/gpc.cpp index fa39340..636a9cb 100644 --- a/lib/gpc/filter.cpp +++ b/lib/gpc/kernels/gpc.cpp @@ -28,86 +28,9 @@ // POSSIBILITY OF SUCH DAMAGE. // // Code Author: Niklaus Bamert (bamertn@ethz.ch) -#ifndef __NDB__FILTER -#define __NDB__FILTER - -#include -#include - -#include "gpc/filter.hpp" -using namespace std; +#include "gpc/kernels/gpc.hpp" namespace ndb { -void arr2ind(const unsigned char* a, - int n, - int* ind, - int* m) { -#ifdef _INTRINSICS_SSE - int i, m0, k; - __m256i msk; - m0 = 0; - for (i = 0; i < n; i = i + 32) { /* Load 32 bytes and compare with zero: */ - msk = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i*)&a[i]), - _mm256_setzero_si256()); - k = _mm256_movemask_epi8(msk); - k = ~k; /* Search for nonzero bits instead of zero bits. */ - while (k) { - ind[m0] = - i + _tzcnt_u32( - k); /* Count the number of trailing zero bits in k. 
*/ - m0++; - k = _blsr_u32(k); /* Clear the lowest set bit in k. */ - } - } - *m = m0; -#else - int nnz = 0; - for (int i = 0; i < n; i++) { - if (a[i] != 0) { - nnz++; - *ind = i; - ind++; - } - } - *m = nnz; -#endif -} -#ifdef _INTRINSICS_SSE -void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) { - __m128i zero = _mm_setzero_si128(); - y0 = _mm_unpacklo_epi8(x, zero); - y1 = _mm_unpackhi_epi8(x, zero); -} -void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) { - y = _mm_packus_epi16(x0, x1); -} - -#endif -void parFor(std::function const& f, - int start, - int end, - int nThreads) { - // Range definition - // quantities derived from range - int segSize = (end - start) / nThreads; - int lastSeg = (end - start) % nThreads; - - std::vector threads; - threads.reserve(nThreads); - - // Spawn threads - for (int t = 0; t < nThreads - 1; t++) { - threads.emplace_back(f, start + t * segSize, start + (t + 1) * segSize); - } - threads.emplace_back(f, - start + (nThreads - 1) * segSize, - start + (nThreads)*segSize + lastSeg); - // Join - for (auto& t : threads) t.join(); -} - - - void gpcFilterNaive(uint8_t* in, const uint8_t* grad, uint32_t* gpc, @@ -312,6 +235,5 @@ void gpcFilterTau(uint8_t* in, parFor(gpcFilterSegment, 13, height - 15, 4); #endif } +} -} // namespace ndb -#endif diff --git a/lib/gpc/filter.hpp b/lib/gpc/kernels/gpc.hpp similarity index 68% rename from lib/gpc/filter.hpp rename to lib/gpc/kernels/gpc.hpp index 35f1325..5f43743 100644 --- a/lib/gpc/filter.hpp +++ b/lib/gpc/kernels/gpc.hpp @@ -28,79 +28,38 @@ // POSSIBILITY OF SUCH DAMAGE. 
// // Code Author: Niklaus Bamert (bamertn@ethz.ch) -#ifndef __NDB__FILTER -#define __NDB__FILTER -#include -#include +#ifndef __NDB__KERNEL_GPC +#define __NDB__KERNEL_GPC +using namespace std; #include "gpc/buffer.hpp" -using namespace std; -#ifdef _INTRINSICS_SSE -#include -// greater and lesser than simd ops for unsigned 8bit integer (epu8) -#define _mm_cmpgt_epu8(v0, v1) \ - _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \ - _mm_xor_si128(v1, _mm_set1_epi8(-128))) -#define _mm_cmplt_epu8(v1, v0) \ - _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \ - _mm_xor_si128(v1, _mm_set1_epi8(-128))) -#endif namespace ndb { /** - * @brief Gets indices of non-zero values in array a. - * Credits: - * https://stackoverflow.com/questions/18971401/sparse-array-compression-using-simd-avx2/41958528#41958528 - * - * @param input array - * @param n number of input elements - * @param ind output array (indices into n of nonzero elements) - * @param m number of elements in output - */ -void arr2ind(const unsigned char* a, - int n, - int* ind, - int* m); - -#ifdef _INTRINSICS_SSE -/** - * @brief Unpacks 16x8bit from a 128bit simd var into 2x128bit vars - * (8x16bit) + * @brief Applies a gpc filter defined by the pixel-difference tests in + * fastmask. Accelerated with SSE. * - * @param[in] x the 128 bit vector to be unpacked - * @param y0 The y 0 - * @param y1 The y 1 + * @param in The input image. + * @param grad The gradient image, such that we can skip non-gradient + * pixels + * @param gpc The output image of 32bit codes + * @param fastmask The fastmask containing the gpc filter + * @param idx The gradient indices. Only used if no intrincs are available + * and the call gets forwarded to the naive implementation. 
+ * @param width The width of the image at pointer *in + * @param height The height of the image at pointer *in + * @param numThreadsNumber of threads to use */ -void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1); +void gpcFilter(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector& idx, + int width, + int height, + int numThreads); -/** - * @brief Packs 2x128bit vars with 16bit values(where 8 upper bits are - * zero) into 1x128bit with 8bit values - * - * @param[in] x0 The x 0 - * @param[in] x1 The x 1 - * @param y the packed vector - */ -void pack16to8(const __m128i x0, const __m128i x1, __m128i& y); -#endif -/** - * @brief Calls a given functional f with subranges based on the given start - * and end indices. Here the functional is assumed to take two integer - * arguments indicating their respective start and end ranges. - * nThreads determines the number of threads the given range shall be - * split into. The range is inclusive on the lower bound and exclusive on the - * upper bound, i.e. [start,end) - * - * @param f function object (e.g. a lambda functional) - * @param start start of the range - * @param end end of the range - * @param nThreads number of threads to use - */ -void parFor(std::function const& f, - int start, - int end, - int nThreads); /** * @brief Applies a gpc filter defined by the pixel-difference tests in @@ -123,6 +82,28 @@ void gpcFilterNaive(uint8_t* in, std::vector& idx, int width, int height); +/** + * @brief Applies a gpc filter defined by the pixel-difference tests in + * fastmask. Additionally uses a threshold vector (tau) + * + * @param in The input image. 
+ * @param grad The gradient image, such that we can skip non-gradient + * pixels + * @param gpc The output image of 32bit codes + * @param fastmask The fastmask containing the gpc filter + * @param width The width of the image at pointer *in + * @param height The height of the image at pointer *in + * @param numThreads Number of threads to use + */ +void gpcFilterTau(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector tau, + std::vector& idx, + int width, + int height, + int numThreads); /** * @brief Applies a gpc filter defined by the pixel-difference tests in @@ -154,51 +135,7 @@ void gpcFilterTauNaive(uint8_t* in, #ifdef _INTRINSICS_SSE bool isAllZeros(__m128i xmm); #endif -/** - * @brief Applies a gpc filter defined by the pixel-difference tests in - * fastmask. Accelerated with SSE. - * - * @param in The input image. - * @param grad The gradient image, such that we can skip non-gradient - * pixels - * @param gpc The output image of 32bit codes - * @param fastmask The fastmask containing the gpc filter - * @param idx The gradient indices. Only used if no intrincs are available - * and the call gets forwarded to the naive implementation. - * @param width The width of the image at pointer *in - * @param height The height of the image at pointer *in - * @param numThreadsNumber of threads to use - */ -void gpcFilter(uint8_t* in, - const uint8_t* grad, - uint32_t* gpc, - std::vector fastmask, - std::vector& idx, - int width, - int height, - int numThreads); -/** - * @brief Applies a gpc filter defined by the pixel-difference tests in - * fastmask. Additionally uses a threshold vector (tau) - * - * @param in The input image. 
- * @param grad The gradient image, such that we can skip non-gradient - * pixels - * @param gpc The output image of 32bit codes - * @param fastmask The fastmask containing the gpc filter - * @param width The width of the image at pointer *in - * @param height The height of the image at pointer *in - * @param numThreads Number of threads to use - */ -void gpcFilterTau(uint8_t* in, - const uint8_t* grad, - uint32_t* gpc, - std::vector fastmask, - std::vector tau, - std::vector& idx, - int width, - int height, - int numThreads); -} // namespace ndb + +} #endif diff --git a/lib/gpc/kernels/utils.cpp b/lib/gpc/kernels/utils.cpp new file mode 100644 index 0000000..dd5d146 --- /dev/null +++ b/lib/gpc/kernels/utils.cpp @@ -0,0 +1,109 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) +#include +#include + +using namespace std; + +namespace ndb { +void arr2ind(const unsigned char* a, + int n, + int* ind, + int* m) { +#ifdef _INTRINSICS_SSE + int i, m0, k; + __m256i msk; + m0 = 0; + for (i = 0; i < n; i = i + 32) { /* Load 32 bytes and compare with zero: */ + msk = _mm256_cmpeq_epi8(_mm256_load_si256((__m256i*)&a[i]), + _mm256_setzero_si256()); + k = _mm256_movemask_epi8(msk); + k = ~k; /* Search for nonzero bits instead of zero bits. */ + while (k) { + ind[m0] = + i + _tzcnt_u32( + k); /* Count the number of trailing zero bits in k. */ + m0++; + k = _blsr_u32(k); /* Clear the lowest set bit in k. 
*/ + } + } + *m = m0; +#else + int nnz = 0; + for (int i = 0; i < n; i++) { + if (a[i] != 0) { + nnz++; + *ind = i; + ind++; + } + } + *m = nnz; +#endif +} +#ifdef _INTRINSICS_SSE +void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) { + __m128i zero = _mm_setzero_si128(); + y0 = _mm_unpacklo_epi8(x, zero); + y1 = _mm_unpackhi_epi8(x, zero); +} +void pack16to8(const __m128i x0, const __m128i x1, __m128i& y) { + y = _mm_packus_epi16(x0, x1); +} + +#endif +void parFor(std::function const& f, + int start, + int end, + int nThreads) { + // Range definition + // quantities derived from range + int segSize = (end - start) / nThreads; + int lastSeg = (end - start) % nThreads; + + std::vector threads; + threads.reserve(nThreads); + + // Spawn threads + for (int t = 0; t < nThreads - 1; t++) { + threads.emplace_back(f, start + t * segSize, start + (t + 1) * segSize); + } + threads.emplace_back(f, + start + (nThreads - 1) * segSize, + start + (nThreads)*segSize + lastSeg); + // Join + for (auto& t : threads) t.join(); +} + + + + + +} // namespace ndb diff --git a/lib/gpc/kernels/utils.hpp b/lib/gpc/kernels/utils.hpp new file mode 100644 index 0000000..e9ce569 --- /dev/null +++ b/lib/gpc/kernels/utils.hpp @@ -0,0 +1,106 @@ +// Copyright (c) 2018, ETH Zurich +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are met: +// +// 1. Redistributions of source code must retain the above copyright notice, +// this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright notice, +// this list of conditions and the following disclaimer in the documentation +// and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the copyright holder nor the names of its contributors +// may be used to endorse or promote products derived from this software without +// specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +// POSSIBILITY OF SUCH DAMAGE. +// +// Code Author: Niklaus Bamert (bamertn@ethz.ch) +#ifndef __NDB__KERNEL_UTILS +#define __NDB__KERNEL_UTILS + +#include +#include + +#include "gpc/buffer.hpp" +using namespace std; + +#ifdef _INTRINSICS_SSE +#include +// greater and lesser than simd ops for unsigned 8bit integer (epu8) +#define _mm_cmpgt_epu8(v0, v1) \ + _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \ + _mm_xor_si128(v1, _mm_set1_epi8(-128))) +#define _mm_cmplt_epu8(v1, v0) \ + _mm_cmpgt_epi8(_mm_xor_si128(v0, _mm_set1_epi8(-128)), \ + _mm_xor_si128(v1, _mm_set1_epi8(-128))) +#endif +namespace ndb { +/** + * @brief Gets indices of non-zero values in array a. 
+ * Credits: + * https://stackoverflow.com/questions/18971401/sparse-array-compression-using-simd-avx2/41958528#41958528 + * + * @param input array + * @param n number of input elements + * @param ind output array (indices into n of nonzero elements) + * @param m number of elements in output + */ +void arr2ind(const unsigned char* a, + int n, + int* ind, + int* m); + +#ifdef _INTRINSICS_SSE +/** + * @brief Unpacks 16x8bit from a 128bit simd var into 2x128bit vars + * (8x16bit) + * + * @param[in] x the 128 bit vector to be unpacked + * @param y0 The y 0 + * @param y1 The y 1 + */ +void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1); + +/** + * @brief Packs 2x128bit vars with 16bit values(where 8 upper bits are + * zero) into 1x128bit with 8bit values + * + * @param[in] x0 The x 0 + * @param[in] x1 The x 1 + * @param y the packed vector + */ +void pack16to8(const __m128i x0, const __m128i x1, __m128i& y); +#endif +/** + * @brief Calls a given functional f with subranges based on the given start + * and end indices. Here the functional is assumed to take two integer + * arguments indicating their respective start and end ranges. + * nThreads determines the number of threads the given range shall be + * split into. The range is inclusive on the lower bound and exclusive on the + * upper bound, i.e. [start,end) + * + * @param f function object (e.g. 
a lambda functional) + * @param start start of the range + * @param end end of the range + * @param nThreads number of threads to use + */ +void parFor(std::function<void(int, int)> const& f, + int start, + int end, + int nThreads); + +} // namespace ndb +#endif diff --git a/lib/gpc/training.hpp b/lib/gpc/training.hpp index f1e398b..8d557cc 100644 --- a/lib/gpc/training.hpp +++ b/lib/gpc/training.hpp @@ -49,7 +49,6 @@ #include "gpc/SintelOpticalFlow.hpp" #include "gpc/SintelStereo.hpp" #include "gpc/buffer.hpp" -#include "gpc/filter.hpp" #include "gpc/hashmatch.hpp" namespace gpc { From f0ed437fae3e5a9a116eb4eb2968a21abffc99e5 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 15 Feb 2026 10:07:51 +0100 Subject: [PATCH 11/36] add highway --- CMakeLists.txt | 10 ++++++++++ samples/sparsematch.cpp | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c0fe739..4e01b6e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,15 @@ if(SSE) add_compile_options(-mavx2 -march=core-avx2) endif() endif() +include(FetchContent) +set(HWY_ENABLE_TESTS OFF CACHE BOOL "Disable Highway tests" FORCE) +set(HWY_ENABLE_EXAMPLES OFF CACHE BOOL "Disable Highway examples" FORCE) +FetchContent_Declare( + highway + GIT_REPOSITORY https://github.com/google/highway.git + GIT_TAG 1.3.0 +) +FetchContent_MakeAvailable(highway) add_library(gpc_core lib/gpc/forest.cpp @@ -52,6 +61,7 @@ target_link_libraries(gpc_core Eigen3::Eigen ${PNG_LIBRARIES} Threads::Threads + hwy ) target_include_directories(gpc_core PUBLIC lib) enable_testing() diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp index 3d5f19b..2554271 100644 --- a/samples/sparsematch.cpp +++ b/samples/sparsematch.cpp @@ -1,7 +1,26 @@ #include +#include <hwy/highway.h> #include "gpc/forest.hpp" using namespace std; +void test_hwy_neon() { + namespace hn = hwy::HWY_NAMESPACE; + + // d is a "descriptor" for a vector of 8-bit unsigned ints + const hn::ScalableTag<uint8_t> d; + + // If this is NEON,
hn::Lanes(d) will be 16 + size_t lanes = hn::Lanes(d); + + auto v1 = hn::Set(d, 10); + auto v2 = hn::Set(d, 20); + auto res = hn::Add(v1, v2); // res lanes all contain 30 + + std::cout << "--- Highway Status ---" << std::endl; + std::cout << "Target: " << hwy::TargetName(hwy::SupportedTargets()) << std::endl; + std::cout << "Vector lanes (uint8): " << lanes << std::endl; + std::cout << "----------------------" << std::endl; +} int main(int argc, char** argv) { std::string forestPath = "../../forests/defaultZeroForest.txt"; std::string leftImgPath = "../../data/kitti/training/image_0/000000_10.png"; @@ -72,4 +91,5 @@ int main(int argc, char** argv) { ndb::Buffer renderDisp; renderDisp = ndb::getDisparityVisualization(simg, supp); renderDisp.writePNGRGB("disparity.png"); + test_hwy_neon(); } From 06bda6e1b9019ccf69d45ca7d45b19c144195258 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 15 Feb 2026 13:44:50 +0100 Subject: [PATCH 12/36] add highway implementation of box filter --- CMakeLists.txt | 1 + lib/gpc/forest.cpp | 3 + lib/gpc/kernels/box.cpp | 214 +++++++++++++++++------------------- lib/gpc/kernels/box.hpp | 4 + lib/gpc/kernels/box_hwy.cpp | 146 ++++++++++++++++++++++++ 5 files changed, 255 insertions(+), 113 deletions(-) create mode 100644 lib/gpc/kernels/box_hwy.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 4e01b6e..56c5314 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,7 @@ add_library(gpc_core lib/gpc/kernels/census.cpp lib/gpc/kernels/gpc.cpp lib/gpc/kernels/utils.cpp + lib/gpc/kernels/box_hwy.cpp ) target_link_libraries(gpc_core PUBLIC diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp index 52046f2..5ba9f38 100644 --- a/lib/gpc/forest.cpp +++ b/lib/gpc/forest.cpp @@ -212,11 +212,14 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer& img, ndb::Buffer smooth(img.rows(), img.cols()); smooth.width = img.width; + gpc::inference::time_point t0 = gpc::inference::sysTick(); ndb::box(img.data(), smooth.data(), img.cols(), 
img.rows(), settings.numThreads_); + gpc::inference::time_point t1 = gpc::inference::sysTick(); + cout << "box: " << gpc::inference::tickToMs(t1, t0) << " ms" << endl; smooth.clearBoundary(); ndb::Buffer grad(img.rows(), img.cols()); grad.width = img.width; diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp index 9e444d4..10215d4 100644 --- a/lib/gpc/kernels/box.cpp +++ b/lib/gpc/kernels/box.cpp @@ -31,6 +31,9 @@ #include "gpc/kernels/box.hpp" namespace ndb { +namespace testing { + void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height); +} void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { assert(width % 16 == 0 && "width must be multiple of 16!"); // allocate space for result @@ -69,122 +72,107 @@ void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { } } } +#ifdef _INTRINSICS_SSE +/** + * @brief SSE implementation of the 3x3 box filter. + * Processed two rows at a time using fixed-point multiplication for division. + */ +#include +void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) { + int start = 1; + int end = height - 3; + + int x, y; + __m128i one_third = _mm_set1_epi16(21846); // 2^16/3 + 1 + + __m128i *dst0 = (__m128i*)(blurred + width * start); + __m128i *dst1 = (__m128i*)(blurred + width * (start + 1)); + + for (y = start; y < end; y += 2) { + const uint8_t *row0, *row1, *row2, *row3; + + row1 = in + y * width; + row0 = row1 - width; + row2 = row1 + width; + row3 = row2 + width; + + for (x = 0; x < width; x += 16) { + __m128i s00, s01, s02; + __m128i ra00, ra01, ra02, rb00, rb01, rb02; + __m128i a00, a01, a02, b00, b01, b02; + __m128i tmp0, tmp1, res; + + // Row 0 Processing + s00 = _mm_loadu_si128((__m128i*)(row0 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row0 + 1)); + s02 = _mm_load_si128((__m128i*)(row0)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + ra00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + 
rb00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + // Row 1 Processing + s00 = _mm_loadu_si128((__m128i*)(row1 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row1 + 1)); + s02 = _mm_load_si128((__m128i*)(row1)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + ra01 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb01 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + // Row 2 Processing + s00 = _mm_loadu_si128((__m128i*)(row2 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row2 + 1)); + s02 = _mm_load_si128((__m128i*)(row2)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + ra02 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb02 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + // Accumulate rows 0, 1, 2 for dst0 + tmp0 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), one_third); + tmp1 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), one_third); + pack16to8(tmp0, tmp1, res); + _mm_store_si128(dst0++, res); + + // Row 3 Processing + s00 = _mm_loadu_si128((__m128i*)(row3 - 1)); + s01 = _mm_loadu_si128((__m128i*)(row3 + 1)); + s02 = _mm_load_si128((__m128i*)(row3)); + unpack8to16(s00, a00, b00); + unpack8to16(s01, a01, b01); + unpack8to16(s02, a02, b02); + ra00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + + // Accumulate rows 1, 2, 3 for dst1 + tmp0 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(ra01, ra02), ra00), one_third); + tmp1 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(rb01, rb02), rb00), one_third); + pack16to8(tmp0, tmp1, res); + _mm_store_si128(dst1++, res); + + row0 += 16; row1 += 16; row2 += 16; row3 += 16; + } + dst0 += width / 16; + dst1 += width / 16; + } +} 
+#endif void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - boxNaive(in, blurred, width, height); +#if defined(__ARM_NEON) || defined(__aarch64__) + // Force use of our new Highway kernel on Mac + testing::box_hwy(in, blurred, width, height); #else - auto boxFilterSegment = [&](int start, int end) { - int x, y; - __m128i one_third; - __m128i *dst0, *dst1; - __m128i zero = _mm_setzero_si128(); - - one_third = _mm_set1_epi16( - 21846); // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit - dst0 = (__m128i*)(blurred + width * (start)); - dst1 = (__m128i*)(blurred + width * (start + 1)); - for (y = start; y < end; - y += 2) { // We compute results for two rows in one iteration - const uint8_t *row0, *row1, *row2, *row3; - - row1 = in + y * width; - row0 = row1 - width; - row2 = row1 + width; - row3 = row2 + width; - - for (x = 0; x < width; x += 16) { - __m128i s00, s01, s02; - __m128i r00, r01, r02; - __m128i ra00, ra01, ra02; - __m128i rb00, rb01, rb02; - - __m128i a00, a01, a02, b00, b01, b02; - - __m128i tmp0, tmp1, res; - - s00 = _mm_loadu_si128((__m128i*)(row0 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row0 + 1)); - s02 = _mm_load_si128((__m128i*)(row0)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - - ra00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - s00 = _mm_loadu_si128((__m128i*)(row1 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row1 + 1)); - s02 = _mm_load_si128((__m128i*)(row1)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - - ra01 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb01 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - s00 = 
_mm_loadu_si128((__m128i*)(row2 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row2 + 1)); - s02 = _mm_load_si128((__m128i*)(row2)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - - ra02 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb02 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - tmp0 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), - one_third); - tmp1 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), - one_third); - - pack16to8(tmp0, tmp1, res); - _mm_store_si128(dst0++, res); - - s00 = _mm_loadu_si128((__m128i*)(row3 - 1)); - s01 = _mm_loadu_si128((__m128i*)(row3 + 1)); - s02 = _mm_load_si128((__m128i*)(row3)); - unpack8to16(s00, a00, b00); - unpack8to16(s01, a01, b01); - unpack8to16(s02, a02, b02); - ra00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb00 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); - - tmp0 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), - one_third); - tmp1 = _mm_mulhi_epi16( - _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), - one_third); - - pack16to8(tmp0, tmp1, res); - _mm_store_si128(dst1++, res); - - row0 += 16; - row1 += 16; - row2 += 16; - row3 += 16; - } - // still storing 128bit, but now in 16 x 8bit format, so /16 instead - // of /8 - dst0 += width / 16; - dst1 += width / 16; - } - }; // lambda - - boxFilterSegment(1, height - 3); - // parFor(boxFilterSegment,1,height-3,4); + #ifndef _INTRINSICS_SSE + boxNaive(in, blurred, width, height); + #else + boxSSE(in, blurred, width, height); + #endif #endif } -} +} // namespace ndb diff --git a/lib/gpc/kernels/box.hpp b/lib/gpc/kernels/box.hpp index bf9eea3..c5f2d0e 100644 --- a/lib/gpc/kernels/box.hpp +++ b/lib/gpc/kernels/box.hpp @@ -63,3 +63,7 @@ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads); } #endif + 
+ + + diff --git a/lib/gpc/kernels/box_hwy.cpp b/lib/gpc/kernels/box_hwy.cpp new file mode 100644 index 0000000..8573e40 --- /dev/null +++ b/lib/gpc/kernels/box_hwy.cpp @@ -0,0 +1,146 @@ + +// We define the target BEFORE including highway.h +// On Mac, this forces Highway to use NEON mode without the inclusion loop. +#define HWY_TARGET HWY_NEON +#include + +// We skip foreach_target.h entirely to avoid the "redefinition" and "path" errors. + +HWY_BEFORE_NAMESPACE(); +namespace ndb { +namespace HWY_NAMESPACE { +namespace hn = hwy::HWY_NAMESPACE; + +void BoxKernelNaive(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, int width, int height) { + const hn::ScalableTag d8; + const hn::ScalableTag d16; + const size_t N = hn::Lanes(d8); + const auto divisor = hn::Set(d16, (uint16_t)7282); // 65536 / 9 + + for (int y = 1; y < height - 1; ++y) { + const uint8_t* r0 = in + (y - 1) * width; + const uint8_t* r1 = in + y * width; + const uint8_t* r2 = in + (y + 1) * width; + + uint8_t* out_row = blurred + y * width + 1; + + for (int x = 0; x < width; x += N) { + // Row 0 + auto v11 = hn::LoadU(d8, r0 + x); + auto v12 = hn::LoadU(d8, r0 + x + 1); + auto v13 = hn::LoadU(d8, r0 + x + 2); + + // Row 1 + auto v21 = hn::LoadU(d8, r1 + x); + auto v22 = hn::LoadU(d8, r1 + x + 1); + auto v23 = hn::LoadU(d8, r1 + x + 2); + + // Row 2 + auto v31 = hn::LoadU(d8, r2 + x); + auto v32 = hn::LoadU(d8, r2 + x + 1); + auto v33 = hn::LoadU(d8, r2 + x + 2); + + // Vertical sums first (3 instructions per half-vector) + auto sum_col1_lo = hn::Add(hn::PromoteLowerTo(d16, v11), hn::Add(hn::PromoteLowerTo(d16, v21), hn::PromoteLowerTo(d16, v31))); + auto sum_col1_hi = hn::Add(hn::PromoteUpperTo(d16, v11), hn::Add(hn::PromoteUpperTo(d16, v21), hn::PromoteUpperTo(d16, v31))); + + auto sum_col2_lo = hn::Add(hn::PromoteLowerTo(d16, v12), hn::Add(hn::PromoteLowerTo(d16, v22), hn::PromoteLowerTo(d16, v32))); + auto sum_col2_hi = hn::Add(hn::PromoteUpperTo(d16, v12), 
hn::Add(hn::PromoteUpperTo(d16, v22), hn::PromoteUpperTo(d16, v32))); + + auto sum_col3_lo = hn::Add(hn::PromoteLowerTo(d16, v13), hn::Add(hn::PromoteLowerTo(d16, v23), hn::PromoteLowerTo(d16, v33))); + auto sum_col3_hi = hn::Add(hn::PromoteUpperTo(d16, v13), hn::Add(hn::PromoteUpperTo(d16, v23), hn::PromoteUpperTo(d16, v33))); + + // Horizontal accumulation + auto total_lo = hn::Add(sum_col1_lo, hn::Add(sum_col2_lo, sum_col3_lo)); + auto total_hi = hn::Add(sum_col1_hi, hn::Add(sum_col2_hi, sum_col3_hi)); + + // Fixed-point division by 9 + auto res_lo = hn::MulHigh(total_lo, divisor); + auto res_hi = hn::MulHigh(total_hi, divisor); + + hn::StoreU(hn::Combine(d8, hn::DemoteTo(d8, res_hi), hn::DemoteTo(d8, res_lo)), d8, out_row + x); + } + } +} +void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, int width, int height) { + const hn::ScalableTag d8; + const hn::ScalableTag d16; + const size_t N = hn::Lanes(d8); + const auto divisor = hn::Set(d16, (uint16_t)7282); + + // We process two output rows at a time (y and y+1) + // This requires 4 input rows (r0, r1, r2, r3) + for (int y = 1; y < height - 2; y += 2) { + const uint8_t* r0 = in + (y - 1) * width; + const uint8_t* r1 = in + y * width; + const uint8_t* r2 = in + (y + 1) * width; + const uint8_t* r3 = in + (y + 2) * width; + + uint8_t* out0 = blurred + y * width + 1; + uint8_t* out1 = blurred + (y + 1) * width + 1; + + for (int x = 0; x < width; x += N) { + // Load all 4 rows needed for 2 output rows + auto v0_0 = hn::LoadU(d8, r0 + x); + auto v0_1 = hn::LoadU(d8, r0 + x + 1); + auto v0_2 = hn::LoadU(d8, r0 + x + 2); + + auto v1_0 = hn::LoadU(d8, r1 + x); + auto v1_1 = hn::LoadU(d8, r1 + x + 1); + auto v1_2 = hn::LoadU(d8, r1 + x + 2); + + auto v2_0 = hn::LoadU(d8, r2 + x); + auto v2_1 = hn::LoadU(d8, r2 + x + 1); + auto v2_2 = hn::LoadU(d8, r2 + x + 2); + + auto v3_0 = hn::LoadU(d8, r3 + x); + auto v3_1 = hn::LoadU(d8, r3 + x + 1); + auto v3_2 = hn::LoadU(d8, r3 + x + 2); + + // Vertical 
sums for Row Pair 1 (Rows 0, 1, 2) + // Vertical sums for Row Pair 2 (Rows 1, 2, 3) + // Note: Rows 1 and 2 are REUSED. + + auto s1_lo = hn::Add(hn::PromoteLowerTo(d16, v1_1), hn::Add(hn::PromoteLowerTo(d16, v1_0), hn::PromoteLowerTo(d16, v1_2))); + auto s2_lo = hn::Add(hn::PromoteLowerTo(d16, v2_1), hn::Add(hn::PromoteLowerTo(d16, v2_0), hn::PromoteLowerTo(d16, v2_2))); + + // Output Row 0 logic + auto s0_lo = hn::Add(hn::PromoteLowerTo(d16, v0_1), hn::Add(hn::PromoteLowerTo(d16, v0_0), hn::PromoteLowerTo(d16, v0_2))); + auto row0_lo = hn::Add(s0_lo, hn::Add(s1_lo, s2_lo)); + + // Output Row 1 logic + auto s3_lo = hn::Add(hn::PromoteLowerTo(d16, v3_1), hn::Add(hn::PromoteLowerTo(d16, v3_0), hn::PromoteLowerTo(d16, v3_2))); + auto row1_lo = hn::Add(s3_lo, hn::Add(s1_lo, s2_lo)); + + // Repeat for high bits... + auto s1_hi = hn::Add(hn::PromoteUpperTo(d16, v1_1), hn::Add(hn::PromoteUpperTo(d16, v1_0), hn::PromoteUpperTo(d16, v1_2))); + auto s2_hi = hn::Add(hn::PromoteUpperTo(d16, v2_1), hn::Add(hn::PromoteUpperTo(d16, v2_0), hn::PromoteUpperTo(d16, v2_2))); + + auto s0_hi = hn::Add(hn::PromoteUpperTo(d16, v0_1), hn::Add(hn::PromoteUpperTo(d16, v0_0), hn::PromoteUpperTo(d16, v0_2))); + auto row0_hi = hn::Add(s0_hi, hn::Add(s1_hi, s2_hi)); + + auto s3_hi = hn::Add(hn::PromoteUpperTo(d16, v3_1), hn::Add(hn::PromoteUpperTo(d16, v3_0), hn::PromoteUpperTo(d16, v3_2))); + auto row1_hi = hn::Add(s3_hi, hn::Add(s1_hi, s2_hi)); + + // Store both rows + hn::StoreU(hn::Combine(d8, hn::DemoteTo(d8, hn::MulHigh(row0_hi, divisor)), + hn::DemoteTo(d8, hn::MulHigh(row0_lo, divisor))), d8, out0 + x); + hn::StoreU(hn::Combine(d8, hn::DemoteTo(d8, hn::MulHigh(row1_hi, divisor)), + hn::DemoteTo(d8, hn::MulHigh(row1_lo, divisor))), d8, out1 + x); + } + } +} + +} // namespace HWY_NAMESPACE +} // namespace ndb +HWY_AFTER_NAMESPACE(); + +namespace ndb { +namespace testing { + void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) { + // We call ghwthe NEON version directly. 
+ // Highway maps HWY_NAMESPACE to N_NEON because of our #define above. + ndb::N_NEON::BoxKernel(in, blurred, width, height); + } +} +} From 5e06b649af9363643a7b40f8be333b745eb381de Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 15 Feb 2026 14:14:31 +0100 Subject: [PATCH 13/36] highway sobel kernel --- CMakeLists.txt | 1 + lib/gpc/forest.cpp | 6 +- lib/gpc/kernels/box.cpp | 1 - lib/gpc/kernels/box_hwy.cpp | 7 - lib/gpc/kernels/sobel.cpp | 232 +++++++++++++--------------------- lib/gpc/kernels/sobel_hwy.cpp | 165 ++++++++++++++++++++++++ 6 files changed, 259 insertions(+), 153 deletions(-) create mode 100644 lib/gpc/kernels/sobel_hwy.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 56c5314..06b1991 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,7 @@ add_library(gpc_core lib/gpc/kernels/gpc.cpp lib/gpc/kernels/utils.cpp lib/gpc/kernels/box_hwy.cpp + lib/gpc/kernels/sobel_hwy.cpp ) target_link_libraries(gpc_core PUBLIC diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp index 5ba9f38..79f6894 100644 --- a/lib/gpc/forest.cpp +++ b/lib/gpc/forest.cpp @@ -212,18 +212,16 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer& img, ndb::Buffer smooth(img.rows(), img.cols()); smooth.width = img.width; - gpc::inference::time_point t0 = gpc::inference::sysTick(); ndb::box(img.data(), smooth.data(), img.cols(), img.rows(), settings.numThreads_); - gpc::inference::time_point t1 = gpc::inference::sysTick(); - cout << "box: " << gpc::inference::tickToMs(t1, t0) << " ms" << endl; smooth.clearBoundary(); ndb::Buffer grad(img.rows(), img.cols()); grad.width = img.width; ndb::Buffer maskTmp; + gpc::inference::time_point t0 = gpc::inference::sysTick(); ndb::sobel(img.data(), grad.data(), img.cols(), @@ -231,6 +229,8 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer& img, settings.gradientThreshold_, settings.numThreads_); + gpc::inference::time_point t1 = gpc::inference::sysTick(); + cout << "sobel: " << gpc::inference::tickToMs(t1, t0) << " ms" << 
endl; ndb::Buffer idx; idx.resize(grad.rows(), grad.cols()); auto ff = [&](ndb::Buffer& in, std::vector& out, int m) { diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp index 10215d4..0b611fb 100644 --- a/lib/gpc/kernels/box.cpp +++ b/lib/gpc/kernels/box.cpp @@ -174,5 +174,4 @@ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { #endif #endif } - } // namespace ndb diff --git a/lib/gpc/kernels/box_hwy.cpp b/lib/gpc/kernels/box_hwy.cpp index 8573e40..384360e 100644 --- a/lib/gpc/kernels/box_hwy.cpp +++ b/lib/gpc/kernels/box_hwy.cpp @@ -1,11 +1,6 @@ - -// We define the target BEFORE including highway.h -// On Mac, this forces Highway to use NEON mode without the inclusion loop. #define HWY_TARGET HWY_NEON #include -// We skip foreach_target.h entirely to avoid the "redefinition" and "path" errors. - HWY_BEFORE_NAMESPACE(); namespace ndb { namespace HWY_NAMESPACE { @@ -138,8 +133,6 @@ HWY_AFTER_NAMESPACE(); namespace ndb { namespace testing { void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) { - // We call ghwthe NEON version directly. - // Highway maps HWY_NAMESPACE to N_NEON because of our #define above. 
ndb::N_NEON::BoxKernel(in, blurred, width, height); } } diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp index 4d56716..c6a14bc 100644 --- a/lib/gpc/kernels/sobel.cpp +++ b/lib/gpc/kernels/sobel.cpp @@ -31,6 +31,9 @@ #include "gpc/kernels/sobel.hpp" namespace ndb { +namespace testing { + void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold); +} void sobelNaive( uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) { assert(width % 16 == 0 && "width must be multiple of 16!"); @@ -74,6 +77,84 @@ void sobelNaive( } } } +#ifdef _INTRINSICS_SSE +#include + +// Assuming your helper macros/inline funcs are defined elsewhere +// pack16to8(lo, hi, res) +// unpack8to16(in, lo, hi) + +void sobelSSE(const uint8_t* in, uint8_t* blurred, + int width, int start, int end, + uint8_t threshold) { + + __m128i zero = _mm_setzero_si128(); + __m128i one_ninth = _mm_set1_epi16(7282); // 2^16/9 + __m128i binThres = _mm_set1_epi16(threshold * threshold); + + for (int y = start; y < end; y++) { + const uint8_t* row1 = in + y * width; + const uint8_t* row0 = row1 - width; + const uint8_t* row2 = row1 + width; + + // Output destination for this specific row + __m128i* dst = (__m128i*)(blurred + y * width + 1); + + for (int x = 0; x < width; x += 16) { + __m128i a00, a01, a02, a10, a12, a20, a21, a22; + __m128i b00, b01, b02, b10, b12, b20, b21, b22; + __m128i raA, raB, rbA, rbB; + __m128i tmpa, tmpb, sya, syb, sxa, sxb, res; + + // Load and unpack 3x3 neighborhood (excluding center a11/b11) + unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x - 1)), a00, b00); + unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x)), a01, b01); + unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x + 1)), a02, b02); + + unpack8to16(_mm_loadu_si128((__m128i*)(row1 + x - 1)), a10, b10); + unpack8to16(_mm_loadu_si128((__m128i*)(row1 + x + 1)), a12, b12); + + unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x - 1)), a20, b20); + 
unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x)), a21, b21); + unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x + 1)), a22, b22); + + // --- SX Calculation --- + // Left col (1,2,1) + raA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), _mm_add_epi16(a10, a10)), one_ninth); + rbA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), _mm_add_epi16(b10, b10)), one_ninth); + // Right col (-1,-2,-1) + raB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), _mm_add_epi16(a12, a12)), one_ninth); + rbB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), _mm_add_epi16(b12, b12)), one_ninth); + + tmpa = _mm_sub_epi16(raA, raB); + tmpb = _mm_sub_epi16(rbA, rbB); + sxa = _mm_mullo_epi16(tmpa, tmpa); + sxb = _mm_mullo_epi16(tmpb, tmpb); + + // --- SY Calculation --- + // Top row (1,2,1) + raA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), _mm_add_epi16(a01, a01)), one_ninth); + rbA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), _mm_add_epi16(b01, b01)), one_ninth); + // Bottom row (-1,-2,-1) + raB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), _mm_add_epi16(a21, a21)), one_ninth); + rbB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), _mm_add_epi16(b21, b21)), one_ninth); + + tmpa = _mm_sub_epi16(raA, raB); + tmpb = _mm_sub_epi16(rbA, rbB); + sya = _mm_mullo_epi16(tmpa, tmpa); + syb = _mm_mullo_epi16(tmpb, tmpb); + + // --- Thresholding and Packing --- + pack16to8( + _mm_unpacklo_epi8(_mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), zero), + _mm_unpacklo_epi8(_mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), zero), + res); + + _mm_storeu_si128(dst++, res); + } + } +} +#endif void sobel(uint8_t* in, uint8_t* blurred, int width, @@ -81,149 +162,16 @@ void sobel(uint8_t* in, uint8_t threshold, int numThreads) { assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE - sobelNaive(in, blurred, width, height, threshold); +#if defined(__ARM_NEON) || defined(__aarch64__) + // Force use of 
our new Highway kernel on Mac + sobelNaive(in, blurred, width, height, threshold); + //testing::sobel_hwy(in, blurred, width, height, threshold); #else - auto sobelSSESegment = [&](int start, int end) { - __m128i one_third, one_ninth, one, two, mone, mtwo, binThres; - __m128i *dst0, *dst1; - __m128i zero = _mm_setzero_si128(); - - int x, y; - one_third = _mm_set1_epi16( - 21846); // 2^16/3+1. For 16bit ints. 2^8/3+1=86.33 for 8bit - one_ninth = _mm_set1_epi16(7282); // 2^16/9+1. For 16bit ints. - - binThres = _mm_set1_epi16(threshold * threshold); - - dst0 = (__m128i*)(blurred + width * 1); - // dst1 = (__m128i *)(blurred + width * 2); - for (y = start; y < end; - y++) { // We compute results for two rows in one iteration - const uint8_t *row0, *row1, *row2; - - row1 = in + y * width; - row0 = row1 - width; - row2 = row1 + width; - - for (x = 0; x < width; x += 16) { - // Note: Center element not used in sobel kernels!! - // Kernel indices: - // 00 01 02 - // 10 11 12 - // 20 21 22 - - __m128i a00, a01, a02, a10, a12, a20, a21, a22; - __m128i b00, b01, b02, b10, b12, b20, b21, b22; - - __m128i raA, raB, rbA, rbB; - __m128i tmpa, tmpb, sya, syb, sxa, sxb, res; - - unpack8to16(_mm_loadu_si128((__m128i*)(row0 - 1)), a00, b00); - unpack8to16(_mm_load_si128((__m128i*)(row0)), a01, b01); - unpack8to16(_mm_loadu_si128((__m128i*)(row0 + 1)), a02, b02); - - unpack8to16(_mm_loadu_si128((__m128i*)(row1 - 1)), a10, b10); - unpack8to16(_mm_loadu_si128((__m128i*)(row1 + 1)), a12, b12); - - unpack8to16(_mm_loadu_si128((__m128i*)(row2 - 1)), a20, b20); - unpack8to16(_mm_load_si128((__m128i*)(row2)), a21, b21); - unpack8to16(_mm_loadu_si128((__m128i*)(row2 + 1)), a22, b22); - - // Sobel kernels for x and y direction. 
- // 1 0 -1 1 2 1 - // sx = 2 0 -2 sy = 0 0 0 - // 1 0 -1 -1-2-1 - // Note that neither kernel uses the center element) - - // In the following, mullo is used to multiply intermediate - // results with -1 To divide by 3, 16bit overflow divide by - // multiply is used, which thus uses the upper 16bit(_mm_mulhi) - // of the 32bit temporary result. - - // sx column kernel vectors (1,2,1) - // Two chained add/sub are used for 2 and -2 - raA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), a10), - a10), - one_ninth); - rbA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), b10), - b10), - one_ninth); - - // sx column kernel vector (-1 -2 -1) - raB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), a12), - a12), - one_ninth); - rbB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), b12), - b12), - one_ninth); - - // Square of sx: Add squares of above temporaries into final sum - tmpa = _mm_sub_epi16(raA, raB); - tmpb = _mm_sub_epi16(rbA, rbB); - - sxa = _mm_mullo_epi16(tmpa, tmpa); - sxb = _mm_mullo_epi16(tmpb, tmpb); - - // sy row kernel vector (1,2,1) - // Two chained add are used for 2 and -2 - raA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), a01), - a01), - one_ninth); - rbA = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), b01), - b01), - one_ninth); - - // sy row kernel vector (-1 -2 -1) - raB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), a21), - a21), - one_ninth); - rbB = _mm_mulhi_epi16( - _mm_add_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), b21), - b21), - one_ninth); - - // Square of sx: Add squares of above temporaries into final sum - tmpa = _mm_sub_epi16(raA, raB); - tmpb = _mm_sub_epi16(rbA, rbB); - - // watch out, can't overwrite this - sya = _mm_mullo_epi16(tmpa, tmpa); - syb = _mm_mullo_epi16(tmpb, tmpb); - - __m128i zero = _mm_setzero_si128(); - - // The unpacklo is necessary 
because _mm_cmput_epi16 sets the - // output to 0xFFFF if the comparison is true. When packing - // 16bit to 8bit however, 0xFFFF will be interpreted (in a - // signed environment) as being negative, and hence set to 0, - // resulting in a 0 output everywhere. using unpacklo in between - // we get 0xFFFF->0xFF - pack16to8( - _mm_unpacklo_epi8( - _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), - zero), - _mm_unpacklo_epi8( - _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), - zero), - res); - - _mm_store_si128(dst0++, res); - - row0 += 16; - row1 += 16; - row2 += 16; - } // cols - } // rows - }; // Lambda - sobelSSESegment(1, height - 3); + #ifndef _INTRINSICS_SSE + sobelNaive(in, blurred, width, height, threshold); + #else + sobelSSE(in, blurred, width, 1, height - 1, threshold); + #endif #endif } } // namespace ndb diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp new file mode 100644 index 0000000..1e395c4 --- /dev/null +++ b/lib/gpc/kernels/sobel_hwy.cpp @@ -0,0 +1,165 @@ +#define HWY_TARGET HWY_NEON +#include + +HWY_BEFORE_NAMESPACE(); +namespace ndb { +namespace HWY_NAMESPACE { +namespace hn = hwy::HWY_NAMESPACE; + + +void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, + int width, int height, uint8_t threshold) { + const hn::ScalableTag d8; + const hn::Rebind> d16; // Signed 16-bit, half the lanes of d8 + const hn::Half d8_half; // Tag for half-width 8-bit loads + + const size_t N = hn::Lanes(d8); + const auto divisor = hn::Set(d16, (int16_t)7282); + const auto threshSq = hn::Set(d16, (int16_t)(threshold * threshold)); + const auto v255 = hn::Set(d16, 255); + const auto v0 = hn::Zero(d16); + + for (int y = 1; y < height - 1; ++y) { + const uint8_t* r0 = in + (y - 1) * width; + const uint8_t* r1 = in + y * width; + const uint8_t* r2 = in + (y + 1) * width; + uint8_t* out = gradient + y * width + 1; + + for (int x = 0; x < width; x += N) { + // Load full 128-bit vectors + auto v11 = hn::LoadU(d8, r0 
+ x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2); + auto v21 = hn::LoadU(d8, r1 + x); auto v23 = hn::LoadU(d8, r1 + x + 2); + auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2); + + // LOWER HALF PROCESSING + { + // PromoteTo signed 16-bit from the lower half of our 8-bit vectors + auto p11 = hn::PromoteTo(d16, hn::LowerHalf(v11)); + auto p12 = hn::PromoteTo(d16, hn::LowerHalf(v12)); + auto p13 = hn::PromoteTo(d16, hn::LowerHalf(v13)); + auto p21 = hn::PromoteTo(d16, hn::LowerHalf(v21)); + auto p23 = hn::PromoteTo(d16, hn::LowerHalf(v23)); + auto p31 = hn::PromoteTo(d16, hn::LowerHalf(v31)); + auto p32 = hn::PromoteTo(d16, hn::LowerHalf(v32)); + auto p33 = hn::PromoteTo(d16, hn::LowerHalf(v33)); + + auto sx = hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), + hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))); + sx = hn::MulHigh(sx, divisor); + + auto sy = hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), + hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))); + sy = hn::MulHigh(sy, divisor); + + auto mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy)); + auto mask = hn::Gt(mag, threshSq); + auto res_lo = hn::DemoteTo(d8_half, hn::IfThenElse(mask, v255, v0)); + + // UPPER HALF PROCESSING + auto u11 = hn::PromoteTo(d16, hn::UpperHalf(d8, v11)); + auto u12 = hn::PromoteTo(d16, hn::UpperHalf(d8, v12)); + auto u13 = hn::PromoteTo(d16, hn::UpperHalf(d8, v13)); + auto u21 = hn::PromoteTo(d16, hn::UpperHalf(d8, v21)); + auto u23 = hn::PromoteTo(d16, hn::UpperHalf(d8, v23)); + auto u31 = hn::PromoteTo(d16, hn::UpperHalf(d8, v31)); + auto u32 = hn::PromoteTo(d16, hn::UpperHalf(d8, v32)); + auto u33 = hn::PromoteTo(d16, hn::UpperHalf(d8, v33)); + + auto sx_u = hn::Sub(hn::Add(hn::Add(u11, u31), hn::Add(u21, u21)), + hn::Add(hn::Add(u13, u33), hn::Add(u23, u23))); + sx_u = hn::MulHigh(sx_u, divisor); + + auto sy_u = hn::Sub(hn::Add(hn::Add(u11, u13), hn::Add(u12, u12)), + 
hn::Add(hn::Add(u31, u33), hn::Add(u32, u32))); + sy_u = hn::MulHigh(sy_u, divisor); + + auto mag_u = hn::Add(hn::Mul(sx_u, sx_u), hn::Mul(sy_u, sy_u)); + auto mask_u = hn::Gt(mag_u, threshSq); + auto res_hi = hn::DemoteTo(d8_half, hn::IfThenElse(mask_u, v255, v0)); + + hn::StoreU(hn::Combine(d8, res_hi, res_lo), d8, out + x); + } + } + } +} +void SobelKerneli(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, + int width, int height, uint8_t threshold) { + const hn::ScalableTag d8; + const hn::Rebind> d16; + const hn::Half d8_half; + + const size_t N = hn::Lanes(d8); + // Multiply threshold by 9 BEFORE squaring to match the "no-division" math + int16_t tScaled = (int16_t)threshold * 9; + const auto threshSq = hn::Set(d16, tScaled * tScaled); + + const auto v255 = hn::Set(d16, 255); + const auto v0 = hn::Zero(d16); + + for (int y = 1; y < height - 1; ++y) { + const uint8_t* r0 = in + (y - 1) * width; + const uint8_t* r1 = in + y * width; + const uint8_t* r2 = in + (y + 1) * width; + uint8_t* out = gradient + y * width + 1; + + for (int x = 0; x < width; x += N) { + auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2); + auto v21 = hn::LoadU(d8, r1 + x); auto v23 = hn::LoadU(d8, r1 + x + 2); + auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2); + + // LOWER HALF + { + auto p11 = hn::PromoteTo(d16, hn::LowerHalf(v11)); + auto p12 = hn::PromoteTo(d16, hn::LowerHalf(v12)); + auto p13 = hn::PromoteTo(d16, hn::LowerHalf(v13)); + auto p21 = hn::PromoteTo(d16, hn::LowerHalf(v21)); + auto p23 = hn::PromoteTo(d16, hn::LowerHalf(v23)); + auto p31 = hn::PromoteTo(d16, hn::LowerHalf(v31)); + auto p32 = hn::PromoteTo(d16, hn::LowerHalf(v32)); + auto p33 = hn::PromoteTo(d16, hn::LowerHalf(v33)); + + auto sx = hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), + hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))); + auto sy = hn::Sub(hn::Add(hn::Add(p11, 
p13), hn::Add(p12, p12)), + hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))); + + // Removed MulHigh (division). Math is now: (sx*sx + sy*sy) > (threshold*9)^2 + auto mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy)); + auto mask = hn::Gt(mag, threshSq); + auto res_lo = hn::DemoteTo(d8_half, hn::IfThenElse(mask, v255, v0)); + + // UPPER HALF + auto u11 = hn::PromoteTo(d16, hn::UpperHalf(d8, v11)); + auto u12 = hn::PromoteTo(d16, hn::UpperHalf(d8, v12)); + auto u13 = hn::PromoteTo(d16, hn::UpperHalf(d8, v13)); + auto u21 = hn::PromoteTo(d16, hn::UpperHalf(d8, v21)); + auto u23 = hn::PromoteTo(d16, hn::UpperHalf(d8, v23)); + auto u31 = hn::PromoteTo(d16, hn::UpperHalf(d8, v31)); + auto u32 = hn::PromoteTo(d16, hn::UpperHalf(d8, v32)); + auto u33 = hn::PromoteTo(d16, hn::UpperHalf(d8, v33)); + + auto sx_u = hn::Sub(hn::Add(hn::Add(u11, u31), hn::Add(u21, u21)), + hn::Add(hn::Add(u13, u33), hn::Add(u23, u23))); + auto sy_u = hn::Sub(hn::Add(hn::Add(u11, u13), hn::Add(u12, u12)), + hn::Add(hn::Add(u31, u33), hn::Add(u32, u32))); + + auto mag_u = hn::Add(hn::Mul(sx_u, sx_u), hn::Mul(sy_u, sy_u)); + auto mask_u = hn::Gt(mag_u, threshSq); + auto res_hi = hn::DemoteTo(d8_half, hn::IfThenElse(mask_u, v255, v0)); + + hn::StoreU(hn::Combine(d8, res_hi, res_lo), d8, out + x); + } + } + } +} +} // namespace HWY_NAMESPACE +} // namespace ndb +HWY_AFTER_NAMESPACE(); + +namespace ndb { +namespace testing { + void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) { + ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold); + } +} +} From 0285f71f78ed87d8c9fc2cd7b06b02003edee541 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Wed, 18 Feb 2026 16:00:53 +0100 Subject: [PATCH 14/36] checkin --- CMakeLists.txt | 22 +++++++++++++++++++++- lib/gpc/forest.cpp | 12 +++++++----- lib/gpc/kernels/box.cpp | 1 + lib/gpc/kernels/census.cpp | 2 +- lib/gpc/kernels/gpc.cpp | 2 +- lib/gpc/kernels/sobel.cpp | 4 ++-- samples/sparsematch.cpp | 18 +++++++----------- 
tests/CMakeLists.txt | 26 +++++++++++++++++--------- tests/test_single_matching.cpp | 2 ++ 9 files changed, 59 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 06b1991..c072f48 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,9 @@ include(CheckCXXCompilerFlag) include(CheckCXXSourceRuns) project(openGPC CXX) set (REQ_CPP11_FEATURES cxx_strong_enums cxx_auto_type) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) +endif() set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) @@ -19,7 +22,12 @@ include_directories(lib) #By default, use SSE intrinsics option(SSE "Enable SSE/AVX optimizations if available" ON) -add_compile_options(-O3 -funroll-loops) +add_compile_options(-O3 -funroll-loops -flto) +if(APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") + add_compile_options(-mcpu=apple-m1) +elseif(NOT MSVC) + add_compile_options(-march=native) +endif() if(SSE) message(STATUS "Checking if target CPU supports AVX2 instructions...") check_cxx_source_runs(" @@ -46,6 +54,17 @@ FetchContent_Declare( ) FetchContent_MakeAvailable(highway) +include(FetchContent) +set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE) +set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE) +set(BENCHMARK_ENABLE_GTEST_LIB OFF CACHE BOOL "" FORCE) + +FetchContent_Declare( + google_benchmark + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG v1.9.5 +) +FetchContent_MakeAvailable(google_benchmark) add_library(gpc_core lib/gpc/forest.cpp lib/gpc/fern.cpp @@ -69,4 +88,5 @@ target_include_directories(gpc_core PUBLIC lib) enable_testing() add_subdirectory(samples) add_subdirectory(tests) +add_subdirectory(benchmarks) diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp index 79f6894..e39eaff 100644 --- a/lib/gpc/forest.cpp +++ b/lib/gpc/forest.cpp @@ -211,26 +211,26 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer& img, "gradientThreshold needs to be within 0...255"); ndb::Buffer 
smooth(img.rows(), img.cols()); + smooth.width = img.width; + // 0.2ms ndb::box(img.data(), smooth.data(), img.cols(), img.rows(), settings.numThreads_); + //4.2 *10^-5 ms smooth.clearBoundary(); ndb::Buffer grad(img.rows(), img.cols()); grad.width = img.width; - ndb::Buffer maskTmp; - gpc::inference::time_point t0 = gpc::inference::sysTick(); + //4.2*10-5ms (unclear how) ndb::sobel(img.data(), grad.data(), img.cols(), img.rows(), settings.gradientThreshold_, settings.numThreads_); - - gpc::inference::time_point t1 = gpc::inference::sysTick(); - cout << "sobel: " << gpc::inference::tickToMs(t1, t0) << " ms" << endl; + gpc::inference::time_point t0 = gpc::inference::sysTick(); ndb::Buffer idx; idx.resize(grad.rows(), grad.cols()); auto ff = [&](ndb::Buffer& in, std::vector& out, int m) { @@ -247,6 +247,8 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer& img, std::vector mask; ndb::arr2ind(grad.data(), grad.cols() * grad.rows(), idx.data(), &m); ff(idx, mask, m); + + gpc::inference::time_point t1 = gpc::inference::sysTick(); // Our outputs are: smooth, grad, mask; return PreprocessedImage(smooth, grad, mask); } diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp index 0b611fb..c9984dd 100644 --- a/lib/gpc/kernels/box.cpp +++ b/lib/gpc/kernels/box.cpp @@ -30,6 +30,7 @@ // Code Author: Niklaus Bamert (bamertn@ethz.ch) #include "gpc/kernels/box.hpp" +#include namespace ndb { namespace testing { void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height); diff --git a/lib/gpc/kernels/census.cpp b/lib/gpc/kernels/census.cpp index bd70613..6235b06 100644 --- a/lib/gpc/kernels/census.cpp +++ b/lib/gpc/kernels/census.cpp @@ -28,7 +28,7 @@ // POSSIBILITY OF SUCH DAMAGE. 
// // Code Author: Niklaus Bamert (bamertn@ethz.ch) - +#include #include "gpc/kernels/census.hpp" void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) { uint32_t val; diff --git a/lib/gpc/kernels/gpc.cpp b/lib/gpc/kernels/gpc.cpp index 636a9cb..5e22e23 100644 --- a/lib/gpc/kernels/gpc.cpp +++ b/lib/gpc/kernels/gpc.cpp @@ -28,7 +28,7 @@ // POSSIBILITY OF SUCH DAMAGE. // // Code Author: Niklaus Bamert (bamertn@ethz.ch) - +#include #include "gpc/kernels/gpc.hpp" namespace ndb { void gpcFilterNaive(uint8_t* in, diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp index c6a14bc..dc6e46b 100644 --- a/lib/gpc/kernels/sobel.cpp +++ b/lib/gpc/kernels/sobel.cpp @@ -28,7 +28,7 @@ // POSSIBILITY OF SUCH DAMAGE. // // Code Author: Niklaus Bamert (bamertn@ethz.ch) - +#include #include "gpc/kernels/sobel.hpp" namespace ndb { namespace testing { @@ -165,7 +165,7 @@ void sobel(uint8_t* in, #if defined(__ARM_NEON) || defined(__aarch64__) // Force use of our new Highway kernel on Mac sobelNaive(in, blurred, width, height, threshold); - //testing::sobel_hwy(in, blurred, width, height, threshold); + //testing::sobel_hwy(in, blurred, width, height, threshold); // not exact! #else #ifndef _INTRINSICS_SSE sobelNaive(in, blurred, width, height, threshold); diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp index 2554271..be0015a 100644 --- a/samples/sparsematch.cpp +++ b/samples/sparsematch.cpp @@ -51,7 +51,7 @@ int main(int argc, char** argv) { gpc::inference::InferenceSettings inferencesettings = gpc::inference::InferenceSettings() .builder() - .gradientThreshold(20) + .gradientThreshold(2) // gradientthres 20: matching ~3ms, 2: matching: ~30ms. 
.verticalTolerance( 0) // 0px tolerance for rectified epipolar matches .dispHigh(128) // limit disparities to 128 @@ -68,9 +68,12 @@ int main(int argc, char** argv) { gpc::inference::FilterMask fm = forest.readForest(forestPath, simg.cols(), simg.rows()); + for(int i = 0; i<10000; i++) { // Preprocess images (box filter, sobel filter, indices of high gradient // pixels) + gpc::inference::time_point t0 = gpc::inference::sysTick(); + gpc::inference::PreprocessedImage simgP = forest.preprocessImage(simg, inferencesettings); gpc::inference::PreprocessedImage timgP = @@ -81,15 +84,8 @@ int main(int argc, char** argv) { std::vector supp = forest.rectifiedMatch(simgP, timgP, fm, inferencesettings); gpc::inference::time_point t2 = gpc::inference::sysTick(); - cout << "tPreprocess: " << gpc::inference::tickToMs(t1, t0) << " ms" - << ", #candidatesL:" << simgP.mask.size() - << ", #candidatesR:" << timgP.mask.size() - << ", tMatch: " << gpc::inference::tickToMs(t2, t1) << " ms" - << ", num matches:" << supp.size() << std::endl; - - // Output sparse disparities overlayed on left input image - ndb::Buffer renderDisp; - renderDisp = ndb::getDisparityVisualization(simg, supp); - renderDisp.writePNGRGB("disparity.png"); + std::cout << "Preprocessing time: " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl; + std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms" << std::endl; + } test_hwy_neon(); } diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 564c125..5e527f7 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -11,14 +11,22 @@ FetchContent_MakeAvailable(approvaltests) find_package(GTest REQUIRED) -add_executable(test_single_matching test_single_matching.cpp) -#target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main Eigen3::Eigen) -target_link_libraries(test_single_matching - PRIVATE - gpc_core - ApprovalTests::ApprovalTests - GTest::gtest_main -) 
+function(add_gpc_approval_test TEST_NAME SOURCE_FILE) + add_executable(${TEST_NAME} ${SOURCE_FILE}) + + target_link_libraries(${TEST_NAME} + PRIVATE + gpc_core + ApprovalTests::ApprovalTests + GTest::gtest_main + ) + + + add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME}) +endfunction() + +add_gpc_approval_test(test_single_matching test_single_matching.cpp) +add_gpc_approval_test(test_kernel_box test_kernel_box.cpp) +add_gpc_approval_test(test_kernel_sobel test_kernel_sobel.cpp) -add_test(NAME single_matching COMMAND test_single_matching) diff --git a/tests/test_single_matching.cpp b/tests/test_single_matching.cpp index fdff603..e675a7c 100644 --- a/tests/test_single_matching.cpp +++ b/tests/test_single_matching.cpp @@ -52,3 +52,5 @@ TEST(Approval, Inference) EXPECT_EQ(866, supp.size()); ApprovalTests::Approvals::verify(ss.str()); } + + From 832a6516c791b86de798e989f1ac9b10fc985092 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Wed, 18 Feb 2026 19:35:51 +0100 Subject: [PATCH 15/36] cmakelist eigen include --- lib/gpc/inference.hpp | 1 - samples/CMakeLists.txt | 6 +++--- tests/CMakeLists.txt | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/lib/gpc/inference.hpp b/lib/gpc/inference.hpp index e1a887a..eddc709 100644 --- a/lib/gpc/inference.hpp +++ b/lib/gpc/inference.hpp @@ -465,7 +465,6 @@ class Forest { int numFerns; int type; ff >> numFerns; - cout << "number of ferns:" << numFerns << endl; for (int i = 0; i < numFerns; i++) { int fernID, numTests; std::string fernScale; diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index 3bbe11f..a7e73d7 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -1,10 +1,10 @@ add_executable(extract extract.cpp) -target_link_libraries(extract ${PNG_LIBRARIES} Threads::Threads) +target_link_libraries(extract ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen) add_executable(train train.cpp) -target_link_libraries(train ${PNG_LIBRARIES} Threads::Threads) +target_link_libraries(train 
${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen) add_executable(sparsematch sparsematch.cpp) -target_link_libraries(sparsematch ${PNG_LIBRARIES} Threads::Threads) +target_link_libraries(sparsematch ${PNG_LIBRARIES} Threads::Threads Eigen3::Eigen) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index acf86db..a211cac 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -12,7 +12,7 @@ FetchContent_MakeAvailable(approvaltests) find_package(GTest REQUIRED) add_executable(test_single_matching test_single_matching.cpp) -target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main) +target_link_libraries(test_single_matching PRIVATE ${PNG_LIBRARIES} ApprovalTests::ApprovalTests GTest::gtest_main Eigen3::Eigen) add_test(NAME single_matching COMMAND test_single_matching) From ca65a4284e7e73f1640093bb942045bdc0d45a58 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Wed, 18 Feb 2026 20:55:42 +0100 Subject: [PATCH 16/36] update dir --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 55e3851..56b841c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,4 +40,5 @@ endif() enable_testing() add_subdirectory(samples) add_subdirectory(tests) +add_subdirectory(benchmark) From d3e1c320d62830c6012a8f9c2c76dbd247d204fe Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Wed, 18 Feb 2026 21:02:29 +0100 Subject: [PATCH 17/36] update --- CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 56b841c..e96a542 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,13 @@ if(SSE) endif() endif() +FetchContent_Declare( + google_benchmark + GIT_REPOSITORY https://github.com/google/benchmark.git + GIT_TAG v1.9.5 +) +FetchContent_MakeAvailable(google_benchmark) + enable_testing() add_subdirectory(samples) add_subdirectory(tests) From 362f3f5be2960e6b601ce97dfc5df75c8d985f03 Mon Sep 17 00:00:00 2001 From: Nik 
Bamert Date: Wed, 18 Feb 2026 21:26:04 +0100 Subject: [PATCH 18/36] add benchmark. coarse runtime measurement --- CMakeLists.txt | 3 +- benchmarks/CMakeLists.txt | 8 +++++ benchmarks/kernel_bench.cpp | 69 +++++++++++++++++++++++++++++++++++++ 3 files changed, 79 insertions(+), 1 deletion(-) create mode 100644 benchmarks/CMakeLists.txt create mode 100644 benchmarks/kernel_bench.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e96a542..81d06f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,7 @@ set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +include(FetchContent) find_package(Eigen3 REQUIRED) find_package(PNG REQUIRED) find_package(Threads REQUIRED) @@ -47,5 +48,5 @@ FetchContent_MakeAvailable(google_benchmark) enable_testing() add_subdirectory(samples) add_subdirectory(tests) -add_subdirectory(benchmark) +add_subdirectory(benchmarks) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt new file mode 100644 index 0000000..ecbaa4d --- /dev/null +++ b/benchmarks/CMakeLists.txt @@ -0,0 +1,8 @@ +add_executable(kernel_bench kernel_bench.cpp) + +target_link_libraries(kernel_bench + PRIVATE + benchmark::benchmark + Eigen3::Eigen + ${PNG_LIBRARIES} +) diff --git a/benchmarks/kernel_bench.cpp b/benchmarks/kernel_bench.cpp new file mode 100644 index 0000000..13c1927 --- /dev/null +++ b/benchmarks/kernel_bench.cpp @@ -0,0 +1,69 @@ +#include +#include "gpc/inference.hpp" + +typedef gpc::inference::Forest GPCForest_t; +GPCForest_t forest; + +static void fullInference( + benchmark::State& state){ + + std::string forestPath = "../forests/defaultZeroForest.txt"; + std::string leftImgPath = "../data/middlebury/im0.png"; + std::string rightImgPath = "../data/middlebury/im1.png"; + gpc::inference::InferenceSettings inferencesettings = + gpc::inference::InferenceSettings() + .builder() + .gradientThreshold(state.range(0)) // 0...255 gradient threshold for sobel filter + .verticalTolerance( + 0) // 0px 
tolerance for rectified epipolar matches + .dispHigh(128) // limit disparities to 128 + .epipolarMode(true) // match GPC states in epipolar mode. more + // matches, lower accuracy than global + .useHashtable(false); // use sort method for matching. faster for + // <100K descriptors + + ndb::Buffer simg, timg; + // Load images + simg.readPNG(leftImgPath); + timg.readPNG(rightImgPath); + + // Get learned filter for the given image dimensions. + GPCForest_t::FilterMask fm = + forest.readForest(forestPath, simg.cols(), simg.rows()); + + + + for (auto _ : state) { + GPCForest_t::PreprocessedImage simgP = + forest.preprocessImage(simg, inferencesettings); + GPCForest_t::PreprocessedImage timgP = + forest.preprocessImage(timg, inferencesettings); + std::vector supp = + forest.rectifiedMatch(simgP, timgP, fm, inferencesettings); + state.counters["f_s"] = simgP.mask.size(); + state.counters["f_t"] = timgP.mask.size(); + state.counters["matches"] = supp.size(); + benchmark::DoNotOptimize(supp); + benchmark::ClobberMemory(); + } + +} + +BENCHMARK(fullInference) + ->Unit(benchmark::kMillisecond) + ->Args({0}) + ->Args({5}) + ->Args({100}); + + +BENCHMARK_MAIN(); +/* +int main(int argc, char** argv) { + + BenchmarkResults b = fullInference(simg,timg, fm, inferenceSettings); + for (const auto& [name, time] : b) { + cout << name << ", " << time << " ms" << endl; + } + +} +*/ From 75846ed2296655307fd74e2d7662f8df1511bcd8 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 22 Feb 2026 08:11:03 +0100 Subject: [PATCH 19/36] update legacy bench --- benchmarks/CMakeLists.txt | 16 ++++++++++++++++ benchmarks/box_legacy_bench.cpp | 21 +++++++++++++++++++++ benchmarks/kernel_bench.cpp | 17 +++++------------ benchmarks/sobel_legacy_bench.cpp | 21 +++++++++++++++++++++ 4 files changed, 63 insertions(+), 12 deletions(-) create mode 100644 benchmarks/box_legacy_bench.cpp create mode 100644 benchmarks/sobel_legacy_bench.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt 
index ecbaa4d..66b886b 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -6,3 +6,19 @@ target_link_libraries(kernel_bench Eigen3::Eigen ${PNG_LIBRARIES} ) +add_executable(sobel_legacy_bench sobel_legacy_bench.cpp) + +target_link_libraries(sobel_legacy_bench + PRIVATE + benchmark::benchmark + Eigen3::Eigen + ${PNG_LIBRARIES} +) +add_executable(box_legacy_bench box_legacy_bench.cpp) + +target_link_libraries(box_legacy_bench + PRIVATE + benchmark::benchmark + Eigen3::Eigen + ${PNG_LIBRARIES} +) diff --git a/benchmarks/box_legacy_bench.cpp b/benchmarks/box_legacy_bench.cpp new file mode 100644 index 0000000..9f06107 --- /dev/null +++ b/benchmarks/box_legacy_bench.cpp @@ -0,0 +1,21 @@ +#include +#include "gpc/filter.hpp" + +static void BM_BoxHighway(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + std::vector out(w * h, 0); + + for (auto _ : state) { + ndb::box(in.data(), out.data(), w, h, 50); + + // Ensure the compiler doesn't skip the work + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * w * h); +} +BENCHMARK(BM_BoxHighway)->Unit(benchmark::kMillisecond); + +BENCHMARK_MAIN(); diff --git a/benchmarks/kernel_bench.cpp b/benchmarks/kernel_bench.cpp index 13c1927..d86cfe8 100644 --- a/benchmarks/kernel_bench.cpp +++ b/benchmarks/kernel_bench.cpp @@ -40,8 +40,8 @@ static void fullInference( forest.preprocessImage(timg, inferencesettings); std::vector supp = forest.rectifiedMatch(simgP, timgP, fm, inferencesettings); - state.counters["f_s"] = simgP.mask.size(); - state.counters["f_t"] = timgP.mask.size(); + state.counters["candidates_s"] = simgP.mask.size(); + state.counters["candidates_t"] = timgP.mask.size(); state.counters["matches"] = supp.size(); benchmark::DoNotOptimize(supp); benchmark::ClobberMemory(); @@ -53,17 +53,10 @@ BENCHMARK(fullInference) ->Unit(benchmark::kMillisecond) ->Args({0}) ->Args({5}) + ->Args({10}) + 
->Args({20}) + ->Args({50}) ->Args({100}); BENCHMARK_MAIN(); -/* -int main(int argc, char** argv) { - - BenchmarkResults b = fullInference(simg,timg, fm, inferenceSettings); - for (const auto& [name, time] : b) { - cout << name << ", " << time << " ms" << endl; - } - -} -*/ diff --git a/benchmarks/sobel_legacy_bench.cpp b/benchmarks/sobel_legacy_bench.cpp new file mode 100644 index 0000000..406dd6a --- /dev/null +++ b/benchmarks/sobel_legacy_bench.cpp @@ -0,0 +1,21 @@ +#include +#include "gpc/filter.hpp" + +static void BM_SobelHighway(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + std::vector out(w * h, 0); + + for (auto _ : state) { + ndb::sobel(in.data(), out.data(), w, h, 50, 1); + + // Ensure the compiler doesn't skip the work + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * w * h); +} +BENCHMARK(BM_SobelHighway)->Unit(benchmark::kMillisecond); + +BENCHMARK_MAIN(); From 0b37d4538dbe3cf42617bfac86e83b982363f36b Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 22 Feb 2026 10:28:24 +0100 Subject: [PATCH 20/36] benchmark, perf, box and sobel acceptance tests --- benchmarks/CMakeLists_decouple_branch.txt | 11 ++++++ benchmarks/sobel_bench.cpp | 22 ++++++++++++ lib/gpc/kernels/box_hwy.hpp | 18 ++++++++++ lib/gpc/kernels/sobel_hwy.hpp | 17 +++++++++ tests/test_kernel_box.cpp | 44 +++++++++++++++++++++++ tests/test_kernel_sobel.cpp | 44 +++++++++++++++++++++++ 6 files changed, 156 insertions(+) create mode 100644 benchmarks/CMakeLists_decouple_branch.txt create mode 100644 benchmarks/sobel_bench.cpp create mode 100644 lib/gpc/kernels/box_hwy.hpp create mode 100644 lib/gpc/kernels/sobel_hwy.hpp create mode 100644 tests/test_kernel_box.cpp create mode 100644 tests/test_kernel_sobel.cpp diff --git a/benchmarks/CMakeLists_decouple_branch.txt b/benchmarks/CMakeLists_decouple_branch.txt new file mode 100644 index 0000000..efecb51 --- /dev/null +++ 
b/benchmarks/CMakeLists_decouple_branch.txt @@ -0,0 +1,11 @@ +add_executable(kernel_bench sobel_bench.cpp) + +target_link_libraries(kernel_bench + PRIVATE + gpc_core + benchmark::benchmark + hwy +) + +# allows the compiler to inline Highway kernels into the benchmark loop +set_target_properties(kernel_bench PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE) diff --git a/benchmarks/sobel_bench.cpp b/benchmarks/sobel_bench.cpp new file mode 100644 index 0000000..e0a65d1 --- /dev/null +++ b/benchmarks/sobel_bench.cpp @@ -0,0 +1,22 @@ +#include +#include "gpc/kernels/sobel_hwy.hpp" // Your header + +static void BM_SobelHighway(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + std::vector out(w * h, 0); + + // Warmup is handled automatically by the library + for (auto _ : state) { + ndb::testing::sobel_hwy(in.data(), out.data(), w, h, 50); + + // Ensure the compiler doesn't skip the work + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } + + state.SetBytesProcessed(int64_t(state.iterations()) * w * h); +} +BENCHMARK(BM_SobelHighway)->Unit(benchmark::kMillisecond); + +BENCHMARK_MAIN(); diff --git a/lib/gpc/kernels/box_hwy.hpp b/lib/gpc/kernels/box_hwy.hpp new file mode 100644 index 0000000..6c256b0 --- /dev/null +++ b/lib/gpc/kernels/box_hwy.hpp @@ -0,0 +1,18 @@ +#ifndef __NDB__KERNEL_BOX_HWY +#define __NDB__KERNEL_BOX_HWY + +#include + +namespace ndb { + +namespace testing { + /** + * Entry point for benchmarking the MulHigh (approximate) version. 
+ */ + void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height); + +} + +} // namespace ndb + +#endif // GPC_KERNELS_BOX_HWY_H_ diff --git a/lib/gpc/kernels/sobel_hwy.hpp b/lib/gpc/kernels/sobel_hwy.hpp new file mode 100644 index 0000000..bc99199 --- /dev/null +++ b/lib/gpc/kernels/sobel_hwy.hpp @@ -0,0 +1,17 @@ +#ifndef __NDB__KERNEL_SOBEL_HWY +#define __NDB__KERNEL_SOBEL_HWY + +#include + +namespace ndb { + +namespace testing { + /** + * Entry point for benchmarking the MulHigh (approximate) version. + */ + void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold); +} + +} // namespace ndb + +#endif // GPC_KERNELS_SOBEL_HWY_H_ diff --git a/tests/test_kernel_box.cpp b/tests/test_kernel_box.cpp new file mode 100644 index 0000000..8fff2ce --- /dev/null +++ b/tests/test_kernel_box.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include "gpc/kernels/box.hpp" // Naive version +#include "gpc/kernels/box_hwy.hpp" // Highway version + +TEST(Approval, BoxKernel) { + const int width = 640; + const int height = 480; + const int radius = 2; // Typical for 5x5 box + + // 1. Prepare randomized input + std::vector input(width * height); + std::mt19937 gen(42); + std::uniform_int_distribution<> dis(0, 255); + for (auto& val : input) val = dis(gen); + + // 2. Prepare output buffers + std::vector outNaive(width * height, 0); + std::vector outHighway(width * height, 0); + + // 3. Run Naive version + ndb::boxNaive(input.data(), outNaive.data(), width, height); + + // 4. Run Highway version (only if compiled for the target) +#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON + ndb::N_NEON::BoxFilter(input.data(), outHighway.data(), width, height); +#else + // Fallback if the specific NEON namespace isn't exposed + ndb::testing::box_hwy(input.data(), outHighway.data(), width, height); + +#endif + + // 5. Compare results + // We skip the border (radius) because different implementations + // might handle edges differently. 
+ for (int y = radius; y < height - radius; ++y) { + for (int x = radius; x < width - radius; ++x) { + int idx = y * width + x; + ASSERT_EQ(outNaive[idx], outHighway[idx]) + << "Mismatch at (" << x << "," << y << ")"; + } + } +} diff --git a/tests/test_kernel_sobel.cpp b/tests/test_kernel_sobel.cpp new file mode 100644 index 0000000..fd5b30d --- /dev/null +++ b/tests/test_kernel_sobel.cpp @@ -0,0 +1,44 @@ +#include +#include +#include +#include "gpc/kernels/sobel.hpp" // Naive version +#include "gpc/kernels/sobel_hwy.hpp" // Highway version + +TEST(Approval, SobelKernel) { + const int width = 640; + const int height = 480; + const int radius = 2; // Typical for 5x5 box + + // 1. Prepare randomized input + std::vector input(width * height); + std::mt19937 gen(42); + std::uniform_int_distribution<> dis(0, 255); + for (auto& val : input) val = dis(gen); + + // 2. Prepare output buffers + std::vector outNaive(width * height, 0); + std::vector outHighway(width * height, 0); + + // 3. Run Naive version + ndb::sobelNaive(input.data(), outNaive.data(), width, height, 30); + + // 4. Run Highway version (only if compiled for the target) +#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON + ndb::N_NEON::BoxFilter(input.data(), outHighway.data(), width, height); +#else + // Fallback if the specific NEON namespace isn't exposed + ndb::testing::sobel_hwy(input.data(), outHighway.data(), width, height, 30); + +#endif + + // 5. Compare results + // We skip the border (radius) because different implementations + // might handle edges differently. 
+ for (int y = radius; y < height - radius; ++y) { + for (int x = radius; x < width - radius; ++x) { + int idx = y * width + x; + ASSERT_EQ(outNaive[idx], outHighway[idx]) + << "Mismatch at (" << x << "," << y << ")"; + } + } +} From 018e23c91a240a086a89242263e115ce5531d6bf Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 22 Feb 2026 10:31:09 +0100 Subject: [PATCH 21/36] move --- benchmarks/{CMakeLists_decouple_branch.txt => CMakeLists.txt} | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename benchmarks/{CMakeLists_decouple_branch.txt => CMakeLists.txt} (76%) diff --git a/benchmarks/CMakeLists_decouple_branch.txt b/benchmarks/CMakeLists.txt similarity index 76% rename from benchmarks/CMakeLists_decouple_branch.txt rename to benchmarks/CMakeLists.txt index efecb51..6a97132 100644 --- a/benchmarks/CMakeLists_decouple_branch.txt +++ b/benchmarks/CMakeLists.txt @@ -7,5 +7,5 @@ target_link_libraries(kernel_bench hwy ) -# allows the compiler to inline Highway kernels into the benchmark loop +G# allows the compiler to inline Highway kernels into the benchmark loop set_target_properties(kernel_bench PROPERTIES INTERPROCEDURAL_OPTIMIZATION TRUE) From 83c18b50239a7c3141f2d7f0e6bcf88ee07f97e4 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 22 Feb 2026 12:36:08 +0100 Subject: [PATCH 22/36] rename --- benchmarks/box_legacy_bench.cpp | 4 ++-- benchmarks/sobel_legacy_bench.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/box_legacy_bench.cpp b/benchmarks/box_legacy_bench.cpp index 39a5edf..c8960a3 100644 --- a/benchmarks/box_legacy_bench.cpp +++ b/benchmarks/box_legacy_bench.cpp @@ -1,7 +1,7 @@ #include #include "gpc/kernels/box.hpp" -static void BM_BoxHighway(benchmark::State& state) { +static void BM_BoxLegacy(benchmark::State& state) { int w = 1920, h = 1080; std::vector in(w * h, 128); std::vector out(w * h, 0); @@ -16,6 +16,6 @@ static void BM_BoxHighway(benchmark::State& state) { 
state.SetBytesProcessed(int64_t(state.iterations()) * w * h); } -BENCHMARK(BM_BoxHighway)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_BoxLegacy)->Unit(benchmark::kMillisecond); BENCHMARK_MAIN(); diff --git a/benchmarks/sobel_legacy_bench.cpp b/benchmarks/sobel_legacy_bench.cpp index b20d502..d00c2c6 100644 --- a/benchmarks/sobel_legacy_bench.cpp +++ b/benchmarks/sobel_legacy_bench.cpp @@ -1,7 +1,7 @@ #include #include "gpc/kernels/sobel.hpp" -static void BM_SobelHighway(benchmark::State& state) { +static void BM_SobelLegacy(benchmark::State& state) { int w = 1920, h = 1080; std::vector in(w * h, 128); std::vector out(w * h, 0); @@ -16,6 +16,6 @@ static void BM_SobelHighway(benchmark::State& state) { state.SetBytesProcessed(int64_t(state.iterations()) * w * h); } -BENCHMARK(BM_SobelHighway)->Unit(benchmark::kMillisecond); +BENCHMARK(BM_SobelLegacy)->Unit(benchmark::kMillisecond); BENCHMARK_MAIN(); From 5358de18d0d47f10622ee6e765cb5185f797ec0b Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 22 Feb 2026 12:57:26 +0100 Subject: [PATCH 23/36] update hwy kernels --- lib/gpc/kernels/box_hwy.cpp | 125 +++++++----------------------- lib/gpc/kernels/sobel_hwy.cpp | 139 +++++++--------------------------- lib/gpc/kernels/utils.cpp | 1 + 3 files changed, 56 insertions(+), 209 deletions(-) diff --git a/lib/gpc/kernels/box_hwy.cpp b/lib/gpc/kernels/box_hwy.cpp index 384360e..7cdd983 100644 --- a/lib/gpc/kernels/box_hwy.cpp +++ b/lib/gpc/kernels/box_hwy.cpp @@ -6,65 +6,15 @@ namespace ndb { namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; -void BoxKernelNaive(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, int width, int height) { - const hn::ScalableTag d8; - const hn::ScalableTag d16; - const size_t N = hn::Lanes(d8); - const auto divisor = hn::Set(d16, (uint16_t)7282); // 65536 / 9 - - for (int y = 1; y < height - 1; ++y) { - const uint8_t* r0 = in + (y - 1) * width; - const uint8_t* r1 = in + y * width; - const uint8_t* r2 = in + (y + 1) * 
width; - - uint8_t* out_row = blurred + y * width + 1; - - for (int x = 0; x < width; x += N) { - // Row 0 - auto v11 = hn::LoadU(d8, r0 + x); - auto v12 = hn::LoadU(d8, r0 + x + 1); - auto v13 = hn::LoadU(d8, r0 + x + 2); - - // Row 1 - auto v21 = hn::LoadU(d8, r1 + x); - auto v22 = hn::LoadU(d8, r1 + x + 1); - auto v23 = hn::LoadU(d8, r1 + x + 2); - - // Row 2 - auto v31 = hn::LoadU(d8, r2 + x); - auto v32 = hn::LoadU(d8, r2 + x + 1); - auto v33 = hn::LoadU(d8, r2 + x + 2); - - // Vertical sums first (3 instructions per half-vector) - auto sum_col1_lo = hn::Add(hn::PromoteLowerTo(d16, v11), hn::Add(hn::PromoteLowerTo(d16, v21), hn::PromoteLowerTo(d16, v31))); - auto sum_col1_hi = hn::Add(hn::PromoteUpperTo(d16, v11), hn::Add(hn::PromoteUpperTo(d16, v21), hn::PromoteUpperTo(d16, v31))); - - auto sum_col2_lo = hn::Add(hn::PromoteLowerTo(d16, v12), hn::Add(hn::PromoteLowerTo(d16, v22), hn::PromoteLowerTo(d16, v32))); - auto sum_col2_hi = hn::Add(hn::PromoteUpperTo(d16, v12), hn::Add(hn::PromoteUpperTo(d16, v22), hn::PromoteUpperTo(d16, v32))); - - auto sum_col3_lo = hn::Add(hn::PromoteLowerTo(d16, v13), hn::Add(hn::PromoteLowerTo(d16, v23), hn::PromoteLowerTo(d16, v33))); - auto sum_col3_hi = hn::Add(hn::PromoteUpperTo(d16, v13), hn::Add(hn::PromoteUpperTo(d16, v23), hn::PromoteUpperTo(d16, v33))); - // Horizontal accumulation - auto total_lo = hn::Add(sum_col1_lo, hn::Add(sum_col2_lo, sum_col3_lo)); - auto total_hi = hn::Add(sum_col1_hi, hn::Add(sum_col2_hi, sum_col3_hi)); - - // Fixed-point division by 9 - auto res_lo = hn::MulHigh(total_lo, divisor); - auto res_hi = hn::MulHigh(total_hi, divisor); - - hn::StoreU(hn::Combine(d8, hn::DemoteTo(d8, res_hi), hn::DemoteTo(d8, res_lo)), d8, out_row + x); - } - } -} void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, int width, int height) { const hn::ScalableTag d8; - const hn::ScalableTag d16; + // We need d16 to be the "Promoted" version of the half-width d8 to stay lane-consistent + const 
hn::Rebind> d16; + const size_t N = hn::Lanes(d8); const auto divisor = hn::Set(d16, (uint16_t)7282); - // We process two output rows at a time (y and y+1) - // This requires 4 input rows (r0, r1, r2, r3) for (int y = 1; y < height - 2; y += 2) { const uint8_t* r0 = in + (y - 1) * width; const uint8_t* r1 = in + y * width; @@ -75,65 +25,46 @@ void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, in uint8_t* out1 = blurred + (y + 1) * width + 1; for (int x = 0; x < width; x += N) { - // Load all 4 rows needed for 2 output rows - auto v0_0 = hn::LoadU(d8, r0 + x); - auto v0_1 = hn::LoadU(d8, r0 + x + 1); - auto v0_2 = hn::LoadU(d8, r0 + x + 2); - - auto v1_0 = hn::LoadU(d8, r1 + x); - auto v1_1 = hn::LoadU(d8, r1 + x + 1); - auto v1_2 = hn::LoadU(d8, r1 + x + 2); - - auto v2_0 = hn::LoadU(d8, r2 + x); - auto v2_1 = hn::LoadU(d8, r2 + x + 1); - auto v2_2 = hn::LoadU(d8, r2 + x + 2); - - auto v3_0 = hn::LoadU(d8, r3 + x); - auto v3_1 = hn::LoadU(d8, r3 + x + 1); - auto v3_2 = hn::LoadU(d8, r3 + x + 2); - - // Vertical sums for Row Pair 1 (Rows 0, 1, 2) - // Vertical sums for Row Pair 2 (Rows 1, 2, 3) - // Note: Rows 1 and 2 are REUSED. 
- - auto s1_lo = hn::Add(hn::PromoteLowerTo(d16, v1_1), hn::Add(hn::PromoteLowerTo(d16, v1_0), hn::PromoteLowerTo(d16, v1_2))); - auto s2_lo = hn::Add(hn::PromoteLowerTo(d16, v2_1), hn::Add(hn::PromoteLowerTo(d16, v2_0), hn::PromoteLowerTo(d16, v2_2))); + auto v0_0 = hn::LoadU(d8, r0 + x); auto v0_1 = hn::LoadU(d8, r0 + x + 1); auto v0_2 = hn::LoadU(d8, r0 + x + 2); + auto v1_0 = hn::LoadU(d8, r1 + x); auto v1_1 = hn::LoadU(d8, r1 + x + 1); auto v1_2 = hn::LoadU(d8, r1 + x + 2); + auto v2_0 = hn::LoadU(d8, r2 + x); auto v2_1 = hn::LoadU(d8, r2 + x + 1); auto v2_2 = hn::LoadU(d8, r2 + x + 2); + auto v3_0 = hn::LoadU(d8, r3 + x); auto v3_1 = hn::LoadU(d8, r3 + x + 1); auto v3_2 = hn::LoadU(d8, r3 + x + 2); + + // Helper to sum 3 promoted pixels + auto sum3 = [&](auto v0, auto v1, auto v2) { + return hn::Add(v1, hn::Add(v0, v2)); + }; + + // LOWER HALF + auto s1_lo = sum3(hn::PromoteTo(d16, hn::LowerHalf(v1_0)), hn::PromoteTo(d16, hn::LowerHalf(v1_1)), hn::PromoteTo(d16, hn::LowerHalf(v1_2))); + auto s2_lo = sum3(hn::PromoteTo(d16, hn::LowerHalf(v2_0)), hn::PromoteTo(d16, hn::LowerHalf(v2_1)), hn::PromoteTo(d16, hn::LowerHalf(v2_2))); - // Output Row 0 logic - auto s0_lo = hn::Add(hn::PromoteLowerTo(d16, v0_1), hn::Add(hn::PromoteLowerTo(d16, v0_0), hn::PromoteLowerTo(d16, v0_2))); - auto row0_lo = hn::Add(s0_lo, hn::Add(s1_lo, s2_lo)); + auto row0_lo = hn::Add(sum3(hn::PromoteTo(d16, hn::LowerHalf(v0_0)), hn::PromoteTo(d16, hn::LowerHalf(v0_1)), hn::PromoteTo(d16, hn::LowerHalf(v0_2))), hn::Add(s1_lo, s2_lo)); + auto row1_lo = hn::Add(sum3(hn::PromoteTo(d16, hn::LowerHalf(v3_0)), hn::PromoteTo(d16, hn::LowerHalf(v3_1)), hn::PromoteTo(d16, hn::LowerHalf(v3_2))), hn::Add(s1_lo, s2_lo)); - // Output Row 1 logic - auto s3_lo = hn::Add(hn::PromoteLowerTo(d16, v3_1), hn::Add(hn::PromoteLowerTo(d16, v3_0), hn::PromoteLowerTo(d16, v3_2))); - auto row1_lo = hn::Add(s3_lo, hn::Add(s1_lo, s2_lo)); - - // Repeat for high bits... 
- auto s1_hi = hn::Add(hn::PromoteUpperTo(d16, v1_1), hn::Add(hn::PromoteUpperTo(d16, v1_0), hn::PromoteUpperTo(d16, v1_2))); - auto s2_hi = hn::Add(hn::PromoteUpperTo(d16, v2_1), hn::Add(hn::PromoteUpperTo(d16, v2_0), hn::PromoteUpperTo(d16, v2_2))); + // UPPER HALF + auto s1_hi = sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v1_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v1_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v1_2))); + auto s2_hi = sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v2_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v2_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v2_2))); - auto s0_hi = hn::Add(hn::PromoteUpperTo(d16, v0_1), hn::Add(hn::PromoteUpperTo(d16, v0_0), hn::PromoteUpperTo(d16, v0_2))); - auto row0_hi = hn::Add(s0_hi, hn::Add(s1_hi, s2_hi)); - - auto s3_hi = hn::Add(hn::PromoteUpperTo(d16, v3_1), hn::Add(hn::PromoteUpperTo(d16, v3_0), hn::PromoteUpperTo(d16, v3_2))); - auto row1_hi = hn::Add(s3_hi, hn::Add(s1_hi, s2_hi)); + auto row0_hi = hn::Add(sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v0_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v0_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v0_2))), hn::Add(s1_hi, s2_hi)); + auto row1_hi = hn::Add(sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v3_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v3_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v3_2))), hn::Add(s1_hi, s2_hi)); - // Store both rows - hn::StoreU(hn::Combine(d8, hn::DemoteTo(d8, hn::MulHigh(row0_hi, divisor)), - hn::DemoteTo(d8, hn::MulHigh(row0_lo, divisor))), d8, out0 + x); - hn::StoreU(hn::Combine(d8, hn::DemoteTo(d8, hn::MulHigh(row1_hi, divisor)), - hn::DemoteTo(d8, hn::MulHigh(row1_lo, divisor))), d8, out1 + x); + // Perform normalization and store using OrderedDemote2To + hn::StoreU(hn::OrderedDemote2To(d8, hn::MulHigh(row0_lo, divisor), hn::MulHigh(row0_hi, divisor)), d8, out0 + x); + hn::StoreU(hn::OrderedDemote2To(d8, hn::MulHigh(row1_lo, divisor), hn::MulHigh(row1_hi, divisor)), d8, out1 + x); } } } - } // namespace HWY_NAMESPACE } // namespace ndb 
HWY_AFTER_NAMESPACE(); namespace ndb { namespace testing { +#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) { ndb::N_NEON::BoxKernel(in, blurred, width, height); } +#endif } } diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp index 1e395c4..97abee3 100644 --- a/lib/gpc/kernels/sobel_hwy.cpp +++ b/lib/gpc/kernels/sobel_hwy.cpp @@ -6,12 +6,11 @@ namespace ndb { namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; - void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, int width, int height, uint8_t threshold) { const hn::ScalableTag d8; - const hn::Rebind> d16; // Signed 16-bit, half the lanes of d8 - const hn::Half d8_half; // Tag for half-width 8-bit loads + // d16 will have half the lanes of d8, regardless of whether N=16 (NEON) or N=32 (AVX2) + const hn::Rebind> d16; const size_t N = hn::Lanes(d8); const auto divisor = hn::Set(d16, (int16_t)7282); @@ -26,23 +25,13 @@ void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, uint8_t* out = gradient + y * width + 1; for (int x = 0; x < width; x += N) { - // Load full 128-bit vectors + // Load full vectors (128-bit on NEON, 256-bit on AVX2) auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2); auto v21 = hn::LoadU(d8, r1 + x); auto v23 = hn::LoadU(d8, r1 + x + 2); auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2); - // LOWER HALF PROCESSING - { - // PromoteTo signed 16-bit from the lower half of our 8-bit vectors - auto p11 = hn::PromoteTo(d16, hn::LowerHalf(v11)); - auto p12 = hn::PromoteTo(d16, hn::LowerHalf(v12)); - auto p13 = hn::PromoteTo(d16, hn::LowerHalf(v13)); - auto p21 = hn::PromoteTo(d16, hn::LowerHalf(v21)); - auto p23 = hn::PromoteTo(d16, hn::LowerHalf(v23)); - auto p31 = hn::PromoteTo(d16, hn::LowerHalf(v31)); - auto p32 = 
hn::PromoteTo(d16, hn::LowerHalf(v32)); - auto p33 = hn::PromoteTo(d16, hn::LowerHalf(v33)); - + // Helper lambda to process 8-bit to 16-bit math for a specific half + auto process_half = [&](auto p11, auto p12, auto p13, auto p21, auto p23, auto p31, auto p32, auto p33) { auto sx = hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))); sx = hn::MulHigh(sx, divisor); @@ -52,114 +41,40 @@ void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, sy = hn::MulHigh(sy, divisor); auto mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy)); - auto mask = hn::Gt(mag, threshSq); - auto res_lo = hn::DemoteTo(d8_half, hn::IfThenElse(mask, v255, v0)); - - // UPPER HALF PROCESSING - auto u11 = hn::PromoteTo(d16, hn::UpperHalf(d8, v11)); - auto u12 = hn::PromoteTo(d16, hn::UpperHalf(d8, v12)); - auto u13 = hn::PromoteTo(d16, hn::UpperHalf(d8, v13)); - auto u21 = hn::PromoteTo(d16, hn::UpperHalf(d8, v21)); - auto u23 = hn::PromoteTo(d16, hn::UpperHalf(d8, v23)); - auto u31 = hn::PromoteTo(d16, hn::UpperHalf(d8, v31)); - auto u32 = hn::PromoteTo(d16, hn::UpperHalf(d8, v32)); - auto u33 = hn::PromoteTo(d16, hn::UpperHalf(d8, v33)); - - auto sx_u = hn::Sub(hn::Add(hn::Add(u11, u31), hn::Add(u21, u21)), - hn::Add(hn::Add(u13, u33), hn::Add(u23, u23))); - sx_u = hn::MulHigh(sx_u, divisor); - - auto sy_u = hn::Sub(hn::Add(hn::Add(u11, u13), hn::Add(u12, u12)), - hn::Add(hn::Add(u31, u33), hn::Add(u32, u32))); - sy_u = hn::MulHigh(sy_u, divisor); - - auto mag_u = hn::Add(hn::Mul(sx_u, sx_u), hn::Mul(sy_u, sy_u)); - auto mask_u = hn::Gt(mag_u, threshSq); - auto res_hi = hn::DemoteTo(d8_half, hn::IfThenElse(mask_u, v255, v0)); - - hn::StoreU(hn::Combine(d8, res_hi, res_lo), d8, out + x); - } + return hn::IfThenElse(hn::Gt(mag, threshSq), v255, v0); + }; + + // 1. 
Promote and process Lower Half + auto res_lo = process_half( + hn::PromoteTo(d16, hn::LowerHalf(v11)), hn::PromoteTo(d16, hn::LowerHalf(v12)), hn::PromoteTo(d16, hn::LowerHalf(v13)), + hn::PromoteTo(d16, hn::LowerHalf(v21)), hn::PromoteTo(d16, hn::LowerHalf(v23)), + hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::PromoteTo(d16, hn::LowerHalf(v32)), hn::PromoteTo(d16, hn::LowerHalf(v33)) + ); + + // 2. Promote and process Upper Half + auto res_hi = process_half( + hn::PromoteTo(d16, hn::UpperHalf(d8, v11)), hn::PromoteTo(d16, hn::UpperHalf(d8, v12)), hn::PromoteTo(d16, hn::UpperHalf(d8, v13)), + hn::PromoteTo(d16, hn::UpperHalf(d8, v21)), hn::PromoteTo(d16, hn::UpperHalf(d8, v23)), + hn::PromoteTo(d16, hn::UpperHalf(d8, v31)), hn::PromoteTo(d16, hn::UpperHalf(d8, v32)), hn::PromoteTo(d16, hn::UpperHalf(d8, v33)) + ); + + // 3. The "Magic" fix: OrderedDemote2To handles the cross-lane logic for AVX2 automatically + auto result8 = hn::OrderedDemote2To(d8, res_lo, res_hi); + hn::StoreU(result8, d8, out + x); } } } -void SobelKerneli(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, - int width, int height, uint8_t threshold) { - const hn::ScalableTag d8; - const hn::Rebind> d16; - const hn::Half d8_half; - - const size_t N = hn::Lanes(d8); - // Multiply threshold by 9 BEFORE squaring to match the "no-division" math - int16_t tScaled = (int16_t)threshold * 9; - const auto threshSq = hn::Set(d16, tScaled * tScaled); - - const auto v255 = hn::Set(d16, 255); - const auto v0 = hn::Zero(d16); - for (int y = 1; y < height - 1; ++y) { - const uint8_t* r0 = in + (y - 1) * width; - const uint8_t* r1 = in + y * width; - const uint8_t* r2 = in + (y + 1) * width; - uint8_t* out = gradient + y * width + 1; - - for (int x = 0; x < width; x += N) { - auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2); - auto v21 = hn::LoadU(d8, r1 + x); auto v23 = hn::LoadU(d8, r1 + x + 2); - auto v31 = hn::LoadU(d8, r2 + x); auto v32 
= hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2); - - // LOWER HALF - { - auto p11 = hn::PromoteTo(d16, hn::LowerHalf(v11)); - auto p12 = hn::PromoteTo(d16, hn::LowerHalf(v12)); - auto p13 = hn::PromoteTo(d16, hn::LowerHalf(v13)); - auto p21 = hn::PromoteTo(d16, hn::LowerHalf(v21)); - auto p23 = hn::PromoteTo(d16, hn::LowerHalf(v23)); - auto p31 = hn::PromoteTo(d16, hn::LowerHalf(v31)); - auto p32 = hn::PromoteTo(d16, hn::LowerHalf(v32)); - auto p33 = hn::PromoteTo(d16, hn::LowerHalf(v33)); - - auto sx = hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), - hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))); - auto sy = hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), - hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))); - - // Removed MulHigh (division). Math is now: (sx*sx + sy*sy) > (threshold*9)^2 - auto mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy)); - auto mask = hn::Gt(mag, threshSq); - auto res_lo = hn::DemoteTo(d8_half, hn::IfThenElse(mask, v255, v0)); - - // UPPER HALF - auto u11 = hn::PromoteTo(d16, hn::UpperHalf(d8, v11)); - auto u12 = hn::PromoteTo(d16, hn::UpperHalf(d8, v12)); - auto u13 = hn::PromoteTo(d16, hn::UpperHalf(d8, v13)); - auto u21 = hn::PromoteTo(d16, hn::UpperHalf(d8, v21)); - auto u23 = hn::PromoteTo(d16, hn::UpperHalf(d8, v23)); - auto u31 = hn::PromoteTo(d16, hn::UpperHalf(d8, v31)); - auto u32 = hn::PromoteTo(d16, hn::UpperHalf(d8, v32)); - auto u33 = hn::PromoteTo(d16, hn::UpperHalf(d8, v33)); - - auto sx_u = hn::Sub(hn::Add(hn::Add(u11, u31), hn::Add(u21, u21)), - hn::Add(hn::Add(u13, u33), hn::Add(u23, u23))); - auto sy_u = hn::Sub(hn::Add(hn::Add(u11, u13), hn::Add(u12, u12)), - hn::Add(hn::Add(u31, u33), hn::Add(u32, u32))); - - auto mag_u = hn::Add(hn::Mul(sx_u, sx_u), hn::Mul(sy_u, sy_u)); - auto mask_u = hn::Gt(mag_u, threshSq); - auto res_hi = hn::DemoteTo(d8_half, hn::IfThenElse(mask_u, v255, v0)); - - hn::StoreU(hn::Combine(d8, res_hi, res_lo), d8, out + x); - } - } - } -} } // namespace HWY_NAMESPACE } 
// namespace ndb HWY_AFTER_NAMESPACE(); namespace ndb { namespace testing { +#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) { ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold); } +#endif } } diff --git a/lib/gpc/kernels/utils.cpp b/lib/gpc/kernels/utils.cpp index dd5d146..796c2b1 100644 --- a/lib/gpc/kernels/utils.cpp +++ b/lib/gpc/kernels/utils.cpp @@ -30,6 +30,7 @@ // Code Author: Niklaus Bamert (bamertn@ethz.ch) #include #include +#include using namespace std; From 8f654841f0cf45e1eeeb4e0c97995051aa6203fb Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 22 Feb 2026 15:04:26 +0100 Subject: [PATCH 24/36] static dispatch --- CMakeLists.txt | 34 ++++++----------------- benchmarks/sobel_legacy_bench.cpp | 4 ++- lib/gpc/kernels/box_hwy.cpp | 45 ++++++++++++++----------------- lib/gpc/kernels/sobel.cpp | 8 +++--- lib/gpc/kernels/sobel.hpp | 6 +++++ lib/gpc/kernels/sobel_hwy.cpp | 43 +++++++++++++---------------- lib/gpc/kernels/utils.cpp | 5 ++-- lib/gpc/kernels/utils.hpp | 4 +-- samples/CMakeLists.txt | 4 +++ tests/test_kernel_box.cpp | 5 ++-- tests/test_kernel_sobel.cpp | 5 ++-- 11 files changed, 73 insertions(+), 90 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 629477a..dfcdd61 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,6 +1,7 @@ cmake_minimum_required(VERSION 3.10) include(CheckCXXCompilerFlag) include(CheckCXXSourceRuns) +include(CMakePushCheckState) project(openGPC CXX) set (REQ_CPP11_FEATURES cxx_strong_enums cxx_auto_type) if(NOT CMAKE_BUILD_TYPE) @@ -19,32 +20,6 @@ find_package(Threads REQUIRED) include_directories(${EIGEN3_INCLUDE_DIR}) include_directories(${PNG_INCLUDE_DIRS}) include_directories(lib) - -#By default, use SSE intrinsics -option(SSE "Enable SSE/AVX optimizations if available" ON) - -add_compile_options(-O3 -funroll-loops -flto) -if(APPLE AND CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64") - 
add_compile_options(-mcpu=apple-m1) -elseif(NOT MSVC) - add_compile_options(-march=native) -endif() -if(SSE) - message(STATUS "Checking if target CPU supports AVX2 instructions...") - check_cxx_source_runs(" - #include - int main() { - __m256i x = _mm256_set1_epi32(1); - return _mm256_extract_epi32(x, 0); - } - " CPU_HAS_AVX2) - - if(CPU_HAS_AVX2) - message(STATUS "AVX2: supported and enabled") - add_compile_definitions(_INTRINSICS_SSE) - add_compile_options(-mavx2 -march=core-avx2) - endif() -endif() include(FetchContent) set(HWY_ENABLE_TESTS OFF CACHE BOOL "Disable Highway tests" FORCE) set(HWY_ENABLE_EXAMPLES OFF CACHE BOOL "Disable Highway examples" FORCE) @@ -78,6 +53,13 @@ add_library(gpc_core lib/gpc/kernels/box_hwy.cpp lib/gpc/kernels/sobel_hwy.cpp ) +if(MSVC) + target_compile_options(gpc_core PUBLIC /arch:AVX2) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64") + target_compile_options(gpc_core PUBLIC -march=native) +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm64|aarch64") + target_compile_options(gpc_core PUBLIC -mcpu=native) +endif() target_link_libraries(gpc_core PUBLIC Eigen3::Eigen diff --git a/benchmarks/sobel_legacy_bench.cpp b/benchmarks/sobel_legacy_bench.cpp index d00c2c6..7ffb183 100644 --- a/benchmarks/sobel_legacy_bench.cpp +++ b/benchmarks/sobel_legacy_bench.cpp @@ -7,7 +7,9 @@ static void BM_SobelLegacy(benchmark::State& state) { std::vector out(w * h, 0); for (auto _ : state) { - ndb::sobel(in.data(), out.data(), w, h, 50, 1); + //ndb::sobel(in.data(), out.data(), w, h, 50, 1); + //ndb::sobelSSE(in.data(), out.data(), w, 1, h - 1, 1); + ndb::sobelNaive(in.data(), out.data(), w, h, 1); // Ensure the compiler doesn't skip the work benchmark::DoNotOptimize(out.data()); diff --git a/lib/gpc/kernels/box_hwy.cpp b/lib/gpc/kernels/box_hwy.cpp index 7cdd983..3cc2736 100644 --- a/lib/gpc/kernels/box_hwy.cpp +++ b/lib/gpc/kernels/box_hwy.cpp @@ -9,8 +9,8 @@ namespace hn = hwy::HWY_NAMESPACE; void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* 
HWY_RESTRICT blurred, int width, int height) { const hn::ScalableTag d8; - // We need d16 to be the "Promoted" version of the half-width d8 to stay lane-consistent - const hn::Rebind> d16; + const hn::Half d8_h; + const hn::Rebind d16; const size_t N = hn::Lanes(d8); const auto divisor = hn::Set(d16, (uint16_t)7282); @@ -25,31 +25,25 @@ void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, in uint8_t* out1 = blurred + (y + 1) * width + 1; for (int x = 0; x < width; x += N) { - auto v0_0 = hn::LoadU(d8, r0 + x); auto v0_1 = hn::LoadU(d8, r0 + x + 1); auto v0_2 = hn::LoadU(d8, r0 + x + 2); - auto v1_0 = hn::LoadU(d8, r1 + x); auto v1_1 = hn::LoadU(d8, r1 + x + 1); auto v1_2 = hn::LoadU(d8, r1 + x + 2); - auto v2_0 = hn::LoadU(d8, r2 + x); auto v2_1 = hn::LoadU(d8, r2 + x + 1); auto v2_2 = hn::LoadU(d8, r2 + x + 2); - auto v3_0 = hn::LoadU(d8, r3 + x); auto v3_1 = hn::LoadU(d8, r3 + x + 1); auto v3_2 = hn::LoadU(d8, r3 + x + 2); + auto v00 = hn::LoadU(d8, r0+x); auto v01 = hn::LoadU(d8, r0+x+1); auto v02 = hn::LoadU(d8, r0+x+2); + auto v10 = hn::LoadU(d8, r1+x); auto v11 = hn::LoadU(d8, r1+x+1); auto v12 = hn::LoadU(d8, r1+x+2); + auto v20 = hn::LoadU(d8, r2+x); auto v21 = hn::LoadU(d8, r2+x+1); auto v22 = hn::LoadU(d8, r2+x+2); + auto v30 = hn::LoadU(d8, r3+x); auto v31 = hn::LoadU(d8, r3+x+1); auto v32 = hn::LoadU(d8, r3+x+2); - // Helper to sum 3 promoted pixels - auto sum3 = [&](auto v0, auto v1, auto v2) { - return hn::Add(v1, hn::Add(v0, v2)); - }; - - // LOWER HALF - auto s1_lo = sum3(hn::PromoteTo(d16, hn::LowerHalf(v1_0)), hn::PromoteTo(d16, hn::LowerHalf(v1_1)), hn::PromoteTo(d16, hn::LowerHalf(v1_2))); - auto s2_lo = sum3(hn::PromoteTo(d16, hn::LowerHalf(v2_0)), hn::PromoteTo(d16, hn::LowerHalf(v2_1)), hn::PromoteTo(d16, hn::LowerHalf(v2_2))); + // Lower Half Math + auto s1_lo = hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v11)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v10)), hn::PromoteTo(d16, hn::LowerHalf(v12)))); + auto s2_lo = 
hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v21)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v20)), hn::PromoteTo(d16, hn::LowerHalf(v22)))); - auto row0_lo = hn::Add(sum3(hn::PromoteTo(d16, hn::LowerHalf(v0_0)), hn::PromoteTo(d16, hn::LowerHalf(v0_1)), hn::PromoteTo(d16, hn::LowerHalf(v0_2))), hn::Add(s1_lo, s2_lo)); - auto row1_lo = hn::Add(sum3(hn::PromoteTo(d16, hn::LowerHalf(v3_0)), hn::PromoteTo(d16, hn::LowerHalf(v3_1)), hn::PromoteTo(d16, hn::LowerHalf(v3_2))), hn::Add(s1_lo, s2_lo)); + auto row0_lo = hn::Add(hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v01)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v00)), hn::PromoteTo(d16, hn::LowerHalf(v02)))), hn::Add(s1_lo, s2_lo)); + auto row1_lo = hn::Add(hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v30)), hn::PromoteTo(d16, hn::LowerHalf(v32)))), hn::Add(s1_lo, s2_lo)); - // UPPER HALF - auto s1_hi = sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v1_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v1_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v1_2))); - auto s2_hi = sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v2_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v2_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v2_2))); + // Upper Half Math + auto s1_hi = hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v10)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)))); + auto s2_hi = hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v20)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v22)))); - auto row0_hi = hn::Add(sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v0_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v0_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v0_2))), hn::Add(s1_hi, s2_hi)); - auto row1_hi = hn::Add(sum3(hn::PromoteTo(d16, hn::UpperHalf(d8, v3_0)), hn::PromoteTo(d16, hn::UpperHalf(d8, v3_1)), hn::PromoteTo(d16, hn::UpperHalf(d8, v3_2))), hn::Add(s1_hi, s2_hi)); + auto row0_hi = hn::Add(hn::Add(hn::PromoteTo(d16, 
hn::UpperHalf(d8_h, v01)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v00)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v02)))), hn::Add(s1_hi, s2_hi)); + auto row1_hi = hn::Add(hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v30)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)))), hn::Add(s1_hi, s2_hi)); - // Perform normalization and store using OrderedDemote2To hn::StoreU(hn::OrderedDemote2To(d8, hn::MulHigh(row0_lo, divisor), hn::MulHigh(row0_hi, divisor)), d8, out0 + x); hn::StoreU(hn::OrderedDemote2To(d8, hn::MulHigh(row1_lo, divisor), hn::MulHigh(row1_hi, divisor)), d8, out1 + x); } @@ -61,10 +55,11 @@ HWY_AFTER_NAMESPACE(); namespace ndb { namespace testing { -#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON +//#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) { - ndb::N_NEON::BoxKernel(in, blurred, width, height); + //ndb::N_NEON::BoxKernel(in, blurred, width, height); + HWY_STATIC_DISPATCH(BoxKernel)(in, blurred, width, height); } -#endif +//#endif } } diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp index dc6e46b..2817622 100644 --- a/lib/gpc/kernels/sobel.cpp +++ b/lib/gpc/kernels/sobel.cpp @@ -30,6 +30,7 @@ // Code Author: Niklaus Bamert (bamertn@ethz.ch) #include #include "gpc/kernels/sobel.hpp" +#include "gpc/kernels/utils.hpp" namespace ndb { namespace testing { void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold); @@ -77,13 +78,10 @@ void sobelNaive( } } } -#ifdef _INTRINSICS_SSE +//#ifdef _INTRINSICS_SSE +#if HWY_TARGET == HWY_AVX2 #include -// Assuming your helper macros/inline funcs are defined elsewhere -// pack16to8(lo, hi, res) -// unpack8to16(in, lo, hi) - void sobelSSE(const uint8_t* in, uint8_t* blurred, int width, int start, int end, uint8_t threshold) { diff --git a/lib/gpc/kernels/sobel.hpp b/lib/gpc/kernels/sobel.hpp index 31749cb..c14b950 100644 --- 
a/lib/gpc/kernels/sobel.hpp +++ b/lib/gpc/kernels/sobel.hpp @@ -34,6 +34,12 @@ #include "gpc/buffer.hpp" namespace ndb { +#if HWY_TARGET == HWY_AVX2 +void sobelSSE(const uint8_t* in, uint8_t* blurred, + int width, int start, int end, + uint8_t threshold); + +#endif /** * @brief Naive 3x3 sobel filter implementation * diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp index 97abee3..24afc32 100644 --- a/lib/gpc/kernels/sobel_hwy.cpp +++ b/lib/gpc/kernels/sobel_hwy.cpp @@ -6,12 +6,13 @@ namespace ndb { namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; + void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, int width, int height, uint8_t threshold) { const hn::ScalableTag d8; - // d16 will have half the lanes of d8, regardless of whether N=16 (NEON) or N=32 (AVX2) - const hn::Rebind> d16; - + const hn::Half d8_h; + const hn::Rebind d16; + const size_t N = hn::Lanes(d8); const auto divisor = hn::Set(d16, (int16_t)7282); const auto threshSq = hn::Set(d16, (int16_t)(threshold * threshold)); @@ -25,56 +26,48 @@ void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, uint8_t* out = gradient + y * width + 1; for (int x = 0; x < width; x += N) { - // Load full vectors (128-bit on NEON, 256-bit on AVX2) auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2); auto v21 = hn::LoadU(d8, r1 + x); auto v23 = hn::LoadU(d8, r1 + x + 2); auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2); - // Helper lambda to process 8-bit to 16-bit math for a specific half - auto process_half = [&](auto p11, auto p12, auto p13, auto p21, auto p23, auto p31, auto p32, auto p33) { + auto process = [&](auto p11, auto p12, auto p13, auto p21, auto p23, auto p31, auto p32, auto p33) { auto sx = hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))); sx = 
hn::MulHigh(sx, divisor); - auto sy = hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))); sy = hn::MulHigh(sy, divisor); - auto mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy)); return hn::IfThenElse(hn::Gt(mag, threshSq), v255, v0); }; - // 1. Promote and process Lower Half - auto res_lo = process_half( + // Process Lower Half + auto res_lo = process( hn::PromoteTo(d16, hn::LowerHalf(v11)), hn::PromoteTo(d16, hn::LowerHalf(v12)), hn::PromoteTo(d16, hn::LowerHalf(v13)), hn::PromoteTo(d16, hn::LowerHalf(v21)), hn::PromoteTo(d16, hn::LowerHalf(v23)), - hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::PromoteTo(d16, hn::LowerHalf(v32)), hn::PromoteTo(d16, hn::LowerHalf(v33)) - ); + hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::PromoteTo(d16, hn::LowerHalf(v32)), hn::PromoteTo(d16, hn::LowerHalf(v33))); - // 2. Promote and process Upper Half - auto res_hi = process_half( - hn::PromoteTo(d16, hn::UpperHalf(d8, v11)), hn::PromoteTo(d16, hn::UpperHalf(d8, v12)), hn::PromoteTo(d16, hn::UpperHalf(d8, v13)), - hn::PromoteTo(d16, hn::UpperHalf(d8, v21)), hn::PromoteTo(d16, hn::UpperHalf(d8, v23)), - hn::PromoteTo(d16, hn::UpperHalf(d8, v31)), hn::PromoteTo(d16, hn::UpperHalf(d8, v32)), hn::PromoteTo(d16, hn::UpperHalf(d8, v33)) - ); + // Process Upper Half - Using correct d8_h tag + auto res_hi = process( + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v13)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v23)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v33))); - // 3. 
The "Magic" fix: OrderedDemote2To handles the cross-lane logic for AVX2 automatically - auto result8 = hn::OrderedDemote2To(d8, res_lo, res_hi); - hn::StoreU(result8, d8, out + x); + hn::StoreU(hn::OrderedDemote2To(d8, res_lo, res_hi), d8, out + x); } } } - } // namespace HWY_NAMESPACE } // namespace ndb HWY_AFTER_NAMESPACE(); namespace ndb { namespace testing { -#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON +//#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) { - ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold); + //ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold); + HWY_STATIC_DISPATCH(SobelKernel)(in, blurred, width, height, threshold); } -#endif +//#endif } } diff --git a/lib/gpc/kernels/utils.cpp b/lib/gpc/kernels/utils.cpp index 796c2b1..ce920e8 100644 --- a/lib/gpc/kernels/utils.cpp +++ b/lib/gpc/kernels/utils.cpp @@ -31,6 +31,7 @@ #include #include #include +#include "gpc/kernels/utils.hpp" using namespace std; @@ -39,7 +40,7 @@ void arr2ind(const unsigned char* a, int n, int* ind, int* m) { -#ifdef _INTRINSICS_SSE +#if HWY_TARGET == HWY_AVX2 int i, m0, k; __m256i msk; m0 = 0; @@ -69,7 +70,7 @@ void arr2ind(const unsigned char* a, *m = nnz; #endif } -#ifdef _INTRINSICS_SSE +#if HWY_TARGET == HWY_AVX2 void unpack8to16(const __m128i x, __m128i& y0, __m128i& y1) { __m128i zero = _mm_setzero_si128(); y0 = _mm_unpacklo_epi8(x, zero); diff --git a/lib/gpc/kernels/utils.hpp b/lib/gpc/kernels/utils.hpp index e9ce569..b2b0c2a 100644 --- a/lib/gpc/kernels/utils.hpp +++ b/lib/gpc/kernels/utils.hpp @@ -37,7 +37,7 @@ #include "gpc/buffer.hpp" using namespace std; -#ifdef _INTRINSICS_SSE +#if HWY_TARGET == HWY_AVX2 #include // greater and lesser than simd ops for unsigned 8bit integer (epu8) #define _mm_cmpgt_epu8(v0, v1) \ @@ -63,7 +63,7 @@ void arr2ind(const unsigned char* a, int* ind, int* m); -#ifdef _INTRINSICS_SSE +#if HWY_TARGET == HWY_AVX2 /** 
* @brief Unpacks 16x8bit from a 128bit simd var into 2x128bit vars * (8x16bit) diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt index d5be853..14f4a2c 100644 --- a/samples/CMakeLists.txt +++ b/samples/CMakeLists.txt @@ -7,3 +7,7 @@ target_link_libraries(train gpc_core) add_executable(sparsematch sparsematch.cpp) target_link_libraries(sparsematch gpc_core) +add_executable(target target.cpp) +target_link_libraries(target gpc_core) + + diff --git a/tests/test_kernel_box.cpp b/tests/test_kernel_box.cpp index 8fff2ce..9772ecd 100644 --- a/tests/test_kernel_box.cpp +++ b/tests/test_kernel_box.cpp @@ -24,10 +24,11 @@ TEST(Approval, BoxKernel) { // 4. Run Highway version (only if compiled for the target) #if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON - ndb::N_NEON::BoxFilter(input.data(), outHighway.data(), width, height); + ndb::BoxFilter(input.data(), outHighway.data(), width, height); #else + ndb::boxNaive(input.data(), outNaive.data(), width, height); // Fallback if the specific NEON namespace isn't exposed - ndb::testing::box_hwy(input.data(), outHighway.data(), width, height); + //ndb::testing::box_hwy(input.data(), outHighway.data(), width, height); #endif diff --git a/tests/test_kernel_sobel.cpp b/tests/test_kernel_sobel.cpp index fd5b30d..3d4b7d0 100644 --- a/tests/test_kernel_sobel.cpp +++ b/tests/test_kernel_sobel.cpp @@ -24,10 +24,11 @@ TEST(Approval, SobelKernel) { // 4. 
Run Highway version (only if compiled for the target) #if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON - ndb::N_NEON::BoxFilter(input.data(), outHighway.data(), width, height); + ndb::BoxFilter(input.data(), outHighway.data(), width, height); #else // Fallback if the specific NEON namespace isn't exposed - ndb::testing::sobel_hwy(input.data(), outHighway.data(), width, height, 30); + //ndb::testing::sobel_hwy(input.data(), outHighway.data(), width, height, 30); + ndb::sobelNaive(input.data(), outHighway.data(), width, height, 30); #endif From a9443c0b7ce59530d9a8313c9c97461218de5a3d Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 22 Feb 2026 15:46:39 +0100 Subject: [PATCH 25/36] update hwy sobel filter to be pixel accurate with naive version (although inefficient) --- lib/gpc/kernels/sobel_hwy.cpp | 118 +++++++++++++++++++++++++++++++++- lib/gpc/kernels/utils.hpp | 1 + tests/test_kernel_box.cpp | 8 +-- tests/test_kernel_sobel.cpp | 14 ++-- 4 files changed, 123 insertions(+), 18 deletions(-) diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp index 24afc32..35d0b72 100644 --- a/lib/gpc/kernels/sobel_hwy.cpp +++ b/lib/gpc/kernels/sobel_hwy.cpp @@ -5,10 +5,126 @@ HWY_BEFORE_NAMESPACE(); namespace ndb { namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; +void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, + int width, int height, uint8_t threshold) { + const hn::ScalableTag d8; + const hn::Half d8_h; + const hn::Rebind d16; + // d32 has half the lanes of d16 + const hn::Rebind> d32; + + const size_t N = hn::Lanes(d8); + const auto vDivMult = hn::Set(d16, (int16_t)7282); + const auto vThreshSq = hn::Set(d32, (int32_t)threshold * threshold); + const auto v255_16 = hn::Set(d16, (int16_t)255); + const auto v255_8 = hn::Set(d8, (uint8_t)255); + const auto v0_8 = hn::Zero(d8); + + for (int y = 1; y < height - 1; ++y) { + const uint8_t* r0 = in + (y - 1) * width; + const uint8_t* r1 = in + y * width; + 
const uint8_t* r2 = in + (y + 1) * width; + uint8_t* out = gradient + y * width + 1; + + for (int x = 0; x < width; x += N) { + auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2); + auto v21 = hn::LoadU(d8, r1 + x); auto v23 = hn::LoadU(d8, r1 + x + 2); + auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2); + + // Helper to process 8 pixels into a 16-bit mask-like result + auto process_half = [&](auto p11, auto p12, auto p13, auto p21, auto p23, auto p31, auto p32, auto p33) { + // Sobel derivatives in 16-bit + auto sx16 = hn::MulHigh(hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), + hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))), vDivMult); + auto sy16 = hn::MulHigh(hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), + hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))), vDivMult); + + // Magnitude squared in 32-bit + auto sx_lo = hn::PromoteLowerTo(d32, sx16); + auto sy_lo = hn::PromoteLowerTo(d32, sy16); + auto mag_lo = hn::Add(hn::Mul(sx_lo, sx_lo), hn::Mul(sy_lo, sy_lo)); + + auto sx_hi = hn::PromoteUpperTo(d32, sx16); + auto sy_hi = hn::PromoteUpperTo(d32, sy16); + auto mag_hi = hn::Add(hn::Mul(sx_hi, sx_hi), hn::Mul(sy_hi, sy_hi)); + + // Comparison in 32-bit, returning 16-bit values (0 or 255) to avoid mask issues + auto m_lo = hn::IfThenElse(hn::Gt(mag_lo, vThreshSq), hn::Set(d32, 255), hn::Zero(d32)); + auto m_hi = hn::IfThenElse(hn::Gt(mag_hi, vThreshSq), hn::Set(d32, 255), hn::Zero(d32)); + + return hn::OrderedDemote2To(d16, m_lo, m_hi); + }; + // Process halves using standard Highway promotion + auto res_lo = process_half( + hn::PromoteLowerTo(d16, v11), hn::PromoteLowerTo(d16, v12), hn::PromoteLowerTo(d16, v13), + hn::PromoteLowerTo(d16, v21), hn::PromoteLowerTo(d16, v23), + hn::PromoteLowerTo(d16, v31), hn::PromoteLowerTo(d16, v32), hn::PromoteLowerTo(d16, v33)); + auto res_hi = process_half( + hn::PromoteUpperTo(d16, v11), 
hn::PromoteUpperTo(d16, v12), hn::PromoteUpperTo(d16, v13), + hn::PromoteUpperTo(d16, v21), hn::PromoteUpperTo(d16, v23), + hn::PromoteUpperTo(d16, v31), hn::PromoteUpperTo(d16, v32), hn::PromoteUpperTo(d16, v33)); + + // Final store: 16-bit to 8-bit demotion + auto final_val = hn::OrderedDemote2To(d8, res_lo, res_hi); + hn::StoreU(final_val, d8, out + x); + } + } +} void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, int width, int height, uint8_t threshold) { + // We target 4 pixels at a time as our base 'Scalable' unit. + // This allows easy promotion from 8 -> 16 -> 32 bit while keeping lane counts identical. + const hn::FixedTag d8; + const hn::FixedTag d16; + const hn::FixedTag d32; + + const auto vDiv = hn::Set(d32, 9); + const auto vThreshSq = hn::Set(d32, (int32_t)threshold * threshold); + const auto v255 = hn::Set(d32, 255); + const auto v0 = hn::Zero(d32); + + for (int y = 1; y < height - 1; ++y) { + const uint8_t* r0 = in + (y - 1) * width; + const uint8_t* r1 = in + y * width; + const uint8_t* r2 = in + (y + 1) * width; + uint8_t* out = gradient + y * width + 1; + + for (int x = 0; x < width; x += 4) { + // Load and promote immediately to 32-bit to match naive 'int' math + auto load32 = [&](const uint8_t* p) { + return hn::PromoteTo(d32, hn::PromoteTo(d16, hn::LoadU(d8, p))); + }; + + auto p11 = load32(r0 + x); auto p12 = load32(r0 + x + 1); auto p13 = load32(r0 + x + 2); + auto p21 = load32(r1 + x); auto p23 = load32(r1 + x + 2); + auto p31 = load32(r2 + x); auto p32 = load32(r2 + x + 1); auto p33 = load32(r2 + x + 2); + + // Note:: Division is very slow - we use it for now to match exactly with the naive non simd-implementation + // sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9; + auto sx = hn::Div(hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), + hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))), vDiv); + + // sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9; + auto sy = 
hn::Div(hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), + hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))), vDiv); + + // int val = sx * sx + sy * sy; + auto magSq = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy)); + + // *optr = val > thresholdSq ? 255 : 0; + auto mask = hn::Gt(magSq, vThreshSq); + auto res32 = hn::IfThenElse(mask, v255, v0); + + // Demote 32 -> 16 -> 8 + auto res8 = hn::DemoteTo(d8, hn::DemoteTo(d16, res32)); + hn::StoreU(res8, d8, out + x); + } + } +} +void SobelKerneli(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, + int width, int height, uint8_t threshold) { const hn::ScalableTag d8; const hn::Half d8_h; const hn::Rebind d16; @@ -47,7 +163,7 @@ void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, hn::PromoteTo(d16, hn::LowerHalf(v21)), hn::PromoteTo(d16, hn::LowerHalf(v23)), hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::PromoteTo(d16, hn::LowerHalf(v32)), hn::PromoteTo(d16, hn::LowerHalf(v33))); - // Process Upper Half - Using correct d8_h tag + // Process Upper Half auto res_hi = process( hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v13)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v23)), diff --git a/lib/gpc/kernels/utils.hpp b/lib/gpc/kernels/utils.hpp index b2b0c2a..18227ba 100644 --- a/lib/gpc/kernels/utils.hpp +++ b/lib/gpc/kernels/utils.hpp @@ -33,6 +33,7 @@ #include #include +#include #include "gpc/buffer.hpp" using namespace std; diff --git a/tests/test_kernel_box.cpp b/tests/test_kernel_box.cpp index 9772ecd..9913a7a 100644 --- a/tests/test_kernel_box.cpp +++ b/tests/test_kernel_box.cpp @@ -23,14 +23,8 @@ TEST(Approval, BoxKernel) { ndb::boxNaive(input.data(), outNaive.data(), width, height); // 4. 
Run Highway version (only if compiled for the target) -#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON - ndb::BoxFilter(input.data(), outHighway.data(), width, height); -#else - ndb::boxNaive(input.data(), outNaive.data(), width, height); - // Fallback if the specific NEON namespace isn't exposed - //ndb::testing::box_hwy(input.data(), outHighway.data(), width, height); + ndb::testing::box_hwy(input.data(), outHighway.data(), width, height); -#endif // 5. Compare results // We skip the border (radius) because different implementations diff --git a/tests/test_kernel_sobel.cpp b/tests/test_kernel_sobel.cpp index 3d4b7d0..bfc56ff 100644 --- a/tests/test_kernel_sobel.cpp +++ b/tests/test_kernel_sobel.cpp @@ -7,7 +7,8 @@ TEST(Approval, SobelKernel) { const int width = 640; const int height = 480; - const int radius = 2; // Typical for 5x5 box + const int radius = 2; // Typical for 5x5 bo + const int threshold = 30; // Example threshold for binarization // 1. Prepare randomized input std::vector input(width * height); @@ -20,17 +21,10 @@ TEST(Approval, SobelKernel) { std::vector outHighway(width * height, 0); // 3. Run Naive version - ndb::sobelNaive(input.data(), outNaive.data(), width, height, 30); + ndb::sobelNaive(input.data(), outNaive.data(), width, height, threshold); // 4. Run Highway version (only if compiled for the target) -#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON - ndb::BoxFilter(input.data(), outHighway.data(), width, height); -#else - // Fallback if the specific NEON namespace isn't exposed - //ndb::testing::sobel_hwy(input.data(), outHighway.data(), width, height, 30); - ndb::sobelNaive(input.data(), outHighway.data(), width, height, 30); - -#endif + ndb::testing::sobel_hwy(input.data(), outHighway.data(), width, height, threshold); // 5. 
Compare results // We skip the border (radius) because different implementations From 35063490d53aa7155006f4a950fd1773a4fef586 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Tue, 24 Feb 2026 14:55:14 +0100 Subject: [PATCH 26/36] wip dense gpt hwy kernel --- CMakeLists.txt | 1 + benchmarks/CMakeLists.txt | 11 +- benchmarks/box_bench.cpp | 56 ++++++ benchmarks/box_legacy_bench.cpp | 21 --- benchmarks/sobel_bench.cpp | 44 ++++- benchmarks/sobel_legacy_bench.cpp | 23 --- lib/gpc/forest.cpp | 6 +- lib/gpc/inference.hpp | 6 +- lib/gpc/kernels/box.cpp | 9 +- lib/gpc/kernels/box.hpp | 3 + lib/gpc/kernels/gpc.cpp | 272 ++++++++++++++++-------------- lib/gpc/kernels/gpc.hpp | 27 ++- lib/gpc/kernels/gpc_hwy.cpp | 157 +++++++++++++++++ lib/gpc/kernels/gpc_hwy.hpp | 17 ++ lib/gpc/kernels/sobel_hwy.cpp | 2 +- tests/CMakeLists.txt | 1 + tests/test_kernel_gpc.cpp | 82 +++++++++ 17 files changed, 535 insertions(+), 203 deletions(-) create mode 100644 benchmarks/box_bench.cpp delete mode 100644 benchmarks/box_legacy_bench.cpp delete mode 100644 benchmarks/sobel_legacy_bench.cpp create mode 100644 lib/gpc/kernels/gpc_hwy.cpp create mode 100644 lib/gpc/kernels/gpc_hwy.hpp create mode 100644 tests/test_kernel_gpc.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index dfcdd61..6957189 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -52,6 +52,7 @@ add_library(gpc_core lib/gpc/kernels/utils.cpp lib/gpc/kernels/box_hwy.cpp lib/gpc/kernels/sobel_hwy.cpp + lib/gpc/kernels/gpc_hwy.cpp ) if(MSVC) target_compile_options(gpc_core PUBLIC /arch:AVX2) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 0fc8bf3..9735e84 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -15,16 +15,9 @@ target_link_libraries(kernel_bench gpc_core benchmark::benchmark ) -add_executable(sobel_legacy_bench sobel_legacy_bench.cpp) +add_executable(box_bench box_bench.cpp) -target_link_libraries(sobel_legacy_bench - PRIVATE - gpc_core - benchmark::benchmark -) 
-add_executable(box_legacy_bench box_legacy_bench.cpp) - -target_link_libraries(box_legacy_bench +target_link_libraries(box_bench PRIVATE gpc_core benchmark::benchmark diff --git a/benchmarks/box_bench.cpp b/benchmarks/box_bench.cpp new file mode 100644 index 0000000..7b7ff5c --- /dev/null +++ b/benchmarks/box_bench.cpp @@ -0,0 +1,56 @@ +#include +#include +#include "gpc/kernels/box.hpp" +#include "gpc/kernels/box_hwy.hpp" +static void BM_BoxHighway(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + std::vector out(w * h, 0); + state.SetLabel(hwy::TargetName(HWY_TARGET)); + // Warmup is handled automatically by the library + for (auto _ : state) { + ndb::testing::box_hwy(in.data(), out.data(), w, h); + + // Ensure the compiler doesn't skip the work + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } +} + +#if HWY_TARGET == HWY_AVX2 +static void BM_BoxLegacySIMD(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + std::vector out(w * h, 0); + + state.SetLabel("AVX2_legacy"); + for (auto _ : state) { + ndb::boxSSE(in.data(), out.data(), w, h); + + // Ensure the compiler doesn't skip the work + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } +} +#endif +static void BM_BoxNaive(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + std::vector out(w * h, 0); + + state.SetLabel("naive"); + for (auto _ : state) { + ndb::boxNaive(in.data(), out.data(), w, h); + + // Ensure the compiler doesn't skip the work + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } +} +BENCHMARK(BM_BoxHighway)->Unit(benchmark::kMillisecond); +#if HWY_TARGET == HWY_AVX2 +BENCHMARK(BM_BoxLegacySIMD)->Unit(benchmark::kMillisecond); +#endif +BENCHMARK(BM_BoxNaive)->Unit(benchmark::kMillisecond); + +BENCHMARK_MAIN(); diff --git a/benchmarks/box_legacy_bench.cpp b/benchmarks/box_legacy_bench.cpp deleted file mode 100644 index 
c8960a3..0000000 --- a/benchmarks/box_legacy_bench.cpp +++ /dev/null @@ -1,21 +0,0 @@ -#include -#include "gpc/kernels/box.hpp" - -static void BM_BoxLegacy(benchmark::State& state) { - int w = 1920, h = 1080; - std::vector in(w * h, 128); - std::vector out(w * h, 0); - - for (auto _ : state) { - ndb::box(in.data(), out.data(), w, h, 50); - - // Ensure the compiler doesn't skip the work - benchmark::DoNotOptimize(out.data()); - benchmark::ClobberMemory(); - } - - state.SetBytesProcessed(int64_t(state.iterations()) * w * h); -} -BENCHMARK(BM_BoxLegacy)->Unit(benchmark::kMillisecond); - -BENCHMARK_MAIN(); diff --git a/benchmarks/sobel_bench.cpp b/benchmarks/sobel_bench.cpp index e0a65d1..5c26d89 100644 --- a/benchmarks/sobel_bench.cpp +++ b/benchmarks/sobel_bench.cpp @@ -1,11 +1,12 @@ #include -#include "gpc/kernels/sobel_hwy.hpp" // Your header - +#include +#include "gpc/kernels/sobel.hpp" +#include "gpc/kernels/sobel_hwy.hpp" static void BM_SobelHighway(benchmark::State& state) { int w = 1920, h = 1080; std::vector in(w * h, 128); std::vector out(w * h, 0); - + state.SetLabel(hwy::TargetName(HWY_TARGET)); // Warmup is handled automatically by the library for (auto _ : state) { ndb::testing::sobel_hwy(in.data(), out.data(), w, h, 50); @@ -14,9 +15,42 @@ static void BM_SobelHighway(benchmark::State& state) { benchmark::DoNotOptimize(out.data()); benchmark::ClobberMemory(); } - - state.SetBytesProcessed(int64_t(state.iterations()) * w * h); +} + +#if HWY_TARGET == HWY_AVX2 +static void BM_SobelLegacySIMD(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + std::vector out(w * h, 0); + + state.SetLabel("AVX2_legacy"); + for (auto _ : state) { + ndb::sobelSSE(in.data(), out.data(), w, 1, h - 1, 1); + + // Ensure the compiler doesn't skip the work + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } +} +#endif +static void BM_SobelNaive(benchmark::State& state) { + int w = 1920, h = 1080; + std::vector in(w * h, 128); + 
std::vector out(w * h, 0); + + state.SetLabel("naive"); + for (auto _ : state) { + ndb::sobelNaive(in.data(), out.data(), w, h, 1); + + // Ensure the compiler doesn't skip the work + benchmark::DoNotOptimize(out.data()); + benchmark::ClobberMemory(); + } } BENCHMARK(BM_SobelHighway)->Unit(benchmark::kMillisecond); +#if HWY_TARGET == HWY_AVX2 +BENCHMARK(BM_SobelLegacySIMD)->Unit(benchmark::kMillisecond); +#endif +BENCHMARK(BM_SobelNaive)->Unit(benchmark::kMillisecond); BENCHMARK_MAIN(); diff --git a/benchmarks/sobel_legacy_bench.cpp b/benchmarks/sobel_legacy_bench.cpp deleted file mode 100644 index 7ffb183..0000000 --- a/benchmarks/sobel_legacy_bench.cpp +++ /dev/null @@ -1,23 +0,0 @@ -#include -#include "gpc/kernels/sobel.hpp" - -static void BM_SobelLegacy(benchmark::State& state) { - int w = 1920, h = 1080; - std::vector in(w * h, 128); - std::vector out(w * h, 0); - - for (auto _ : state) { - //ndb::sobel(in.data(), out.data(), w, h, 50, 1); - //ndb::sobelSSE(in.data(), out.data(), w, 1, h - 1, 1); - ndb::sobelNaive(in.data(), out.data(), w, h, 1); - - // Ensure the compiler doesn't skip the work - benchmark::DoNotOptimize(out.data()); - benchmark::ClobberMemory(); - } - - state.SetBytesProcessed(int64_t(state.iterations()) * w * h); -} -BENCHMARK(BM_SobelLegacy)->Unit(benchmark::kMillisecond); - -BENCHMARK_MAIN(); diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp index e39eaff..6aca590 100644 --- a/lib/gpc/forest.cpp +++ b/lib/gpc/forest.cpp @@ -170,8 +170,7 @@ std::vector Forest::evalFastMaskOnSubsetSSE( fastmask.mask, idx, img.cols(), - img.rows(), - settings.numThreads_); + img.rows()); } else { ndb::gpcFilterTau(img.data(), grad.data(), @@ -180,8 +179,7 @@ std::vector Forest::evalFastMaskOnSubsetSSE( fastmask.tau, idx, img.cols(), - img.rows(), - settings.numThreads_); + img.rows()); } std::vector out(idx.size()); int j = 0; diff --git a/lib/gpc/inference.hpp b/lib/gpc/inference.hpp index 5136010..e074290 100644 --- a/lib/gpc/inference.hpp +++ 
b/lib/gpc/inference.hpp @@ -298,8 +298,7 @@ class Forest { fastmask.mask, idx, img.cols(), - img.rows(), - settings.numThreads_); + img.rows()); } else { ndb::gpcFilterTau(img.data(), grad.data(), @@ -308,8 +307,7 @@ class Forest { fastmask.tau, idx, img.cols(), - img.rows(), - settings.numThreads_); + img.rows()); } std::vector out(idx.size()); int j = 0; diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp index c9984dd..605daa2 100644 --- a/lib/gpc/kernels/box.cpp +++ b/lib/gpc/kernels/box.cpp @@ -30,6 +30,7 @@ // Code Author: Niklaus Bamert (bamertn@ethz.ch) #include "gpc/kernels/box.hpp" +#include "gpc/kernels/utils.hpp" #include namespace ndb { namespace testing { @@ -73,7 +74,7 @@ void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { } } } -#ifdef _INTRINSICS_SSE +#if HWY_TARGET == HWY_AVX2 /** * @brief SSE implementation of the 3x3 box filter. * Processed two rows at a time using fixed-point multiplication for division. @@ -168,10 +169,10 @@ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { // Force use of our new Highway kernel on Mac testing::box_hwy(in, blurred, width, height); #else - #ifndef _INTRINSICS_SSE - boxNaive(in, blurred, width, height); - #else + #if HWY_TARGET == HWY_AVX2 boxSSE(in, blurred, width, height); + #else + boxNaive(in, blurred, width, height); #endif #endif } diff --git a/lib/gpc/kernels/box.hpp b/lib/gpc/kernels/box.hpp index c5f2d0e..eef0b3d 100644 --- a/lib/gpc/kernels/box.hpp +++ b/lib/gpc/kernels/box.hpp @@ -60,6 +60,9 @@ void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height); */ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads); +#if HWY_TARGET == HWY_AVX2 +void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height); +#endif } #endif diff --git a/lib/gpc/kernels/gpc.cpp b/lib/gpc/kernels/gpc.cpp index 5e22e23..62ffa3e 100644 --- a/lib/gpc/kernels/gpc.cpp +++ b/lib/gpc/kernels/gpc.cpp @@ -79,11 +79,73 @@ void 
gpcFilterTauNaive(uint8_t* in, } -#ifdef _INTRINSICS_SSE +#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) bool isAllZeros(__m128i xmm) { return _mm_movemask_epi8(_mm_cmpeq_epi8(xmm, _mm_setzero_si128())) == 0xFFFF; } +void gpcFilterSSE(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector& idx, + int width, + int height) { + const int start = 13; + const int end = height - 15; + __m128i zero = _mm_set1_epi8(0); + __m128i one = _mm_set1_epi8(1); + for (int y = start; y < end; y++) { + for (int x = 0; x < width; x += 16) { + uint8_t* rowPtr; + rowPtr = in + (y - 2) * width + x; + __m128i out[4]; // temporary output vector of 4 128bit words + + const uint8_t* center = (in + y * width + x); + const uint8_t* centerGrad = (grad + y * width + x); + // We only process the current segment if there are any non-zero + // values (high gradient pixels) + if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) { + __m128i* dst = + (__m128i*)(gpc + y * width + + x); // Set starting point to pixel (2,2) + out[0] = zero; + out[1] = zero; + out[2] = zero; + out[3] = zero; + uint8_t k = 0; + __m128i bitMask = one; + for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { + out[k] |= _mm_and_si128( + _mm_cmpgt_epu8( + _mm_lddqu_si128( + (__m128i*)(center + fastmask[i])), + _mm_lddqu_si128( + (__m128i*)(center + fastmask[i + 1]))), + bitMask); + // Keeps index into output vector and updates bit mask + if (i % 16 == 0 && i != 0) { + bitMask = one; + k++; + } else { + bitMask += bitMask; + } + } + // 8bit to 16bit + __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]); + __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]); + __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]); + __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]); + + // 16bit to 32bit ints + _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); + _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); + _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); + _mm_storeu_si128(dst 
+ 3, _mm_unpackhi_epi16(low2, high2)); + } + } // col iteration + } // row iteration +} #endif void gpcFilter(uint8_t* in, const uint8_t* grad, @@ -91,73 +153,89 @@ void gpcFilter(uint8_t* in, std::vector fastmask, std::vector& idx, int width, - int height, - int numThreads) { + int height){ assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE +#if defined(__ARM_NEON) || defined(__aarch64__) + // Replace with call to highway gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height); #else - auto gpcFilterSegment = [&](int start, int end) { - __m128i zero = _mm_set1_epi8(0); - __m128i one = _mm_set1_epi8(1); - for (int y = start; y < end; y++) { - for (int x = 0; x < width; x += 16) { - uint8_t* rowPtr; - rowPtr = in + (y - 2) * width + x; - __m128i out[4]; // temporary output vector of 4 128bit words + #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) + gpcFilterSSE(in, grad, gpc, fastmask, idx, width, height); + #else + gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height); +#endif +#endif +} - const uint8_t* center = (in + y * width + x); - const uint8_t* centerGrad = (grad + y * width + x); - // We only process the current segment if there are any non-zero - // values (high gradient pixels) - if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) { - __m128i* dst = - (__m128i*)(gpc + y * width + - x); // Set starting point to pixel (2,2) - out[0] = zero; - out[1] = zero; - out[2] = zero; - out[3] = zero; - uint8_t k = 0; - __m128i bitMask = one; - for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { - out[k] |= _mm_and_si128( - _mm_cmpgt_epu8( - _mm_lddqu_si128( - (__m128i*)(center + fastmask[i])), +#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) +void gpcFilterTauSSE(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector tau, + std::vector& idx, + int width, + int height){ + const int start = 13; + const int end = height - 15; + __m128i zero = _mm_set1_epi8(0); + __m128i one = 
_mm_set1_epi8(1); + for (int y = start; y < end; y++) { + for (int x = 0; x < width; x += 16) { + uint8_t* rowPtr; + rowPtr = in + (y - 2) * width + x; + __m128i out[4]; // temporary output vector of 4 128bit words + + const uint8_t* center = (in + y * width + x); + const uint8_t* centerGrad = (grad + y * width + x); + // We only process the current segment if there are any non-zero + // values (high gradient pixels) + if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) { + __m128i* dst = + (__m128i*)(gpc + y * width + + x); // Set starting point to pixel (2,2) + out[0] = zero; + out[1] = zero; + out[2] = zero; + out[3] = zero; + uint8_t k = 0; + __m128i bitMask = one; + for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { + out[k] |= _mm_and_si128( + _mm_cmpgt_epu8( + _mm_lddqu_si128( + (__m128i*)(center + fastmask[i])), + _mm_subs_epi8( _mm_lddqu_si128( - (__m128i*)(center + fastmask[i + 1]))), - bitMask); - // Keeps index into output vector and updates bit mask - if (i % 16 == 0 && i != 0) { - bitMask = one; - k++; - } else { - bitMask += bitMask; - } + (__m128i*)(center + fastmask[i + 1])), + _mm_set1_epi8(tau[i / 2])) // deduct tau + ), + bitMask); + // Keeps index into output vector and updates bit mask + if (i % 16 == 0 && i != 0) { + bitMask = one; + k++; + } else { + bitMask += bitMask; } - // 8bit to 16bit - __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]); - __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]); - __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]); - __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]); - - // 16bit to 32bit ints - _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); - _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); - _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); - _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); } - } // col iteration - } // row iteration - }; + // 8bit to 16bit + __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]); + __m128i high2 = _mm_unpackhi_epi8(out[2], 
out[3]); + __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]); + __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]); - if (numThreads == 1) - gpcFilterSegment(13, height - 15); - else - parFor(gpcFilterSegment, 13, height - 15, 4); -#endif + // 16bit to 32bit ints + _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); + _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); + _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); + _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); + } + } // col iteration + } // row iteration } +#endif + void gpcFilterTau(uint8_t* in, const uint8_t* grad, uint32_t* gpc, @@ -165,75 +243,19 @@ void gpcFilterTau(uint8_t* in, std::vector tau, std::vector& idx, int width, - int height, - int numThreads) { + int height){ assert(width % 16 == 0 && "width must be multiple of 16!"); -#ifndef _INTRINSICS_SSE +#if defined(__ARM_NEON) || defined(__aarch64__) + // Replace with call to highway gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height); #else - auto gpcFilterSegment = [&](int start, int end) { - __m128i zero = _mm_set1_epi8(0); - __m128i one = _mm_set1_epi8(1); - for (int y = start; y < end; y++) { - for (int x = 0; x < width; x += 16) { - uint8_t* rowPtr; - rowPtr = in + (y - 2) * width + x; - __m128i out[4]; // temporary output vector of 4 128bit words - - const uint8_t* center = (in + y * width + x); - const uint8_t* centerGrad = (grad + y * width + x); - // We only process the current segment if there are any non-zero - // values (high gradient pixels) - if (!isAllZeros(_mm_lddqu_si128((__m128i*)centerGrad))) { - __m128i* dst = - (__m128i*)(gpc + y * width + - x); // Set starting point to pixel (2,2) - out[0] = zero; - out[1] = zero; - out[2] = zero; - out[3] = zero; - uint8_t k = 0; - __m128i bitMask = one; - for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { - out[k] |= _mm_and_si128( - _mm_cmpgt_epu8( - _mm_lddqu_si128( - (__m128i*)(center + fastmask[i])), - _mm_subs_epi8( - 
_mm_lddqu_si128( - (__m128i*)(center + fastmask[i + 1])), - _mm_set1_epi8(tau[i / 2])) // deduct tau - ), - bitMask); - // Keeps index into output vector and updates bit mask - if (i % 16 == 0 && i != 0) { - bitMask = one; - k++; - } else { - bitMask += bitMask; - } - } - // 8bit to 16bit - __m128i high1 = _mm_unpacklo_epi8(out[2], out[3]); - __m128i high2 = _mm_unpackhi_epi8(out[2], out[3]); - __m128i low1 = _mm_unpacklo_epi8(out[0], out[1]); - __m128i low2 = _mm_unpackhi_epi8(out[0], out[1]); - - // 16bit to 32bit ints - _mm_storeu_si128(dst, _mm_unpacklo_epi16(low1, high1)); - _mm_storeu_si128(dst + 1, _mm_unpackhi_epi16(low1, high1)); - _mm_storeu_si128(dst + 2, _mm_unpacklo_epi16(low2, high2)); - _mm_storeu_si128(dst + 3, _mm_unpackhi_epi16(low2, high2)); - } - } // col iteration - } // row iteration - }; - - if (numThreads == 1) - gpcFilterSegment(13, height - 15); - else - parFor(gpcFilterSegment, 13, height - 15, 4); + #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) + gpcFilterTauSSE(in, grad, gpc, fastmask, tau, idx, width, height); + #else + gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height); #endif +#endif + } -} +} // namespace ndb diff --git a/lib/gpc/kernels/gpc.hpp b/lib/gpc/kernels/gpc.hpp index 5f43743..49db7ae 100644 --- a/lib/gpc/kernels/gpc.hpp +++ b/lib/gpc/kernels/gpc.hpp @@ -49,7 +49,6 @@ namespace ndb { * and the call gets forwarded to the naive implementation. 
* @param width The width of the image at pointer *in * @param height The height of the image at pointer *in - * @param numThreadsNumber of threads to use */ void gpcFilter(uint8_t* in, const uint8_t* grad, @@ -57,8 +56,7 @@ void gpcFilter(uint8_t* in, std::vector fastmask, std::vector& idx, int width, - int height, - int numThreads); + int height); /** @@ -93,7 +91,6 @@ void gpcFilterNaive(uint8_t* in, * @param fastmask The fastmask containing the gpc filter * @param width The width of the image at pointer *in * @param height The height of the image at pointer *in - * @param numThreads Number of threads to use */ void gpcFilterTau(uint8_t* in, const uint8_t* grad, @@ -102,8 +99,7 @@ void gpcFilterTau(uint8_t* in, std::vector tau, std::vector& idx, int width, - int height, - int numThreads); + int height); /** * @brief Applies a gpc filter defined by the pixel-difference tests in @@ -132,8 +128,25 @@ void gpcFilterTauNaive(uint8_t* in, * * @return true if all zeros, false otherwise */ -#ifdef _INTRINSICS_SSE +#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) bool isAllZeros(__m128i xmm); +void gpcFilterTauSSE(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector tau, + std::vector& idx, + int width, + int height); +void gpcFilterSSE(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector& idx, + int width, + int height); + + #endif diff --git a/lib/gpc/kernels/gpc_hwy.cpp b/lib/gpc/kernels/gpc_hwy.cpp new file mode 100644 index 0000000..21dae2d --- /dev/null +++ b/lib/gpc/kernels/gpc_hwy.cpp @@ -0,0 +1,157 @@ +//#define HWY_TARGET HWY_NEON +#include "gpc_hwy.hpp" +HWY_BEFORE_NAMESPACE(); +namespace ndb { +namespace HWY_NAMESPACE { +namespace hn = hwy::HWY_NAMESPACE; + +//dense! 
+#include + +namespace hn = hwy::HWY_NAMESPACE; + +// Dense Version +void GPCKernel(const uint8_t* HWY_RESTRICT in, + const uint8_t* HWY_RESTRICT grad, + uint32_t* HWY_RESTRICT gpc, + const std::vector& fastmask, + const std::vector& tau, + int width, int height) { + + const hn::ScalableTag d8; + const hn::ScalableTag d32; + const size_t N = hn::Lanes(d8); + + const int border = 13; + const auto v_zero8 = hn::Zero(d8); + const auto v_one8 = hn::Set(d8, 1); + const int32_t* fm = fastmask.data(); + + for (int y = border; y < height - border; ++y) { + const int row_base = y * width; + uint32_t* HWY_RESTRICT row_out = gpc + row_base; + + for (int x = border; x <= width - border - (int)N; x += N) { + const int k = row_base + x; + + // We use four 8-bit registers to build the 32 bits. + // This keeps the entire hot-loop in 8-bit space. + auto v_acc0 = hn::Zero(d8); // Bits 0-7 + auto v_acc1 = hn::Zero(d8); // Bits 8-15 + auto v_acc2 = hn::Zero(d8); // Bits 16-23 + auto v_acc3 = hn::Zero(d8); // Bits 24-31 + + // Pass 1: Bits 0-7 + for (int i = 0; i < 16; i += 2) { + v_acc0 = hn::Add(v_acc0, v_acc0); + auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), + hn::LoadU(d8, in + k + fm[i+1])); + v_acc0 = hn::Or(v_acc0, hn::IfThenElse(mask, v_one8, v_zero8)); + } + + // Pass 2: Bits 8-15 + for (int i = 16; i < 32; i += 2) { + v_acc1 = hn::Add(v_acc1, v_acc1); + auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), + hn::LoadU(d8, in + k + fm[i+1])); + v_acc1 = hn::Or(v_acc1, hn::IfThenElse(mask, v_one8, v_zero8)); + } + + // Pass 3: Bits 16-23 + for (int i = 32; i < 48; i += 2) { + v_acc2 = hn::Add(v_acc2, v_acc2); + auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), + hn::LoadU(d8, in + k + fm[i+1])); + v_acc2 = hn::Or(v_acc2, hn::IfThenElse(mask, v_one8, v_zero8)); + } + + // Pass 4: Bits 24-31 + for (int i = 48; i < 64; i += 2) { + v_acc3 = hn::Add(v_acc3, v_acc3); + auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), + hn::LoadU(d8, in + k + fm[i+1])); + v_acc3 = hn::Or(v_acc3, 
hn::IfThenElse(mask, v_one8, v_zero8)); + } + + // Final Assembly: Promote the four 8-bit chunks into 32-bit results. + // We use PromoteUpper/Lower to widen the data. + // N is the number of 8-bit lanes. We need to store N/4 results in d32. + + // To be perfectly safe across all Highway targets, we extract and combine: + for (size_t lane = 0; lane < N; ++lane) { + uint32_t final_val = (uint32_t(hn::ExtractLane(v_acc0, lane)) << 24) | + (uint32_t(hn::ExtractLane(v_acc1, lane)) << 16) | + (uint32_t(hn::ExtractLane(v_acc2, lane)) << 8) | + (uint32_t(hn::ExtractLane(v_acc3, lane))); + row_out[x + lane] = final_val; + } + } + } +} +void GPCKerneli(const uint8_t* HWY_RESTRICT in, + const uint8_t* HWY_RESTRICT grad, + uint32_t* HWY_RESTRICT gpc, + const std::vector& fastmask, + const std::vector& tau, + int width, int height) { + // We use the ScalableTag, but we will "Narrow" our view manually + const hn::ScalableTag d32; + const hn::Rebind d8_n; // Same number of lanes as d32 + + const size_t N = hn::Lanes(d32); + const auto v_zero = hn::Zero(d32); + const bool use_tau = !tau.empty(); + + for (int y = 0; y < height; ++y) { + for (int x = 0; x < width; x += N) { + const uint8_t* centerGrad = grad + y * width + x; + + // 1. Load the gradient bytes for the current N lanes + auto v_grad = hn::LoadU(d8_n, centerGrad); + + // 2. Promotion-free zero check + if (hn::AllTrue(d8_n, hn::Eq(v_grad, hn::Zero(d8_n)))) { + continue; + } + + auto v_tmp = hn::Zero(d32); + + for (size_t i = 0; i < fastmask.size(); i += 2) { + v_tmp = hn::ShiftLeft<1>(v_tmp); + + // 3. 
The "Promotion" that actually works on all platforms: + // Promote N lanes of uint8 to N lanes of uint32 + auto v1 = hn::PromoteTo(d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i])); + auto v2 = hn::PromoteTo(d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i + 1])); + + hn::Mask mask; + if (use_tau) { + auto v_tau = hn::Set(d32, tau[i / 2]); + mask = hn::Gt(v1, hn::Sub(v2, v_tau)); + } else { + mask = hn::Gt(v1, v2); + } + + v_tmp = hn::Add(v_tmp, hn::IfThenElse(mask, hn::Set(d32, 1), v_zero)); + } + + hn::StoreU(v_tmp, d32, gpc + y * width + x); + } + } +} + +} // namespace HWY_NAMESPACE +} // namespace ndb +HWY_AFTER_NAMESPACE(); + +namespace ndb { +namespace testing { + void gpc_hwy(uint8_t* in, uint8_t* grad, uint32_t* HWY_RESTRICT gpc, + const std::vector& fastmask, + const std::vector& tau, int width, int height) { + + HWY_STATIC_DISPATCH(GPCKernel)(in, grad, gpc, fastmask, tau, width, height); + + } +} +} diff --git a/lib/gpc/kernels/gpc_hwy.hpp b/lib/gpc/kernels/gpc_hwy.hpp new file mode 100644 index 0000000..f49d05a --- /dev/null +++ b/lib/gpc/kernels/gpc_hwy.hpp @@ -0,0 +1,17 @@ +#ifndef __NDB__KERNEL_GPC_HWY +#define __NDB__KERNEL_GPC_HWY + +#include +#include + +namespace ndb { + +namespace testing { + void gpc_hwy(uint8_t* in, uint8_t* grad, uint32_t* HWY_RESTRICT gpc, const std::vector& fastmask, const std::vector& tau, int width, int height); + + +} + +} // namespace ndb + +#endif // GPC_KERNELS_SOBEL_HWY_H_ diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp index 35d0b72..d1b331f 100644 --- a/lib/gpc/kernels/sobel_hwy.cpp +++ b/lib/gpc/kernels/sobel_hwy.cpp @@ -1,4 +1,4 @@ -#define HWY_TARGET HWY_NEON +//#define HWY_TARGET HWY_NEON #include HWY_BEFORE_NAMESPACE(); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 26ef573..e14ceda 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -28,4 +28,5 @@ endfunction() add_gpc_approval_test(test_single_matching test_single_matching.cpp) 
add_gpc_approval_test(test_kernel_box test_kernel_box.cpp) add_gpc_approval_test(test_kernel_sobel test_kernel_sobel.cpp) +add_gpc_approval_test(test_kernel_gpc test_kernel_gpc.cpp) diff --git a/tests/test_kernel_gpc.cpp b/tests/test_kernel_gpc.cpp new file mode 100644 index 0000000..7989f35 --- /dev/null +++ b/tests/test_kernel_gpc.cpp @@ -0,0 +1,82 @@ +#include +#include +#include +#include "gpc/forest.hpp" +#include "gpc/kernels/gpc.hpp" // Naive version +#include "gpc/kernels/sobel.hpp" // Highway version +#include "gpc/kernels/gpc_hwy.hpp" // Highway version +#include "gpc/kernels/utils.hpp" // Highway version + +TEST(Approval, GPCKernel) { + auto file = std::filesystem::absolute(__FILE__); + auto dir = file.parent_path(); + std::filesystem::path forestPath = dir / ".." / "forests" / "defaultZeroForest.txt"; + + const int width = 640; + const int height = 480; + const int radius = 2; // Typical for 5x5 bo + const int threshold = 0; // Example threshold for binarization + + typedef gpc::inference::Forest GPCForest_t; + GPCForest_t forest; + gpc::inference::FilterMask fm = + forest.readForest(forestPath, width, height); + + // 1. Prepare randomized input + std::vector input(width * height); + std::mt19937 gen(42); + std::uniform_int_distribution<> dis(0, 255); + for (auto& val : input) val = dis(gen); + + // 2. Prepare output buffers + std::vector grad(width * height, 0); + std::vector outNaive(width * height, 0); + std::vector outHighway(width * height, 0); + + // 3. 
Prepare gradient and fastmask + ndb::sobelNaive(input.data(), grad.data(), width, height, threshold); + + // More prep + std::vector idx(grad.size()); + auto ff = [&](std::vector& in, std::vector& out, int m) { + for (int i = 0; i < m; i++) { + int x = in.data()[i] % width; + int y = in.data()[i] / width; + if (y >= 13 && y < height - 13 && x >= 13 && x < width - 13) + out.push_back(in.data()[i]); + } + }; + int m; + // mask indexing gradient pixels + std::vector fastmask; + ndb::arr2ind(grad.data(), width * height, idx.data(), &m); + ff(idx, fastmask, m); + + std::vector tau; + // 4. Run Naive version + ndb::gpcFilterNaive(input.data(), grad.data(), outNaive.data(), + fm.mask, fastmask, width, height); + /* + * fastmask.mask, fastmask.tau, idx + fastmask.mask is.. imo the extraction pattern. lets se... + it's filtermask! lol + where idx is... preprocessed.mask. WTF..what is that lol + */ + + // 5. Run Highway version + // + //ndb::gpcFilterSSE(input.data(), grad.data(), outHighway.data(), fastmask, tau, width, height); + ndb::testing::gpc_hwy(input.data(), grad.data(), outHighway.data(), + fastmask, tau, width, height); + + // 6. Compare results + // We skip the border (radius) because different implementations + // might handle edges differently. 
+ for (int y = radius; y < height - radius; ++y) { + for (int x = radius; x < width - radius; ++x) { + int idx = y * width + x; + ASSERT_EQ(outNaive[idx], outHighway[idx]) + << "Mismatch at (" << x << "," << y << ")"; + } + } +} From b1b5dee72d855c4ced4e6ab5b4c0a02f8653c965 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 8 Mar 2026 12:13:56 +0100 Subject: [PATCH 27/36] add matching kernels --- CMakeLists.txt | 10 + benchmarks/CMakeLists.txt | 7 + lib/gpc/forest.cpp | 1244 ++++++++++++++++++++++++++++++++++++- lib/gpc/forest.hpp | 108 +++- samples/sparsematch.cpp | 35 +- 5 files changed, 1374 insertions(+), 30 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6957189..57b44ae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,16 @@ FetchContent_Declare( GIT_REPOSITORY https://github.com/google/benchmark.git GIT_TAG v1.9.5 ) +# MUST go before FetchContent_MakeAvailable +set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "" FORCE) +set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE) + +# Force the library itself to build in Release mode +set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE) + +#add_definitions(-DNDEBUG) + +FetchContent_MakeAvailable(google_benchmark) FetchContent_MakeAvailable(google_benchmark) add_library(gpc_core lib/gpc/forest.cpp diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 9735e84..7942fa6 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -22,3 +22,10 @@ target_link_libraries(box_bench gpc_core benchmark::benchmark ) +add_executable(correspondence_bench correspondence_bench.cpp) + +target_link_libraries(correspondence_bench + PRIVATE + gpc_core + benchmark::benchmark +) diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp index 6aca590..876fe0b 100644 --- a/lib/gpc/forest.cpp +++ b/lib/gpc/forest.cpp @@ -32,6 +32,7 @@ // Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet // Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch) #include 
+//#include #include #include #include @@ -56,6 +57,1001 @@ namespace gpc { namespace inference { +void Forest::prepareSoAFramesPersistentSingleSlab( + std::vector& srcStates, + std::vector& tarStates, + SoAFramePersistentSingleSlab& srcFrame, + SoAFramePersistentSingleSlab& tarFrame) { + + uint32_t srcCounts[256] = {0}, tarCounts[256] = {0}; + for (const auto& s : srcStates) srcCounts[s.state & 0xFF]++; + for (const auto& t : tarStates) tarCounts[t.state & 0xFF]++; + + StateIdx* sP = srcFrame.slab.data(); + StateIdx* tP = tarFrame.slab.data(); + for (int i = 0; i < 256; ++i) { + srcFrame.bucketData[i] = sP; + srcFrame.bucketSizes[i] = srcCounts[i]; + tarFrame.bucketData[i] = tP; + tarFrame.bucketSizes[i] = tarCounts[i]; + sP += srcCounts[i]; tP += tarCounts[i]; + } + + uint32_t sW[256] = {0}, tW[256] = {0}; + for (uint32_t i = 0; i < (uint32_t)srcStates.size(); ++i) { + uint64_t sv = srcStates[i].state; + uint64_t tv = tarStates[i].state; + srcFrame.bucketData[sv & 0xFF][sW[sv & 0xFF]++] = {sv, i}; + tarFrame.bucketData[tv & 0xFF][tW[tv & 0xFF]++] = {tv, i}; + } +} +void Forest::prepareSoAFramesPersistent( + std::vector& srcStates, + std::vector& tarStates, + SoAFramePersistent& srcFrame, + SoAFramePersistent& tarFrame) { + assert(srcStates.size() == tarStates.size()); + assert(srcStates.size() <= 256 * 16384); // limit for max unique items in our table design +/* + // This is only slightly slower than the bit below. + const uint32_t BUCKET_COUNT = 256; + const uint64_t BUCKET_MASK = 0xFF; + + // 1. Histogram (To find bucket boundaries) + uint32_t srcCounts[BUCKET_COUNT] = {0}; + uint32_t tarCounts[BUCKET_COUNT] = {0}; + for (const auto& s : srcStates) srcCounts[s.state & BUCKET_MASK]++; + for (const auto& t : tarStates) tarCounts[t.state & BUCKET_MASK]++; + + // 2. 
Setup Bucket Pointers into the Slab + // We treat the slab like a custom allocator + uint64_t* srcPtr = srcFrame.statesSlab.data(); + uint32_t* srcIdxPtr = srcFrame.indicesSlab.data(); + uint64_t* tarPtr = tarFrame.statesSlab.data(); + uint32_t* tarIdxPtr = tarFrame.indicesSlab.data(); + + for (uint32_t i = 0; i < BUCKET_COUNT; ++i) { + srcFrame.bucketStates[i] = srcPtr; + srcFrame.bucketIndices[i] = srcIdxPtr; + srcFrame.bucketSizes[i] = srcCounts[i]; + + tarFrame.bucketStates[i] = tarPtr; + tarFrame.bucketIndices[i] = tarIdxPtr; + tarFrame.bucketSizes[i] = tarCounts[i]; + + srcPtr += srcCounts[i]; + srcIdxPtr += srcCounts[i]; + tarPtr += tarCounts[i]; + tarIdxPtr += tarCounts[i]; + } + + // 3. The "Pure Scatter" (No push_back, no resize, no zeroing) + uint32_t srcWriteIdx[BUCKET_COUNT] = {0}; + uint32_t tarWriteIdx[BUCKET_COUNT] = {0}; + + for (uint32_t i = 0; i < (uint32_t)srcStates.size(); ++i) { + uint64_t s = srcStates[i].state; + uint32_t b = s & BUCKET_MASK; + uint32_t pos = srcWriteIdx[b]++; + srcFrame.bucketStates[b][pos] = s; + srcFrame.bucketIndices[b][pos] = i; + } + + for (uint32_t i = 0; i < (uint32_t)tarStates.size(); ++i) { + uint64_t s = tarStates[i].state; + uint32_t b = s & BUCKET_MASK; + uint32_t pos = tarWriteIdx[b]++; + tarFrame.bucketStates[b][pos] = s; + tarFrame.bucketIndices[b][pos] = i; + } + */ + const uint32_t BUCKET_COUNT = 256; + const uint64_t BUCKET_MASK = 0xFF; + + uint32_t srcCounts[BUCKET_COUNT] = {0}; + uint32_t tarCounts[BUCKET_COUNT] = {0}; + + // 1. Fused Histogram Pass (Assuming equal sizes as per your note) + const uint32_t totalSize = (uint32_t)srcStates.size(); + for (uint32_t i = 0; i < totalSize; ++i) { + srcCounts[srcStates[i].state & BUCKET_MASK]++; + tarCounts[tarStates[i].state & BUCKET_MASK]++; + } + + // 2. 
Setup Bucket Pointers (Unchanged, this is fast) + uint64_t* sP = srcFrame.statesSlab.data(); + uint32_t* sI = srcFrame.indicesSlab.data(); + uint64_t* tP = tarFrame.statesSlab.data(); + uint32_t* tI = tarFrame.indicesSlab.data(); + + for (uint32_t i = 0; i < BUCKET_COUNT; ++i) { + srcFrame.bucketStates[i] = sP; + srcFrame.bucketIndices[i] = sI; + srcFrame.bucketSizes[i] = srcCounts[i]; + tarFrame.bucketStates[i] = tP; + tarFrame.bucketIndices[i] = tI; + tarFrame.bucketSizes[i] = tarCounts[i]; + sP += srcCounts[i]; sI += srcCounts[i]; + tP += tarCounts[i]; tI += tarCounts[i]; + } + + // 3. Optimized Fused Scatter + uint32_t srcWriteIdx[BUCKET_COUNT] = {0}; + uint32_t tarWriteIdx[BUCKET_COUNT] = {0}; + + // Unroll by 2 to keep the M3's execution ports saturated + uint32_t i = 0; + for (; i + 1 < totalSize; i += 2) { + // Source pair + uint64_t s0 = srcStates[i].state; + uint64_t s1 = srcStates[i+1].state; + uint32_t bS0 = s0 & BUCKET_MASK; + uint32_t bS1 = s1 & BUCKET_MASK; + + srcFrame.bucketStates[bS0][srcWriteIdx[bS0]++] = s0; + srcFrame.bucketIndices[bS0][srcWriteIdx[bS0]-1] = i; + srcFrame.bucketStates[bS1][srcWriteIdx[bS1]++] = s1; + srcFrame.bucketIndices[bS1][srcWriteIdx[bS1]-1] = i+1; + + // Target pair + uint64_t t0 = tarStates[i].state; + uint64_t t1 = tarStates[i+1].state; + uint32_t bT0 = t0 & BUCKET_MASK; + uint32_t bT1 = t1 & BUCKET_MASK; + + tarFrame.bucketStates[bT0][tarWriteIdx[bT0]++] = t0; + tarFrame.bucketIndices[bT0][tarWriteIdx[bT0]-1] = i; + tarFrame.bucketStates[bT1][tarWriteIdx[bT1]++] = t1; + tarFrame.bucketIndices[bT1][tarWriteIdx[bT1]-1] = i+1; + } + + // Handle remainder + for (; i < totalSize; ++i) { + uint64_t s = srcStates[i].state; + uint32_t bS = s & BUCKET_MASK; + srcFrame.bucketStates[bS][srcWriteIdx[bS]++] = s; + srcFrame.bucketIndices[bS][srcWriteIdx[bS]-1] = i; + + uint64_t t = tarStates[i].state; + uint32_t bT = t & BUCKET_MASK; + tarFrame.bucketStates[bT][tarWriteIdx[bT]++] = t; + tarFrame.bucketIndices[bT][tarWriteIdx[bT]-1] 
= i; + } +} + + // Here we did allocation within the prepare. we can move that part out +std::pair Forest::prepareSoAFrames( + std::vector& srcStates, + std::vector& tarStates) { + SoAFrame srcFrame, tarFrame; + srcFrame.reserve(srcStates.size()); + tarFrame.reserve(tarStates.size()); + + const uint64_t MASK = 0xFF; + + // Distribute into buckets based on the last 8 bits of the state + for (uint32_t i = 0; i < srcStates.size(); ++i) { + uint64_t s = srcStates[i].state; + srcFrame.states[s & MASK].push_back(s); + srcFrame.indices[s & MASK].push_back(i); + } + + for (uint32_t i = 0; i < tarStates.size(); ++i) { + uint64_t s = tarStates[i].state; + tarFrame.states[s & MASK].push_back(s); + tarFrame.indices[s & MASK].push_back(i); + } + + return {srcFrame, tarFrame}; +} +void Forest::matchPipelinedBranchlessPreallocateSingleSlab( + SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar, + std::vector& outS, std::vector& outT) { + + struct Slot { + uint64_t key; // The 64-bit Descriptor/State ID + uint32_t idx; // The original global index in the Source array + uint32_t gen; // The "Generation" ID (replaces memset/clear) + uint32_t count; // The match state (0=empty, 1=unique, >1=dup, 0xFF..=matched) + }; + static std::vector table(16384, {0, 0, 0, 0}); + static uint32_t currentGen = 1; + + for (int b = 0; b < 256; ++b) { + StateIdx* sData = src.bucketData[b]; + uint32_t sSize = src.bucketSizes[b]; + if (sSize == 0) continue; + + const uint32_t mask = (sSize < 1000) ? 2047 : 16383; + const uint32_t shift = (sSize < 1000) ? 
53 : 50; + currentGen++; + + for (uint32_t i = 0; i < sSize; ++i) { + uint64_t k = sData[i].state; + uint32_t h = (k * 11400714819323198485llu) >> shift; + h &= mask; + while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask; + if (table[h].gen != currentGen) table[h] = {k, sData[i].index, currentGen, 1}; + else table[h].count++; + } + + StateIdx* tData = tar.bucketData[b]; + uint32_t tSize = tar.bucketSizes[b]; + for (uint32_t i = 0; i < tSize; ++i) { + uint64_t k = tData[i].state; + uint32_t h = (k * 11400714819323198485llu) >> shift; + h &= mask; + while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask; + + if (table[h].gen == currentGen && table[h].key == k) { + if (table[h].count == 1) { + outS.push_back(table[h].idx); + outT.push_back(tData[i].index); + table[h].count = 0xFFFFFFFF; + } else if (table[h].count == 0xFFFFFFFF) { + outS.pop_back(); outT.pop_back(); + table[h].count = 0xEEEEEEEE; + } + } + } + } +} +/* +std::pair, std::vector> Forest::matchAdaptiveNeon( + SoAFrame& src, + SoAFrame& tar) { + + std::pair, std::vector> result; + result.first.reserve(10000); + result.second.reserve(10000); + + // Slot is exactly 32 bytes. 2 Slots = 64 bytes (1 Cache Line). + struct alignas(16) Slot { + uint64_t key; + uint32_t idx; + uint32_t gen; + uint32_t count; + uint32_t padding; + }; + + static uint32_t currentGen = 1; + static std::vector table(8192, {0, 0, 0, 0, 0}); + + for (int b = 0; b < 256; ++b) { + const auto& sStates = src.states[b]; + const auto& sIdxs = src.indices[b]; + if (sStates.empty()) continue; + + const uint32_t mask = (sStates.size() < 500) ? 
1023 : 8191; + currentGen++; + + // --- PART 1: SOURCE FILL (Keep Scalar as it's usually not the bottleneck) --- + for (size_t i = 0; i < sStates.size(); ++i) { + uint64_t k = sStates[i]; + uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); + h &= mask; + + while (table[h].gen == currentGen && table[h].key != k) { + h = (h + 1) & mask; + } + + if (table[h].gen != currentGen) { + table[h] = {k, sIdxs[i], currentGen, 1, 0}; + } else { + table[h].count++; + } + } + + const auto& tStates = tar.states[b]; + const auto& tIdxs = tar.indices[b]; + + // --- PART 2: TARGET MATCH (NEON Vectorized Window) --- + uint64x2_t genVec = vdupq_n_u64((uint64_t)currentGen << 32); // Gen is at offset 12 in slot + + for (size_t i = 0; i < tStates.size(); ++i) { + uint64_t k = tStates[i]; + uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); + h &= mask; + + uint64x2_t targetKeyV = vdupq_n_u64(k); + bool found = false; + + // Check 2 slots at a time (One Cache Line) + // This loop usually terminates in the first iteration (h and h+1) + while (true) { + // Load keys from Slot H and Slot H+1 + // We use vld2 to pick the 'key' field which is the first 8 bytes of each 32-byte slot + // For simplicity and speed on M3, we'll just do direct pointer access: + uint64_t k0 = table[h].key; + uint64_t k1 = table[(h + 1) & mask].key; + uint32_t g0 = table[h].gen; + uint32_t g1 = table[(h + 1) & mask].gen; + + uint64x2_t keysV = {k0, k1}; + uint32x2_t gensV = {g0, g1}; + + // Compare keys + uint64x2_t keyMatch = vceqq_u64(keysV, targetKeyV); + // Compare generations + uint32x2_t genMatch = vceq_u32(gensV, vdup_n_u32(currentGen)); + + // Check lane 0 + if (vgetq_lane_u64(keyMatch, 0) && vget_lane_u32(genMatch, 0)) { + if (table[h].count == 1) { + result.first.push_back(table[h].idx); + result.second.push_back(tIdxs[i]); + table[h].count = 0xFFFFFFFF; + } else if (table[h].count == 0xFFFFFFFF) { + result.first.pop_back(); result.second.pop_back(); + table[h].count = 0xEEEEEEEE; + } + found = 
true; break; + } + + // Check lane 1 + uint32_t nextH = (h + 1) & mask; + if (vgetq_lane_u64(keyMatch, 1) && vget_lane_u32(genMatch, 1)) { + if (table[nextH].count == 1) { + result.first.push_back(table[nextH].idx); + result.second.push_back(tIdxs[i]); + table[nextH].count = 0xFFFFFFFF; + } else if (table[nextH].count == 0xFFFFFFFF) { + result.first.pop_back(); result.second.pop_back(); + table[nextH].count = 0xEEEEEEEE; + } + found = true; break; + } + + // If neither matches and both are "current", we must keep probing + if (g0 == currentGen && g1 == currentGen) { + h = (h + 2) & mask; + } else { + // One of them is an empty slot (gen != currentGen), stop searching + break; + } + } + } + } + return result; +} +*/ +void Forest::matchPipelinedBranchlessPreallocate( + SoAFramePersistent& src, + SoAFramePersistent& tar, + std::vector& resultSrc, + std::vector& resultTar) { + + //std::pair, std::vector> result; + // For 100M items, we might find more matches; + // adjusting reserve to prevent mid-run reallocations. + //result.first.reserve(src.statesSlab.size() / 100); + //result.second.reserve(src.statesSlab.size() / 100); + + struct Slot { + uint64_t key; + uint32_t idx; + uint32_t gen; + uint32_t count; + }; + + static uint32_t currentGen = 1; + // Increased table size slightly to 16k to further reduce Pareto collisions + static std::vector table(16384, {0, 0, 0, 0}); + + for (int b = 0; b < 256; ++b) { + uint64_t* sStates = src.bucketStates[b]; + uint32_t* sIdxs = src.bucketIndices[b]; + uint32_t sSize = src.bucketSizes[b]; + + if (sSize == 0) continue; + + // Adaptive Mask: 2k for small, 16k for large + const uint32_t mask = (sSize < 1000) ? 2047 : 16383; + const uint32_t shift = (sSize < 1000) ? (64 - 11) : (64 - 14); + currentGen++; + + // 1. 
Fill Table (Source) + for (size_t i = 0; i < sSize; ++i) { + uint64_t k = sStates[i]; + uint32_t h = (k * 11400714819323198485llu) >> shift; + h &= mask; + + // Branchless-ish Probe: Most IDs are unique, so this loop + // is predicted "not taken" after the first check. + while (table[h].gen == currentGen && table[h].key != k) { + h = (h + 1) & mask; + } + + if (table[h].gen != currentGen) { + table[h] = {k, sIdxs[i], currentGen, 1}; + } else { + table[h].count++; + } + } + + // 2. Intersect (Target) with Software Pipelining + uint64_t* tStates = tar.bucketStates[b]; + uint32_t* tIdxs = tar.bucketIndices[b]; + uint32_t tSize = tar.bucketSizes[b]; + + for (size_t i = 0; i < tSize; ++i) { + // Manual prefetch of the state 16 elements ahead to stay in L1 + if (i + 16 < tSize) { + __builtin_prefetch(&tStates[i + 16], 0, 3); + } + + uint64_t k = tStates[i]; + uint32_t h = (k * 11400714819323198485llu) >> shift; + h &= mask; + + // Probe logic + while (table[h].gen == currentGen && table[h].key != k) { + h = (h + 1) & mask; + } + + if (table[h].gen == currentGen && table[h].key == k) { + const uint32_t cnt = table[h].count; + if (cnt == 1) { + resultSrc.push_back(table[h].idx); + resultTar.push_back(tIdxs[i]); + table[h].count = 0xFFFFFFFF; + } else if (cnt == 0xFFFFFFFF) { + // Pareto multi-match removal logic + resultSrc.pop_back(); + resultTar.pop_back(); + table[h].count = 0xEEEEEEEE; + } + } + } + } +} +std::pair, std::vector> Forest::matchPipelinedBranchless( + SoAFramePersistent& src, + SoAFramePersistent& tar) { + + std::pair, std::vector> result; + // For 100M items, we might find more matches; + // adjusting reserve to prevent mid-run reallocations. 
+ result.first.reserve(src.statesSlab.size() / 100); + result.second.reserve(src.statesSlab.size() / 100); + + struct Slot { + uint64_t key; + uint32_t idx; + uint32_t gen; + uint32_t count; + }; + + static uint32_t currentGen = 1; + // Increased table size slightly to 16k to further reduce Pareto collisions + static std::vector table(16384, {0, 0, 0, 0}); + + for (int b = 0; b < 256; ++b) { + uint64_t* sStates = src.bucketStates[b]; + uint32_t* sIdxs = src.bucketIndices[b]; + uint32_t sSize = src.bucketSizes[b]; + + if (sSize == 0) continue; + + // Adaptive Mask: 2k for small, 16k for large + const uint32_t mask = (sSize < 1000) ? 2047 : 16383; + const uint32_t shift = (sSize < 1000) ? (64 - 11) : (64 - 14); + currentGen++; + + // 1. Fill Table (Source) + for (size_t i = 0; i < sSize; ++i) { + uint64_t k = sStates[i]; + uint32_t h = (k * 11400714819323198485llu) >> shift; + h &= mask; + + // Branchless-ish Probe: Most IDs are unique, so this loop + // is predicted "not taken" after the first check. + while (table[h].gen == currentGen && table[h].key != k) { + h = (h + 1) & mask; + } + + if (table[h].gen != currentGen) { + table[h] = {k, sIdxs[i], currentGen, 1}; + } else { + table[h].count++; + } + } + + // 2. 
Intersect (Target) with Software Pipelining + uint64_t* tStates = tar.bucketStates[b]; + uint32_t* tIdxs = tar.bucketIndices[b]; + uint32_t tSize = tar.bucketSizes[b]; + + for (size_t i = 0; i < tSize; ++i) { + // Manual prefetch of the state 16 elements ahead to stay in L1 + if (i + 16 < tSize) { + __builtin_prefetch(&tStates[i + 16], 0, 3); + } + + uint64_t k = tStates[i]; + uint32_t h = (k * 11400714819323198485llu) >> shift; + h &= mask; + + // Probe logic + while (table[h].gen == currentGen && table[h].key != k) { + h = (h + 1) & mask; + } + + if (table[h].gen == currentGen && table[h].key == k) { + const uint32_t cnt = table[h].count; + if (cnt == 1) { + result.first.push_back(table[h].idx); + result.second.push_back(tIdxs[i]); + table[h].count = 0xFFFFFFFF; + } else if (cnt == 0xFFFFFFFF) { + // Pareto multi-match removal logic + result.first.pop_back(); + result.second.pop_back(); + table[h].count = 0xEEEEEEEE; + } + } + } + } + return result; +} +std::pair, std::vector> Forest::matchAdaptivePersistent( + SoAFramePersistent& src, + SoAFramePersistent& tar) { + + std::pair, std::vector> result; + result.first.reserve(10000); + result.second.reserve(10000); + + struct Slot { + uint64_t key; + uint32_t idx; + uint32_t gen; // Generation counter + uint32_t count; // 1=SrcUnique, 0xFFFFFFFF=Matched, etc. + }; + + static uint32_t currentGen = 1; + static std::vector table(8192, {0, 0, 0, 0}); + + for (int b = 0; b < 256; ++b) { + uint64_t* sStates = src.bucketStates[b]; + uint32_t* sIdxs = src.bucketIndices[b]; + uint32_t sSize = src.bucketSizes[b]; + + if (sSize == 0) continue; + + const uint32_t mask = (sSize < 500) ? 1023 : 8191; + currentGen++; + + // 1. 
Fill Table + for (size_t i = 0; i < sSize; ++i) { + uint64_t k = sStates[i]; + uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); + h &= mask; + + while (table[h].gen == currentGen && table[h].key != k) { + h = (h + 1) & mask; + } + + if (table[h].gen != currentGen) { + table[h] = {k, sIdxs[i], currentGen, 1}; + } else { + table[h].count++; + } + } + + // 2. Intersect + uint64_t* tStates = tar.bucketStates[b]; + uint32_t* tIdxs = tar.bucketIndices[b]; + uint32_t tSize = tar.bucketSizes[b]; + + for (size_t i = 0; i < tSize; ++i) { + uint64_t k = tStates[i]; + uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); + h &= mask; + + while (table[h].gen == currentGen && table[h].key != k) { + h = (h + 1) & mask; + } + + if (table[h].gen == currentGen && table[h].key == k) { + if (table[h].count == 1) { + result.first.push_back(table[h].idx); + result.second.push_back(tIdxs[i]); + table[h].count = 0xFFFFFFFF; + } else if (table[h].count == 0xFFFFFFFF) { + result.first.pop_back(); + result.second.pop_back(); + table[h].count = 0xEEEEEEEE; + } + } + } + } + return result; +} + +std::pair, std::vector> Forest::matchAdaptive( + SoAFrame& src, + SoAFrame& tar) { + + std::pair, std::vector> result; + result.first.reserve(10000); + result.second.reserve(10000); + + struct Slot { + uint64_t key; + uint32_t idx; + uint32_t gen; // Generation counter + uint32_t count; // 1=SrcUnique, 0xFFFFFFFF=Matched, etc. + }; + + // Global generation for this call + uint32_t currentGen = 1; + std::vector table(8192, {0, 0, 0, 0}); + + for (int b = 0; b < 256; ++b) { + const auto& sStates = src.states[b]; + const auto& sIdxs = src.indices[b]; + if (sStates.empty()) continue; + + // Adaptive Table Mask: Use smaller range for tiny buckets + const uint32_t mask = (sStates.size() < 500) ? 1023 : 8191; + currentGen++; + + // 1. 
Fill Table + for (size_t i = 0; i < sStates.size(); ++i) { + // Prefetch an element roughly 16 iterations ahead (adjust based on testing) + /* + * This didn't help anymore. So either compiler already optimized this or + * we are compute bound. + * if (i + 16 < sStates.size()) { + __builtin_prefetch(&sStates[i + 16], 0, 3); + __builtin_prefetch(&sIdxs[i + 16], 0, 3); + }*/ + uint64_t k = sStates[i]; + uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); + h &= mask; + + // Probe: Valid if gen matches AND key is different + while (table[h].gen == currentGen && table[h].key != k) { + h = (h + 1) & mask; + } + + if (table[h].gen != currentGen) { + table[h] = {k, sIdxs[i], currentGen, 1}; + } else { + table[h].count++; // Duplicate in Source + } + } + + // 2. Intersect + const auto& tStates = tar.states[b]; + const auto& tIdxs = tar.indices[b]; + for (size_t i = 0; i < tStates.size(); ++i) { + uint64_t k = tStates[i]; + uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); + h &= mask; + + while (table[h].gen == currentGen && table[h].key != k) { + h = (h + 1) & mask; + } + + if (table[h].gen == currentGen && table[h].key == k) { + if (table[h].count == 1) { + result.first.push_back(table[h].idx); + result.second.push_back(tIdxs[i]); + table[h].count = 0xFFFFFFFF; + } else if (table[h].count == 0xFFFFFFFF) { + result.first.pop_back(); + result.second.pop_back(); + table[h].count = 0xEEEEEEEE; + } + } + } + } + return result; +} +std::pair, std::vector> Forest::matchBlockedBloom( + SoAFrame& src, + SoAFrame& tar) { + + std::pair, std::vector> result; + result.first.reserve(10000); + result.second.reserve(10000); + + struct Slot { + uint64_t key; + uint32_t idx; + uint32_t count; + }; + + const uint32_t TABLE_SIZE = 8192; + const uint32_t HASH_MASK = TABLE_SIZE - 1; + std::vector table(TABLE_SIZE); + + // A 512-bit Bloom Filter fits in exactly one Cache Line (64 bytes). + // We use 8 x 64-bit integers to represent the 512 bits. 
+ uint64_t bloom[8]; + + for (int b = 0; b < 256; ++b) { + std::fill(table.begin(), table.end(), Slot{0, 0, 0}); + std::memset(bloom, 0, sizeof(bloom)); + + const auto& sStates = src.states[b]; + const auto& sIdxs = src.indices[b]; + const auto& tStates = tar.states[b]; + const auto& tIdxs = tar.indices[b]; + + // 1. Fill Table + Bloom Filter + for (size_t i = 0; i < sStates.size(); ++i) { + uint64_t k = sStates[i]; + + // Set Bloom bit: use a different hash or shift for the bloom index + // We'll use bits from the key to pick one of 512 bits + uint32_t bHash = (k ^ (k >> 32)); + bloom[(bHash >> 6) & 7] |= (1ull << (bHash & 63)); + + uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); + h &= HASH_MASK; + + while (table[h].count > 0 && table[h].key != k) { + h = (h + 1) & HASH_MASK; + } + + table[h].key = k; + table[h].idx = sIdxs[i]; + table[h].count++; + } + + // 2. Intersection with Bloom Filter Gate + for (size_t i = 0; i < tStates.size(); ++i) { + uint64_t k = tStates[i]; + + // --- BLOOM FILTER GATE --- + uint32_t bHash = (k ^ (k >> 32)); + if (!(bloom[(bHash >> 6) & 7] & (1ull << (bHash & 63)))) { + continue; // 100% certainly not in Source. Skip hash probe! 
+ } + // ------------------------- + + uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); + h &= HASH_MASK; + + while (table[h].count > 0 && table[h].key != k) { + h = (h + 1) & HASH_MASK; + } + + if (table[h].key == k) { + if (table[h].count == 1) { + result.first.push_back(table[h].idx); + result.second.push_back(tIdxs[i]); + table[h].count = 0xFFFFFFFF; + } else if (table[h].count == 0xFFFFFFFF) { + result.first.pop_back(); + result.second.pop_back(); + table[h].count = 0xEEEEEEEE; + } + } + } + } + return result; +} +std::pair, std::vector> Forest::matchParallelRadixPartitioning( + SoAFrame& src, + SoAFrame& tar) { + + std::pair, std::vector> result; + result.first.reserve(10000); + result.second.reserve(10000); + + const uint32_t TABLE_SIZE = 8192; + const uint32_t HASH_MASK = TABLE_SIZE - 1; + + // Aligned scratchpad to maximize L1/L2 cache efficiency + struct alignas(64) Slot { + uint64_t key; + uint32_t idx; + uint32_t count; + }; + std::vector table(TABLE_SIZE); + + for (int b = 0; b < 256; ++b) { + // 1. FAST CLEAR + // std::fill is optimized, but we only zero the 'count' to save cycles + for(auto& s : table) s.count = 0; + + const auto& sStates = src.states[b]; + const auto& sIdxs = src.indices[b]; + const size_t sSize = sStates.size(); + + // 2. 
PIPELINED FILL (Unrolled x4 for ILP) + // We process 4 items at once to hide memory latency + size_t i = 0; + for (; i + 3 < sSize; i += 4) { + for (int k = 0; k < 4; ++k) { + uint64_t key = sStates[i + k]; + uint32_t h = (key * 11400714819323198485llu) >> (64 - 13); + h &= HASH_MASK; + + while (table[h].count > 0 && table[h].key != key) h = (h + 1) & HASH_MASK; + + table[h].key = key; + table[h].idx = sIdxs[i + k]; + table[h].count++; + } + } + // Handle remainder + for (; i < sSize; ++i) { + uint64_t key = sStates[i]; + uint32_t h = (key * 11400714819323198485llu) >> (64 - 13); + h &= HASH_MASK; + while (table[h].count > 0 && table[h].key != key) h = (h + 1) & HASH_MASK; + table[h].key = key; table[h].idx = sIdxs[i]; table[h].count++; + } + + // 3. OPTIMISTIC INTERSECTION + const auto& tStates = tar.states[b]; + const auto& tIdxs = tar.indices[b]; + const size_t tSize = tStates.size(); + + for (size_t j = 0; j < tSize; ++j) { + uint64_t key = tStates[j]; + uint32_t h = (key * 11400714819323198485llu) >> (64 - 13); + h &= HASH_MASK; + + while (table[h].count > 0 && table[h].key != key) h = (h + 1) & HASH_MASK; + + if (table[h].key == key) { + if (table[h].count == 1) { + result.first.push_back(table[h].idx); + result.second.push_back(tIdxs[j]); + table[h].count = 0xFFFFFFFF; // Mark as Matched + } else if (table[h].count == 0xFFFFFFFF) { + // Pareto duplicate found in Target: Roll back + result.first.pop_back(); + result.second.pop_back(); + table[h].count = 0xEEEEEEEE; // Mark as Permanent Duplicate + } + } + } + } + + return result; +} +std::pair, std::vector> Forest::matchPreparedFramesFaster( + SoAFrame& src, + SoAFrame& tar) { + + std::pair, std::vector> result; + result.first.reserve(10000); + result.second.reserve(10000); + + // Flat, cache-aligned slot structure + struct Slot { + uint64_t key; + uint32_t idx; + uint32_t count; + }; + + // 8192 slots = 128KB. This fits perfectly in your 4MB L2. 
+ // We use a power-of-two size to use bitwise AND instead of modulo %. + const uint32_t TABLE_SIZE = 8192; + const uint32_t HASH_MASK = TABLE_SIZE - 1; + std::vector table(TABLE_SIZE); + + for (int b = 0; b < 256; ++b) { + // FAST: std::fill is usually a vectorized memset. + std::fill(table.begin(), table.end(), Slot{0, 0, 0}); + + const auto& sStates = src.states[b]; + const auto& sIdxs = src.indices[b]; + const auto& tStates = tar.states[b]; + const auto& tIdxs = tar.indices[b]; + + // 1. Fill Table from Source + for (size_t i = 0; i < sStates.size(); ++i) { + uint64_t k = sStates[i]; + // Fibonacci Hashing (very fast for 64-bit keys) + uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); + h &= HASH_MASK; + + while (table[h].count > 0 && table[h].key != k) { + h = (h + 1) & HASH_MASK; + } + + table[h].key = k; + table[h].idx = sIdxs[i]; + table[h].count++; + } + + // 2. Secondary Uniqueness Check + Intersection + // We reuse the 'count' field: + // 1 = Unique in Src + // >1 = Duplicate in Src + // 0 = Already Matched (prevents Target duplicates) + for (size_t i = 0; i < tStates.size(); ++i) { + uint64_t k = tStates[i]; + uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); + h &= HASH_MASK; + + while (table[h].count > 0 && table[h].key != k) { + h = (h + 1) & HASH_MASK; + } + + // We need to know if 'k' is unique in Target too. + // A quick way is to check if it appears again in the target bucket. + // For Pareto, we can use a "tombstone" logic: + if (table[h].key == k) { + if (table[h].count == 1) { + // This is the first time we see it in Target + result.first.push_back(table[h].idx); + result.second.push_back(tIdxs[i]); + table[h].count = 0xFFFFFFFF; // Mark as "Matched once" + } else if (table[h].count == 0xFFFFFFFF) { + // Oh no, this is a Target duplicate! + // We must remove the last added match. 
+ result.first.pop_back(); + result.second.pop_back(); + table[h].count = 0xEEEEEEEE; // Mark as "Permanent Duplicate" + } + } + } + } + return result; +} +std::pair, std::vector> Forest::matchPreparedFrames( SoAFrame& src, SoAFrame& tar) { + + // Initialize the pair of vectors + std::pair, std::vector> result; + + // Heuristic: start with a reasonable reserve (e.g., 5% of average bucket size * 256) + size_t initialReserve = (src.states[0].size() + tar.states[0].size()) * 6; + result.first.reserve(initialReserve); + result.second.reserve(initialReserve); + + // Local structures for bucket-level uniqueness + struct SrcInfo { uint32_t idx; bool isDup; }; + std::unordered_map bucketSrc; + std::unordered_map bucketTar; + + for (int b = 0; b < 256; ++b) { + bucketSrc.clear(); + bucketTar.clear(); + + const auto& sStates = src.states[b]; + const auto& sIdxs = src.indices[b]; + const auto& tStates = tar.states[b]; + const auto& tIdxs = tar.indices[b]; + + // 1. Process Source: Mark unique vs duplicates + for (size_t i = 0; i < sStates.size(); ++i) { + auto [it, inserted] = bucketSrc.try_emplace(sStates[i], SrcInfo{sIdxs[i], false}); + if (!inserted) it->second.isDup = true; + } + + // 2. Process Target: Mark unique vs duplicates + for (size_t i = 0; i < tStates.size(); ++i) { + auto [it, inserted] = bucketTar.try_emplace(tStates[i], false); + if (!inserted) it->second = true; // Mark as duplicate + } + + // 3. 
Intersect unique-only IDs + for (size_t i = 0; i < tStates.size(); ++i) { + uint64_t id = tStates[i]; + + // Check if unique in Target + if (bucketTar[id] == false) { + auto it = bucketSrc.find(id); + // Check if exists in Source AND is unique there + if (it != bucketSrc.end() && it->second.isDup == false) { + result.first.push_back(it->second.idx); + result.second.push_back(tIdxs[i]); + } + } + } + } + + return result; +} /** * @brief Computes sparse matches on a pair of rectified and smoothed @@ -117,7 +1113,6 @@ std::vector Forest::findCorrespondences( int numStates = std::min(srcStates.size(), tarStates.size()); // Limit search to rectified epipolar case. std::sort(srcStates.begin(), srcStates.end()); - std::sort(tarStates.begin(), tarStates.end()); std::vector corr; uint32_t j = 0; @@ -141,7 +1136,254 @@ std::vector Forest::findCorrespondences( } return corr; } +#include + +// State machine for our IDs +enum class State : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 }; + +#include +#include +#include + +std::vector Forest::findCorrespondencesHash( + std::vector& srcStates, + std::vector& tarStates) { + + // Tracking states: 0 = Unseen, 1 = SeenOnce, 2 = Duplicate + enum class Occurence : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 }; + + // 1. Map Source IDs: State -> {OccurenceLevel, OriginalIndex} + // Pre-allocating prevents expensive rehashes during the loop + std::unordered_map> srcMap; + srcMap.reserve(srcStates.size()); + + for (uint32_t i = 0; i < srcStates.size(); ++i) { + auto& entry = srcMap[srcStates[i].state]; + if (entry.first == Occurence::Unseen) { + entry = {Occurence::SeenOnce, i}; + } else { + entry.first = Occurence::Duplicate; + } + } + + // 2. 
Map Target IDs: State -> OccurenceLevel + std::unordered_map tarMap; + tarMap.reserve(tarStates.size()); + + for (uint32_t j = 0; j < tarStates.size(); ++j) { + auto& occ = tarMap[tarStates[j].state]; + if (occ == Occurence::Unseen) { + occ = Occurence::SeenOnce; + } else { + occ = Occurence::Duplicate; + } + } + + // 3. Intersect unique pairs + std::vector corr; + // Heuristic: Reserve 20% of the smaller set size for the results + corr.reserve(std::min(srcStates.size(), tarStates.size()) / 5); + + for (uint32_t j = 0; j < tarStates.size(); ++j) { + uint64_t currentID = tarStates[j].state; + + // Condition: Must be unique in Target AND unique in Source + if (tarMap[currentID] == Occurence::SeenOnce) { + auto it = srcMap.find(currentID); + if (it != srcMap.end() && it->second.first == Occurence::SeenOnce) { + // Correspondence(Point from Source, Point from Target) + corr.push_back(ndb::Correspondence( + srcStates[it->second.second].point, + tarStates[j].point + )); + } + } + } + + return corr; +} +#include +#include +#include + +// A lightweight structure to avoid moving heavy Descriptor objects +struct KeyIndex { + uint64_t state; + uint32_t index; +}; +#include +#include +#include + +std::vector Forest::findCorrespondencesTurbo( + std::vector& srcStates, + std::vector& tarStates) { + + const int BUCKETS = 256; + const uint64_t MASK = 0xFF; + + // --- STEP 1: Linear Partitioning (Radix Pass) --- + // We use a single flat buffer to avoid 256 separate vector allocations + std::vector srcBuffer(srcStates.size()); + std::vector tarBuffer(tarStates.size()); + std::array srcCounts = {0}, tarCounts = {0}; + std::array srcOffsets, tarOffsets; + + for (const auto& s : srcStates) srcCounts[s.state & MASK]++; + for (const auto& t : tarStates) tarCounts[t.state & MASK]++; + + srcOffsets[0] = tarOffsets[0] = 0; + for (int i = 1; i < BUCKETS; ++i) { + srcOffsets[i] = srcOffsets[i - 1] + srcCounts[i - 1]; + tarOffsets[i] = tarOffsets[i - 1] + tarCounts[i - 1]; + } + + auto 
srcCursors = srcOffsets; + auto tarCursors = tarOffsets; + + for (uint32_t i = 0; i < srcStates.size(); ++i) { + srcBuffer[srcCursors[srcStates[i].state & MASK]++] = {srcStates[i].state, i}; + } + for (uint32_t i = 0; i < tarStates.size(); ++i) { + tarBuffer[tarCursors[tarStates[i].state & MASK]++] = {tarStates[i].state, i}; + } + // --- STEP 2: In-Cache Hashing --- + std::vector corr; + corr.reserve(std::min(srcStates.size(), tarStates.size()) / 8); + + // Using a tiny fixed-size hash table for each bucket to stay in L1/L2 cache + // State: 0 = Unseen, 1 = SeenOnce, 2 = Duplicate + struct LocalVal { uint32_t index; uint8_t count; }; + + // We reuse this map across buckets to avoid reallocating + // A simple open-addressed hash map for the bucket + std::unordered_map bucketMap; + bucketMap.reserve(srcStates.size() / BUCKETS * 2); + + for (int b = 0; b < BUCKETS; ++b) { + bucketMap.clear(); + + // Load Source bucket into local cache-friendly map + size_t srcStart = srcOffsets[b]; + size_t srcEnd = srcStart + srcCounts[b]; + for (size_t i = srcStart; i < srcEnd; ++i) { + auto& entry = bucketMap[srcBuffer[i].state]; + entry.index = srcBuffer[i].index; + entry.count = (entry.count == 0) ? 1 : 2; + } + + // Intersect with Target bucket + size_t tarStart = tarOffsets[b]; + size_t tarEnd = tarStart + tarCounts[b]; + + // Secondary map to ensure target-side uniqueness + std::unordered_map tarUniqueness; + for (size_t i = tarStart; i < tarEnd; ++i) { + auto& count = tarUniqueness[tarBuffer[i].state]; + count = (count == 0) ? 
1 : 2; + } + + for (size_t i = tarStart; i < tarEnd; ++i) { + uint64_t id = tarBuffer[i].state; + if (tarUniqueness[id] == 1) { + auto it = bucketMap.find(id); + if (it != bucketMap.end() && it->second.count == 1) { + corr.push_back(ndb::Correspondence( + srcStates[it->second.index].point, + tarStates[tarBuffer[i].index].point + )); + } + } + } + } + + return corr; +} +std::vector Forest::findCorrespondencesHashingRadix( + std::vector& srcStates, + std::vector& tarStates) { + + const int NUM_BUCKETS = 256; + const uint64_t MASK = 0xFF; + + // 1. Partition Source into Buckets + std::vector srcBuckets[NUM_BUCKETS]; + for (int i = 0; i < NUM_BUCKETS; ++i) srcBuckets[i].reserve(srcStates.size() / NUM_BUCKETS * 1.2); + + for (uint32_t i = 0; i < srcStates.size(); ++i) { + srcBuckets[srcStates[i].state & MASK].push_back({srcStates[i].state, i}); + } + + // 2. Partition Target into Buckets + std::vector tarBuckets[NUM_BUCKETS]; + for (int i = 0; i < NUM_BUCKETS; ++i) tarBuckets[i].reserve(tarStates.size() / NUM_BUCKETS * 1.2); + + for (uint32_t i = 0; i < tarStates.size(); ++i) { + tarBuckets[tarStates[i].state & MASK].push_back(tarStates[i].state); + } + + std::vector corr; + corr.reserve(std::min(srcStates.size(), tarStates.size()) / 5); + + // 3. Process each bucket pair + // This part can be easily parallelized with #pragma omp parallel for + for (int b = 0; b < NUM_BUCKETS; ++b) { + if (srcBuckets[b].empty() || tarBuckets[b].empty()) continue; + + // Small local maps fit in L1/L2 Cache + // Using a simple frequency map for the local bucket + enum class Occ : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 }; + + struct LocalEntry { + Occ occ = Occ::Unseen; + uint32_t idx = 0; + }; + + // We use a flat hash map here. For simplicity in standard C++, + // std::unordered_map is used, but even it is faster here + // because it stays in cache. 
+ std::unordered_map localSrc; + localSrc.reserve(srcBuckets[b].size()); + + for (auto& ki : srcBuckets[b]) { + auto& entry = localSrc[ki.state]; + if (entry.occ == Occ::Unseen) { + entry = {Occ::SeenOnce, ki.index}; + } else { + entry.occ = Occ::Duplicate; + } + } + + std::unordered_map localTar; + localTar.reserve(tarBuckets[b].size()); + for (uint64_t state : tarBuckets[b]) { + auto& occ = localTar[state]; + occ = (occ == Occ::Unseen) ? Occ::SeenOnce : Occ::Duplicate; + } + + // Intersect within the bucket + // Since we are inside a bucket, we iterate the target indices + // but we need to find the target point. + // To be fast, we'll re-scan the original tarStates for this bucket's IDs + for (uint32_t j = 0; j < tarStates.size(); ++j) { + uint64_t s = tarStates[j].state; + if ((s & MASK) == b) { // Only process IDs belonging to this bucket + if (localTar[s] == Occ::SeenOnce) { + auto it = localSrc.find(s); + if (it != localSrc.end() && it->second.occ == Occ::SeenOnce) { + corr.push_back(ndb::Correspondence( + srcStates[it->second.idx].point, + tarStates[j].point + )); + } + } + } + } + } + + return corr; +} /** * @brief Evaluates a given forest mask on an image and returns the * descriptors diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp index d0b5c32..9f48c87 100644 --- a/lib/gpc/forest.hpp +++ b/lib/gpc/forest.hpp @@ -178,7 +178,47 @@ struct MatchStats { double prec, rec, timeProp, timeMatch; int numInlier, numStates, numMatches; }; +struct SoAFrame { + // 256 Buckets to ensure each chunk fits in L2/L3 cache + std::vector states[256]; + std::vector indices[256]; + + void reserve(size_t total_size) { + for(int i=0; i<256; ++i) { + states[i].reserve(total_size / size_t(256 * 1.2)); + indices[i].reserve(total_size / size_t(256 * 1.2)); + } + } +}; +struct SoAFramePersistent { + // Persistent memory blocks + std::vector statesSlab; + std::vector indicesSlab; + + // Pointers into the slab for each bucket + uint64_t* bucketStates[256]; + uint32_t* 
bucketIndices[256]; + uint32_t bucketSizes[256]; + + void preallocate(size_t total_size) { + statesSlab.assign(total_size, 0); + indicesSlab.assign(total_size, 0); + } +}; +struct StateIdx { + uint64_t state; + uint32_t index; +}; +struct SoAFramePersistentSingleSlab { + std::vector slab; + StateIdx* bucketData[256]; + uint32_t bucketSizes[256]; + + void preallocate(size_t total_size) { + slab.assign(total_size, {0, 0}); + } +}; class Forest { public: @@ -198,9 +238,75 @@ class Forest { PreprocessedImage& tar, FilterMask& fastmask, InferenceSettings& settings); - std::vector findCorrespondences( + static std::vector findCorrespondences( + std::vector& srcStates, + std::vector& tarStates); + static std::vector findCorrespondencesHash( + std::vector& srcStates, + std::vector& tarStates); + + static std::vector findCorrespondencesHashingRadix( + std::vector& srcStates, + std::vector& tarStates); + + static std::vector findCorrespondencesTurbo( std::vector& srcStates, std::vector& tarStates); + + + static std::pair prepareSoAFrames( + std::vector& srcStates, + std::vector& tarStates); + + static void prepareSoAFramesPersistent( + std::vector& srcStates, + std::vector& tarStates, + SoAFramePersistent& srcFrame, + SoAFramePersistent& tarFrame); +static void prepareSoAFramesPersistentSingleSlab( + std::vector& srcStates, + std::vector& tarStates, + SoAFramePersistentSingleSlab& srcFrame, + SoAFramePersistentSingleSlab& tarFrame); + + +static std::pair, std::vector> matchPreparedFrames( SoAFrame& src, SoAFrame& tar); +static std::pair, std::vector> matchPreparedFramesFaster( SoAFrame& src, SoAFrame& tar); + +static std::pair, std::vector> matchParallelRadixPartitioning( + SoAFrame& src, + SoAFrame& tar) ; +static std::pair, std::vector> matchBlockedBloom( + SoAFrame& src, + SoAFrame& tar) ; +static std::pair, std::vector> matchAdaptive( + SoAFrame& src, + SoAFrame& tar); +static std::pair, std::vector> matchAdaptivePersistent( + SoAFramePersistent& src, + 
SoAFramePersistent& tar); +static std::pair, std::vector> matchPipelinedBranchless( + SoAFramePersistent& src, + SoAFramePersistent& tar); +static void matchPipelinedBranchlessPreallocate( + SoAFramePersistent& src, + SoAFramePersistent& tar, + std::vector& resultSrc, + std::vector& resultTar); + +/* +static std::pair, std::vector> matchAdaptiveNeon( + SoAFrame& src, + SoAFrame& tar); +*/ +static void matchPipelinedBranchlessPreallocateSingleSlab( + SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar, + std::vector& outS, std::vector& outT); + + + + + /** * @brief Evaluates a given forest mask on an image and returns the * descriptors diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp index be0015a..85070a7 100644 --- a/samples/sparsematch.cpp +++ b/samples/sparsematch.cpp @@ -3,29 +3,11 @@ #include "gpc/forest.hpp" using namespace std; -void test_hwy_neon() { - namespace hn = hwy::HWY_NAMESPACE; - - // d is a "descriptor" for a vector of 8-bit unsigned ints - const hn::ScalableTag d; - - // If this is NEON, hn::Lanes(d) will be 16 - size_t lanes = hn::Lanes(d); - - auto v1 = hn::Set(d, 10); - auto v2 = hn::Set(d, 20); - auto res = hn::Add(v1, v2); // res lanes all contain 30 - - std::cout << "--- Highway Status ---" << std::endl; - std::cout << "Target: " << hwy::TargetName(hwy::SupportedTargets()) << std::endl; - std::cout << "Vector lanes (uint8): " << lanes << std::endl; - std::cout << "----------------------" << std::endl; -} + int main(int argc, char** argv) { - std::string forestPath = "../../forests/defaultZeroForest.txt"; - std::string leftImgPath = "../../data/kitti/training/image_0/000000_10.png"; - std::string rightImgPath = - "../../data/kitti/training/image_1/000000_10.png"; + std::string forestPath = "../forests/defaultZeroForest.txt"; + std::string leftImgPath = "../data/middlebury/im0.png"; + std::string rightImgPath = "../data/middlebury/im1.png"; if (argc == 4) { forestPath = argv[1]; @@ -51,7 +33,7 @@ int main(int 
argc, char** argv) { gpc::inference::InferenceSettings inferencesettings = gpc::inference::InferenceSettings() .builder() - .gradientThreshold(2) // gradientthres 20: matching ~3ms, 2: matching: ~30ms. + .gradientThreshold(1) // gradientthres 20: matching ~3ms, 2: matching: ~30ms. .verticalTolerance( 0) // 0px tolerance for rectified epipolar matches .dispHigh(128) // limit disparities to 128 @@ -68,9 +50,6 @@ int main(int argc, char** argv) { gpc::inference::FilterMask fm = forest.readForest(forestPath, simg.cols(), simg.rows()); - for(int i = 0; i<10000; i++) { - // Preprocess images (box filter, sobel filter, indices of high gradient - // pixels) gpc::inference::time_point t0 = gpc::inference::sysTick(); @@ -84,8 +63,8 @@ int main(int argc, char** argv) { std::vector supp = forest.rectifiedMatch(simgP, timgP, fm, inferencesettings); gpc::inference::time_point t2 = gpc::inference::sysTick(); + std::cout << "Number of features(s,t): " << simgP.mask.size() << "," << timgP.mask.size() << std::endl; + std::cout << "Number of matches: " << supp.size() << std::endl; std::cout << "Preprocessing time: " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl; std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms" << std::endl; - } - test_hwy_neon(); } From 55994f5592bb1b8f2b922d2a1e3a3ce5b87a486b Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 8 Mar 2026 15:22:13 +0100 Subject: [PATCH 28/36] add correspondence bench and(de)serialization --- benchmarks/correspondence_bench.cpp | 353 ++++++++++++++++++++++++++++ lib/gpc/buffer.hpp | 55 +++++ lib/gpc/forest.cpp | 10 +- samples/sparsematch.cpp | 7 + 4 files changed, 421 insertions(+), 4 deletions(-) create mode 100644 benchmarks/correspondence_bench.cpp diff --git a/benchmarks/correspondence_bench.cpp b/benchmarks/correspondence_bench.cpp new file mode 100644 index 0000000..e16eb9b --- /dev/null +++ b/benchmarks/correspondence_bench.cpp @@ -0,0 +1,353 @@ +#include +#include "gpc/forest.hpp" 
+#include "gpc/inference.hpp" +#include +#include +#include +#include + +#define NUM_ELEMENTS 262668 //10*1224*375 //1024*1024 + +/* Remaining ideas + * -USE ILP: Parallel Radix Partitioning (Even on one core, using a single-pass shuffle). + * - Didn't speed up. was same as matchPreparedFramesFaster + * - Assuming that the bottleneck is the hash table probes, hence: look into bloom filters... + * -Blocked Bloom Filter to discard non-matches in L1. + * - Faster at 1M, slower at 100K and 10M + * -SIMD-Probed Flat Table (checking 4 slots at once). + * -Manual Prefetching of the next bucket's data. + * */ +/** + * Generates a reproducible Pareto-distributed vector. + * @param count Number of IDs to generate. + * @param target_mean The theoretical mean (requires alpha > 1). + * @param seed A fixed value (e.g., 42) for deterministic benchmarks. + */ +std::vector generate_pareto_ids(size_t count, double target_mean, uint32_t seed = 42) { + std::vector ids; + ids.reserve(count); + + // Using a fixed seed for benchmark consistency + std::mt19937 gen(seed); + + // 1e-9 epsilon prevents division by zero/infinity + std::uniform_real_distribution dist(1e-9, 1.0); + + // Alpha = 1.16 provides a classic "80/20" Pareto distribution + const double alpha = 1.16; + const double xm = target_mean * (alpha - 1.0) / alpha; + + for (size_t i = 0; i < count; ++i) { + // Inverse Transform Sampling + double val = xm / std::pow(dist(gen), 1.0 / alpha); + + // Casting to uint32_t will handle the Pareto "tail" by wrapping + // values that exceed 2^32-1, simulating a dense ID space. 
+ ids.push_back(ndb::Descriptor(ndb::Point(0,0), static_cast(val))); + } + + return ids; +} +std::vector getSrcDescriptors() { + return ndb::Descriptor::deserialize("statesSrc.txt", true); + //return generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000 +} + +std::vector getTarDescriptors() { + return ndb::Descriptor::deserialize("statesTar.txt", false); + //return generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000 +} +std::vector generate_unique_ids(size_t count) { + std::vector ids; + ids.reserve(count); + + + for (size_t i = 0; i < count; ++i) { + ids.push_back(ndb::Descriptor(ndb::Point(0,0), static_cast(i))); + } + return ids; +} +static void matchBySorting( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + for (auto _ : state) { + state.PauseTiming(); + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + std::vector + matches = gpc::inference::Forest::findCorrespondences(src, tar); + + state.counters["matches"] = matches.size(); + //state.counters["candidates_t"] = timgP.mask.size(); + //state.counters["matches"] = supp.size(); + benchmark::DoNotOptimize(matches); + benchmark::ClobberMemory(); + } +} +static void matchByHashing( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + for (auto _ : state) { + state.PauseTiming(); + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + std::vector + matches = gpc::inference::Forest::findCorrespondencesTurbo(src, tar); + + state.counters["matches"] = matches.size(); + //state.counters["candidates_t"] = timgP.mask.size(); + //state.counters["matches"] = supp.size(); + benchmark::DoNotOptimize(matches); + benchmark::ClobberMemory(); + } +} +static void matchPreparedFrames( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + 
std::vector tarOriginal = getTarDescriptors(); + for (auto _ : state) { + state.PauseTiming(); + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + auto v = gpc::inference::Forest::prepareSoAFrames(src, tar); + auto matches = gpc::inference::Forest::matchPreparedFrames(v.first, v.second); + + state.counters["matches"] = matches.first.size(); + //state.counters["candidates_t"] = timgP.mask.size(); + //state.counters["matches"] = supp.size(); + benchmark::DoNotOptimize(matches); + benchmark::ClobberMemory(); + } +} +static void matchPreparedFramesFaster( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + for (auto _ : state) { + state.PauseTiming(); + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + auto v = gpc::inference::Forest::prepareSoAFrames(src, tar); + //auto matches = gpc::inference::Forest::matchPreparedFramesFaster(v.first, v.second); + auto matches = gpc::inference::Forest::matchParallelRadixPartitioning(v.first,v.second); + state.counters["matches"] = matches.first.size(); + //state.counters["candidates_t"] = timgP.mask.size(); + //state.counters["matches"] = supp.size(); + benchmark::DoNotOptimize(matches); + benchmark::ClobberMemory(); + } +} +static void matchParallelRadixPartitioning( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + for (auto _ : state) { + state.PauseTiming(); + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + auto v = gpc::inference::Forest::prepareSoAFrames(src,tar); + auto matches = gpc::inference::Forest::matchParallelRadixPartitioning(v.first, v.second); + + state.counters["matches"] = matches.first.size(); + //state.counters["candidates_t"] = timgP.mask.size(); + //state.counters["matches"] = supp.size(); + benchmark::DoNotOptimize(matches); + 
benchmark::ClobberMemory(); + } +} +static void matchBlockedBloom( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + + for (auto _ : state) { + state.PauseTiming(); + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + auto v = gpc::inference::Forest::prepareSoAFrames(src,tar); + auto matches = gpc::inference::Forest::matchBlockedBloom(v.first, v.second); + + state.counters["matches"] = matches.first.size(); + benchmark::DoNotOptimize(matches); + benchmark::ClobberMemory(); + } +} +static void matchAdaptive( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + for (auto _ : state) { + state.PauseTiming(); + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + auto v = gpc::inference::Forest::prepareSoAFrames(src,tar); + auto matches = gpc::inference::Forest::matchAdaptive(v.first, v.second); + + state.counters["matches"] = matches.first.size(); + benchmark::DoNotOptimize(matches); + benchmark::ClobberMemory(); + } +} +/* +static void matchAdaptiveNeon( + benchmark::State& state) { + std::vector src, tar; + src = generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000 + tar = generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000 + + for (auto _ : state) { + auto v = gpc::inference::Forest::prepareSoAFrames(src, tar); + auto matches = gpc::inference::Forest::matchAdaptiveNeon(v.first, v.second); + + state.counters["matches"] = matches.first.size(); + //state.counters["candidates_t"] = timgP.mask.size(); + //state.counters["matches"] = supp.size(); + benchmark::DoNotOptimize(matches); + benchmark::ClobberMemory(); + } +} +*/ +static void matchAdaptivePersistent( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + 
gpc::inference::SoAFramePersistent srcFrame, tarFrame; + srcFrame.preallocate(srcOriginal.size()); // size known + tarFrame.preallocate(tarOriginal.size()); + for (auto _ : state) { + state.PauseTiming(); + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + + gpc::inference::Forest::prepareSoAFramesPersistent(src, tar, srcFrame, tarFrame); + auto matches = gpc::inference::Forest::matchAdaptivePersistent(srcFrame, tarFrame); + + state.counters["matches"] = matches.first.size(); + benchmark::DoNotOptimize(matches); + benchmark::ClobberMemory(); + } +} +static void matchPipelinedBranchless( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + gpc::inference::SoAFramePersistent srcFrame, tarFrame; + srcFrame.preallocate(srcOriginal.size()); // size known + tarFrame.preallocate(tarOriginal.size()); + for (auto _ : state) { + state.PauseTiming(); + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + + gpc::inference::Forest::prepareSoAFramesPersistent(src, tar, srcFrame, tarFrame); + auto matches = gpc::inference::Forest::matchPipelinedBranchless(srcFrame, tarFrame); + benchmark::DoNotOptimize(matches); + benchmark::ClobberMemory(); + } +} +static void matchPipelinedBranchlessPreallocate( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + gpc::inference::SoAFramePersistent srcFrame, tarFrame; + srcFrame.preallocate(srcOriginal.size()); // size known + tarFrame.preallocate(tarOriginal.size()); + std::vector resultSrc, resultTar; + resultSrc.reserve(srcOriginal.size()/100); + resultTar.reserve(tarOriginal.size()/100); + for (auto _ : state) { + state.PauseTiming(); + resultSrc.clear(); + resultTar.clear(); + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + + // 1. 
Measure Prepare + // 2M: 5.7ms, 20M: 57ms + gpc::inference::Forest::prepareSoAFramesPersistent(src, tar, srcFrame, tarFrame); + + // 2. Measure Match + // 2M: 5.3ms , 20M: 53ms + gpc::inference::Forest::matchPipelinedBranchlessPreallocate(srcFrame, tarFrame, resultSrc, resultTar); + + state.counters["matches"] = resultSrc.size(); + + benchmark::DoNotOptimize(resultSrc); + benchmark::DoNotOptimize(resultTar); + benchmark::ClobberMemory(); + } +} + +static void matchPipelinedBranchlessPreallocateSingleSlab( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + + gpc::inference::SoAFramePersistentSingleSlab srcFrame, tarFrame; + srcFrame.preallocate(srcOriginal.size()); // size known + tarFrame.preallocate(tarOriginal.size()); + std::vector resultSrc, resultTar; + resultSrc.reserve(srcOriginal.size()/10); + resultTar.reserve(tarOriginal.size()/10); + for (auto _ : state) { + state.PauseTiming(); + resultSrc.clear(); + resultTar.clear(); + // 1. Measure Prepare + // 2M: 5.7ms, 20M: 57ms + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + gpc::inference::Forest::prepareSoAFramesPersistentSingleSlab(src, tar, srcFrame, tarFrame); + + // 2. 
Measure Match + // 2M: 5.3ms , 20M: 53ms + gpc::inference::Forest::matchPipelinedBranchlessPreallocateSingleSlab(srcFrame, tarFrame, resultSrc, resultTar); + + state.counters["matches"] = resultSrc.size(); + + benchmark::DoNotOptimize(resultSrc); + benchmark::DoNotOptimize(resultTar); + benchmark::ClobberMemory(); + } +} +BENCHMARK(matchBySorting) + ->Unit(benchmark::kMillisecond); +BENCHMARK(matchByHashing) + ->Unit(benchmark::kMillisecond); +BENCHMARK(matchPreparedFrames) + ->Unit(benchmark::kMillisecond); +BENCHMARK(matchPreparedFramesFaster) + ->Unit(benchmark::kMillisecond); +BENCHMARK(matchParallelRadixPartitioning) + ->Unit(benchmark::kMillisecond); +BENCHMARK(matchBlockedBloom) + ->Unit(benchmark::kMillisecond); +BENCHMARK(matchAdaptive) + ->Unit(benchmark::kMillisecond); +/* +BENCHMARK(matchAdaptiveNeon) + ->Unit(benchmark::kMillisecond); +*/ +BENCHMARK(matchAdaptivePersistent) + ->Unit(benchmark::kMillisecond); + +BENCHMARK(matchPipelinedBranchless) + ->Unit(benchmark::kMillisecond); +BENCHMARK(matchPipelinedBranchlessPreallocate) + ->Unit(benchmark::kMillisecond); +BENCHMARK(matchPipelinedBranchlessPreallocateSingleSlab) + ->Unit(benchmark::kMillisecond); +BENCHMARK_MAIN(); diff --git a/lib/gpc/buffer.hpp b/lib/gpc/buffer.hpp index 453ecaa..26e261a 100644 --- a/lib/gpc/buffer.hpp +++ b/lib/gpc/buffer.hpp @@ -35,6 +35,10 @@ #include #include #include +#include +#include +#include +#include using namespace std; @@ -80,6 +84,57 @@ struct Descriptor { bool operator<(const Descriptor& d) const { return state < d.state; } bool operator<=(const Descriptor& d) const { return state <= d.state; } int operator%(const int& d) const { return state % d; } + static void serialize(const std::string& filename, const std::vector& data) { + std::ofstream outFile(filename); + if (!outFile.is_open()) { + std::cerr << "Error opening file for writing: " << filename << std::endl; + return; + } + + for (const auto& desc : data) { + outFile << desc.point.x << "," + << 
desc.point.y << "," + << desc.state << "\n"; + } + outFile.close(); + } + + /** + * Deserializes a CSV file back into a vector of Descriptors. + */ + static std::vector deserialize(const std::string& filename, bool srcDescr) { + std::vector result; + std::ifstream inFile(filename); + if (!inFile.is_open()) { + std::cerr << "Error opening file for reading: " << filename << std::endl; + return result; + } + + std::string line; + while (std::getline(inFile, line)) { + if (line.empty()) continue; + + std::stringstream ss(line); + std::string x_str, y_str, state_str; + + // Split by comma + if (std::getline(ss, x_str, ',') && + std::getline(ss, y_str, ',') && + std::getline(ss, state_str, ',')) { + + Descriptor d; + d.point.x = std::stod(x_str); + d.point.y = std::stod(y_str); + d.state = std::stoull(state_str); + d.srcDescr = srcDescr; + + if (d.point.y > 200 && d.point.y < 400) + result.push_back(d); + } + } + inFile.close(); + return result; + } }; // Keeps support points with associated disparity // Support points are only used in the left image diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp index 876fe0b..026733d 100644 --- a/lib/gpc/forest.cpp +++ b/lib/gpc/forest.cpp @@ -701,8 +701,7 @@ std::pair, std::vector> Forest::matchAdaptive( for (size_t i = 0; i < sStates.size(); ++i) { // Prefetch an element roughly 16 iterations ahead (adjust based on testing) /* - * This didn't help anymore. So either compiler already optimized this or - * we are compute bound. + * This didn't help anymore. 
* if (i + 16 < sStates.size()) { __builtin_prefetch(&sStates[i + 16], 0, 3); __builtin_prefetch(&sIdxs[i + 16], 0, 3); @@ -1069,6 +1068,7 @@ std::vector Forest::depthPriorFast( PreprocessedImage& tar, FilterMask& fastmask, InferenceSettings& settings) { + std::chrono::high_resolution_clock::time_point t0, t1; std::vector statesSrc = evalFastMaskOnSubsetSSE( src.smooth, src.grad, src.mask, fastmask, settings); std::vector statesTar = evalFastMaskOnSubsetSSE( @@ -1081,8 +1081,12 @@ std::vector Forest::depthPriorFast( } // Use sort method for matching if (settings.useHashtable_ == false) { + t0 = sysTick(); std::vector corr = findCorrespondences(statesSrc, statesTar); + t1 = sysTick(); + std::cout << "findCorrespondences (without allocation): " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl; + std::cout << "length src: " << statesSrc.size() << std::endl; return corr; } // Use hashtable matching @@ -1519,11 +1523,9 @@ std::vector Forest::stereoMatch(PreprocessedImage& simg, "Targe Image: dimension does not fit dimension of supplied forest " "mask"); bool m_debug = false; - std::chrono::high_resolution_clock::time_point t0, t1; // Match std::vector corr = depthPriorFast(simg, timg, forestmask, settings); - t1 = sysTick(); return corr; } diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp index 85070a7..3b94d9f 100644 --- a/samples/sparsematch.cpp +++ b/samples/sparsematch.cpp @@ -67,4 +67,11 @@ int main(int argc, char** argv) { std::cout << "Number of matches: " << supp.size() << std::endl; std::cout << "Preprocessing time: " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl; std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms" << std::endl; + std::vector statesSrc = forest.evalFastMaskOnSubsetSSE( + simgP.smooth, simgP.grad, simgP.mask, fm, inferencesettings); + std::vector statesTar = forest.evalFastMaskOnSubsetSSE( + timgP.smooth, timgP.grad, timgP.mask, fm, inferencesettings); + 
ndb::Descriptor::serialize("statesSrc.txt", statesSrc); + ndb::Descriptor::serialize("statesTar.txt", statesTar); + } From 31bd8a227987fcaf46059861ead5b2b807e26628 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 8 Mar 2026 16:12:40 +0100 Subject: [PATCH 29/36] add naive hash match version --- benchmarks/correspondence_bench.cpp | 19 ++++++++++ lib/gpc/buffer.hpp | 2 +- lib/gpc/forest.cpp | 54 +++++++++++++++++++++++++---- lib/gpc/forest.hpp | 3 ++ 4 files changed, 70 insertions(+), 8 deletions(-) diff --git a/benchmarks/correspondence_bench.cpp b/benchmarks/correspondence_bench.cpp index e16eb9b..fe4139f 100644 --- a/benchmarks/correspondence_bench.cpp +++ b/benchmarks/correspondence_bench.cpp @@ -86,6 +86,23 @@ static void matchBySorting( benchmark::ClobberMemory(); } } +static void matchByHashingNaive( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + for (auto _ : state) { + state.PauseTiming(); + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + std::vector + matches = gpc::inference::Forest::findCorrespondencesHashNaive(src, tar); + + state.counters["matches"] = matches.size(); + benchmark::DoNotOptimize(matches); + benchmark::ClobberMemory(); + } +} static void matchByHashing( benchmark::State& state) { std::vector srcOriginal = getSrcDescriptors(); @@ -325,6 +342,8 @@ static void matchPipelinedBranchlessPreallocateSingleSlab( } BENCHMARK(matchBySorting) ->Unit(benchmark::kMillisecond); +BENCHMARK(matchByHashingNaive) + ->Unit(benchmark::kMillisecond); BENCHMARK(matchByHashing) ->Unit(benchmark::kMillisecond); BENCHMARK(matchPreparedFrames) diff --git a/lib/gpc/buffer.hpp b/lib/gpc/buffer.hpp index 26e261a..903a507 100644 --- a/lib/gpc/buffer.hpp +++ b/lib/gpc/buffer.hpp @@ -128,7 +128,7 @@ struct Descriptor { d.state = std::stoull(state_str); d.srcDescr = srcDescr; - if (d.point.y > 200 && d.point.y < 400) + //if (d.point.y > 200 && 
d.point.y < 400) result.push_back(d); } } diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp index 026733d..e95fea9 100644 --- a/lib/gpc/forest.cpp +++ b/lib/gpc/forest.cpp @@ -53,6 +53,7 @@ #include "gpc/kernels/utils.hpp" #include "gpc/hashmatch.hpp" #include "gpc/forest.hpp" +#include namespace gpc { @@ -421,12 +422,6 @@ void Forest::matchPipelinedBranchlessPreallocate( std::vector& resultSrc, std::vector& resultTar) { - //std::pair, std::vector> result; - // For 100M items, we might find more matches; - // adjusting reserve to prevent mid-run reallocations. - //result.first.reserve(src.statesSlab.size() / 100); - //result.second.reserve(src.statesSlab.size() / 100); - struct Slot { uint64_t key; uint32_t idx; @@ -1140,7 +1135,52 @@ std::vector Forest::findCorrespondences( } return corr; } -#include +std::vector Forest::findCorrespondencesHashNaive( + std::vector& srcStates, + std::vector& tarStates) { + + std::vector corr; + struct DescriptorHasher { + std::size_t operator()(const ndb::Descriptor& d) const { + // Just return the state since it's already a unique-ish 64-bit int + return static_cast(d.state); + } + }; + // 1. Count frequencies in Source + std::unordered_map srcCounts; + std::unordered_map tarCounts; + for (const auto& d : srcStates) { + srcCounts[d]++; + } + + // 2. Count frequencies in Target + for (const auto& d : tarStates) { + tarCounts[d]++; + } + + // 3. Match only if the descriptor is unique in both (count == 1) + // We iterate through srcStates to maintain a similar "order" or + // simply to find potential matches. + for (const auto& srcDesc : srcStates) { + // Is it unique in Source? + if (srcCounts[srcDesc] == 1) { + // Does it exist and is it unique in Target? + if (tarCounts.count(srcDesc) && tarCounts[srcDesc] == 1) { + + // We need the actual target object to get the 'point' + // In a naive way, we just go find it. 
+ for (const auto& tarDesc : tarStates) { + if (tarDesc == srcDesc) { + corr.push_back(ndb::Correspondence(srcDesc.point, tarDesc.point)); + break; + } + } + } + } + } + + return corr; +} // State machine for our IDs enum class State : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 }; diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp index 9f48c87..559a5c3 100644 --- a/lib/gpc/forest.hpp +++ b/lib/gpc/forest.hpp @@ -241,6 +241,9 @@ class Forest { static std::vector findCorrespondences( std::vector& srcStates, std::vector& tarStates); + static std::vector findCorrespondencesHashNaive( + std::vector& srcStates, + std::vector& tarStates); static std::vector findCorrespondencesHash( std::vector& srcStates, std::vector& tarStates); From 46b8d5d6067eb95d453eddd71eff52210f96c7d4 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Sun, 15 Mar 2026 20:14:09 +0100 Subject: [PATCH 30/36] add additional matching method --- benchmarks/correspondence_bench.cpp | 47 +++++++++++-- lib/gpc/forest.cpp | 104 ++++++++++++++++++++++++++++ lib/gpc/forest.hpp | 8 +++ tests/test_single_matching.cpp | 37 ++++++++++ 4 files changed, 190 insertions(+), 6 deletions(-) diff --git a/benchmarks/correspondence_bench.cpp b/benchmarks/correspondence_bench.cpp index fe4139f..d7655f2 100644 --- a/benchmarks/correspondence_bench.cpp +++ b/benchmarks/correspondence_bench.cpp @@ -49,13 +49,13 @@ std::vector generate_pareto_ids(size_t count, double target_mea return ids; } std::vector getSrcDescriptors() { - return ndb::Descriptor::deserialize("statesSrc.txt", true); - //return generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000 + //return ndb::Descriptor::deserialize("statesSrc.txt", true); + return generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000 } std::vector getTarDescriptors() { - return ndb::Descriptor::deserialize("statesTar.txt", false); - //return generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000 + //return 
ndb::Descriptor::deserialize("statesTar.txt", false); + return generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000 } std::vector generate_unique_ids(size_t count) { std::vector ids; @@ -340,12 +340,45 @@ static void matchPipelinedBranchlessPreallocateSingleSlab( benchmark::ClobberMemory(); } } +static void matchPipelinedBranchlessPreallocateSingleSlabUnordered( + benchmark::State& state) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + + gpc::inference::SoAFramePersistentSingleSlab srcFrame, tarFrame; + srcFrame.preallocate(srcOriginal.size()); // size known + tarFrame.preallocate(tarOriginal.size()); + std::vector resultSrc, resultTar; + resultSrc.reserve(srcOriginal.size()/10); + resultTar.reserve(tarOriginal.size()/10); + for (auto _ : state) { + state.PauseTiming(); + resultSrc.clear(); + resultTar.clear(); + // 1. Measure Prepare + // 2M: 5.7ms, 20M: 57ms + std::vector src = srcOriginal; + std::vector tar = tarOriginal; + state.ResumeTiming(); + gpc::inference::Forest::prepareSoAFramesPersistentSingleSlabUnordered(src, tar, srcFrame, tarFrame); + + // 2. 
Measure Match + // 2M: 5.3ms , 20M: 53ms + gpc::inference::Forest::matchPipelinedBranchlessPreallocateSingleSlabUnordered(srcFrame, tarFrame, resultSrc, resultTar); + + state.counters["matches"] = resultSrc.size(); + + benchmark::DoNotOptimize(resultSrc); + benchmark::DoNotOptimize(resultTar); + benchmark::ClobberMemory(); + } +} BENCHMARK(matchBySorting) ->Unit(benchmark::kMillisecond); -BENCHMARK(matchByHashingNaive) - ->Unit(benchmark::kMillisecond); BENCHMARK(matchByHashing) ->Unit(benchmark::kMillisecond); +BENCHMARK(matchByHashingNaive) + ->Unit(benchmark::kMillisecond); BENCHMARK(matchPreparedFrames) ->Unit(benchmark::kMillisecond); BENCHMARK(matchPreparedFramesFaster) @@ -369,4 +402,6 @@ BENCHMARK(matchPipelinedBranchlessPreallocate) ->Unit(benchmark::kMillisecond); BENCHMARK(matchPipelinedBranchlessPreallocateSingleSlab) ->Unit(benchmark::kMillisecond); +BENCHMARK(matchPipelinedBranchlessPreallocateSingleSlabUnordered) + ->Unit(benchmark::kMillisecond); BENCHMARK_MAIN(); diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp index e95fea9..5ebcbf0 100644 --- a/lib/gpc/forest.cpp +++ b/lib/gpc/forest.cpp @@ -58,6 +58,110 @@ namespace gpc { namespace inference { +void Forest::prepareSoAFramesPersistentSingleSlabUnordered( + std::vector& srcStates, + std::vector& tarStates, + SoAFramePersistentSingleSlab& srcFrame, + SoAFramePersistentSingleSlab& tarFrame) { + + uint32_t srcCounts[256] = {0}, tarCounts[256] = {0}; + for (const auto& s : srcStates) { + srcCounts[s.state & 0xFF]++; + } + for (const auto& t : tarStates) { + tarCounts[t.state & 0xFF]++; + } + + StateIdx* sP = srcFrame.slab.data(); + StateIdx* tP = tarFrame.slab.data(); + for (int i = 0; i < 256; ++i) { + srcFrame.bucketData[i] = sP; + srcFrame.bucketSizes[i] = srcCounts[i]; + tarFrame.bucketData[i] = tP; + tarFrame.bucketSizes[i] = tarCounts[i]; + sP += srcCounts[i]; tP += tarCounts[i]; + } + + uint32_t sW[256] = {0}, tW[256] = {0}; + // FIX: Split into two independent loops + for (uint32_t i = 0; 
i < (uint32_t)srcStates.size(); ++i) { + uint64_t sv = srcStates[i].state; + srcFrame.bucketData[sv & 0xFF][sW[sv & 0xFF]++] = {sv, i}; + } + for (uint32_t i = 0; i < (uint32_t)tarStates.size(); ++i) { + uint64_t tv = tarStates[i].state; + tarFrame.bucketData[tv & 0xFF][tW[tv & 0xFF]++] = {tv, i}; + } +} +void Forest::matchPipelinedBranchlessPreallocateSingleSlabUnordered( + SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar, + std::vector& outS, std::vector& outT) { + + struct Slot { + uint64_t key; + uint32_t idx; + uint32_t gen; + uint32_t count; + uint32_t outIdx; // FIX: Track where the match was written in the output vectors + }; + static std::vector table(16384, {0, 0, 0, 0, 0}); + static uint32_t currentGen = 1; + + for (int b = 0; b < 256; ++b) { + StateIdx* sData = src.bucketData[b]; + uint32_t sSize = src.bucketSizes[b]; + if (sSize == 0) continue; + + const uint32_t mask = (sSize < 1000) ? 2047 : 16383; + const uint32_t shift = (sSize < 1000) ? 53 : 50; + currentGen++; + + for (uint32_t i = 0; i < sSize; ++i) { + uint64_t k = sData[i].state; + uint32_t h = (k * 11400714819323198485llu) >> shift; + h &= mask; + while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask; + if (table[h].gen != currentGen) table[h] = {k, sData[i].index, currentGen, 1, 0}; + else table[h].count++; + } + + StateIdx* tData = tar.bucketData[b]; + uint32_t tSize = tar.bucketSizes[b]; + for (uint32_t i = 0; i < tSize; ++i) { + uint64_t k = tData[i].state; + uint32_t h = (k * 11400714819323198485llu) >> shift; + h &= mask; + while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask; + + if (table[h].gen == currentGen && table[h].key == k) { + if (table[h].count == 1) { + // Unique source, first target match + table[h].outIdx = outS.size(); // Remember the index + outS.push_back(table[h].idx); + outT.push_back(tData[i].index); + table[h].count = 0xFFFFFFFF; // Mark as matched once + } else if (table[h].count == 0xFFFFFFFF) { + // 
Duplicate target found! Invalidate the previously written match. + outS[table[h].outIdx] = 0xFFFFFFFF; + outT[table[h].outIdx] = 0xFFFFFFFF; + table[h].count = 0xEEEEEEEE; // Mark as ruined + } + } + } + } + + // FIX: Final compaction pass to remove invalidated matches (sentinels) + uint32_t validCount = 0; + for (size_t i = 0; i < outS.size(); ++i) { + if (outS[i] != 0xFFFFFFFF) { + outS[validCount] = outS[i]; + outT[validCount] = outT[i]; + validCount++; + } + } + outS.resize(validCount); + outT.resize(validCount); +} void Forest::prepareSoAFramesPersistentSingleSlab( std::vector& srcStates, std::vector& tarStates, diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp index 559a5c3..9e45600 100644 --- a/lib/gpc/forest.hpp +++ b/lib/gpc/forest.hpp @@ -308,6 +308,14 @@ static void matchPipelinedBranchlessPreallocateSingleSlab( +static void prepareSoAFramesPersistentSingleSlabUnordered( + std::vector& srcStates, + std::vector& tarStates, + SoAFramePersistentSingleSlab& srcFrame, + SoAFramePersistentSingleSlab& tarFrame); +static void matchPipelinedBranchlessPreallocateSingleSlabUnordered( + SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar, + std::vector& outS, std::vector& outT); /** diff --git a/tests/test_single_matching.cpp b/tests/test_single_matching.cpp index e675a7c..2893b01 100644 --- a/tests/test_single_matching.cpp +++ b/tests/test_single_matching.cpp @@ -52,5 +52,42 @@ TEST(Approval, Inference) EXPECT_EQ(866, supp.size()); ApprovalTests::Approvals::verify(ss.str()); } +std::vector getSrcDescriptors() { + return ndb::Descriptor::deserialize("statesSrc.txt", true); +} + +std::vector getTarDescriptors() { + return ndb::Descriptor::deserialize("statesTar.txt", false); +} +TEST(A,B) { + std::vector srcOriginal = getSrcDescriptors(); + std::vector tarOriginal = getTarDescriptors(); + std::vector srcBaseline = srcOriginal; + std::vector tarBaseline = tarOriginal; + std::vector srcAlt = srcOriginal; + std::vector tarAlt = tarOriginal; + + // 
Baseline + // To write a test for this we'd actually need to get the ids of the sources back, not just the final matches. + std::vector + matches = gpc::inference::Forest::findCorrespondences(srcBaseline, tarBaseline); + + + // Alternative method + gpc::inference::SoAFramePersistentSingleSlab srcFrame, tarFrame; + srcFrame.preallocate(srcOriginal.size()); // size known + tarFrame.preallocate(tarOriginal.size()); + + std::vector resultSrc, resultTar; + resultSrc.reserve(srcOriginal.size()/10); + resultTar.reserve(tarOriginal.size()/10); + gpc::inference::Forest::prepareSoAFramesPersistentSingleSlabUnordered(srcAlt, tarAlt, srcFrame, tarFrame); + gpc::inference::Forest::matchPipelinedBranchlessPreallocateSingleSlabUnordered(srcFrame, tarFrame, resultSrc, resultTar); + + // Ensure ID pairings of (resultSrc, resultTar) match the naive version. + // We ignore exact matching for now and just expect the count to be the same + EXPECT_EQ(matches.size(), resultSrc.size()); + EXPECT_EQ(matches.size(), resultTar.size()); +} From 62807c6c5aaa1d8236ed91a34d89c87e2c74aba0 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Tue, 7 Apr 2026 07:40:49 +0200 Subject: [PATCH 31/36] move individual HT benchmarks to separate repo --- benchmarks/correspondence_bench.cpp | 358 +------ benchmarks/sobel_bench.cpp | 2 +- lib/gpc/forest.cpp | 1388 +-------------------------- lib/gpc/forest.hpp | 119 --- lib/gpc/kernels/box.cpp | 1 - lib/gpc/kernels/sobel.cpp | 1 - samples/sparsematch.cpp | 35 +- tests/test_single_matching.cpp | 4 +- 8 files changed, 54 insertions(+), 1854 deletions(-) diff --git a/benchmarks/correspondence_bench.cpp b/benchmarks/correspondence_bench.cpp index d7655f2..737d4dc 100644 --- a/benchmarks/correspondence_bench.cpp +++ b/benchmarks/correspondence_bench.cpp @@ -8,15 +8,6 @@ #define NUM_ELEMENTS 262668 //10*1224*375 //1024*1024 -/* Remaining ideas - * -USE ILP: Parallel Radix Partitioning (Even on one core, using a single-pass shuffle). - * - Didn't speed up. 
was same as matchPreparedFramesFaster - * - Assuming that the bottleneck is the hash table probes, hence: look into bloom filters... - * -Blocked Bloom Filter to discard non-matches in L1. - * - Faster at 1M, slower at 100K and 10M - * -SIMD-Probed Flat Table (checking 4 slots at once). - * -Manual Prefetching of the next bucket's data. - * */ /** * Generates a reproducible Pareto-distributed vector. * @param count Number of IDs to generate. @@ -49,23 +40,28 @@ std::vector generate_pareto_ids(size_t count, double target_mea return ids; } std::vector getSrcDescriptors() { - //return ndb::Descriptor::deserialize("statesSrc.txt", true); - return generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000 + std::vector v = ndb::Descriptor::deserialize("statesSrcLarge.txt", true); + std::vector out; + for (size_t i = 0; i < v.size(); i++) { + if (v[i].point.y % 5 == 0 && (v[i].state & 0xFFFFFFFF) != 0) { + out.push_back(v[i]); + } + } + return out; + //return generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000 } std::vector getTarDescriptors() { - //return ndb::Descriptor::deserialize("statesTar.txt", false); - return generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000 -} -std::vector generate_unique_ids(size_t count) { - std::vector ids; - ids.reserve(count); - - - for (size_t i = 0; i < count; ++i) { - ids.push_back(ndb::Descriptor(ndb::Point(0,0), static_cast(i))); + std::vector v = ndb::Descriptor::deserialize("statesTarLarge.txt", false); + std::vector out; + for (size_t i = 0; i < v.size(); i++) { + if (v[i].point.y % 5 == 0 && (v[i].state & 0xFFFFFFFF) != 0) { + out.push_back(v[i]); + } } - return ids; + return out; + + //return generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000 } static void matchBySorting( benchmark::State& state) { @@ -80,328 +76,10 @@ static void matchBySorting( matches = gpc::inference::Forest::findCorrespondences(src, tar); state.counters["matches"] = 
matches.size(); - //state.counters["candidates_t"] = timgP.mask.size(); - //state.counters["matches"] = supp.size(); - benchmark::DoNotOptimize(matches); - benchmark::ClobberMemory(); - } -} -static void matchByHashingNaive( - benchmark::State& state) { - std::vector srcOriginal = getSrcDescriptors(); - std::vector tarOriginal = getTarDescriptors(); - for (auto _ : state) { - state.PauseTiming(); - std::vector src = srcOriginal; - std::vector tar = tarOriginal; - state.ResumeTiming(); - std::vector - matches = gpc::inference::Forest::findCorrespondencesHashNaive(src, tar); - - state.counters["matches"] = matches.size(); - benchmark::DoNotOptimize(matches); - benchmark::ClobberMemory(); - } -} -static void matchByHashing( - benchmark::State& state) { - std::vector srcOriginal = getSrcDescriptors(); - std::vector tarOriginal = getTarDescriptors(); - for (auto _ : state) { - state.PauseTiming(); - std::vector src = srcOriginal; - std::vector tar = tarOriginal; - state.ResumeTiming(); - std::vector - matches = gpc::inference::Forest::findCorrespondencesTurbo(src, tar); - - state.counters["matches"] = matches.size(); - //state.counters["candidates_t"] = timgP.mask.size(); - //state.counters["matches"] = supp.size(); - benchmark::DoNotOptimize(matches); - benchmark::ClobberMemory(); - } -} -static void matchPreparedFrames( - benchmark::State& state) { - std::vector srcOriginal = getSrcDescriptors(); - std::vector tarOriginal = getTarDescriptors(); - for (auto _ : state) { - state.PauseTiming(); - std::vector src = srcOriginal; - std::vector tar = tarOriginal; - state.ResumeTiming(); - auto v = gpc::inference::Forest::prepareSoAFrames(src, tar); - auto matches = gpc::inference::Forest::matchPreparedFrames(v.first, v.second); - - state.counters["matches"] = matches.first.size(); - //state.counters["candidates_t"] = timgP.mask.size(); - //state.counters["matches"] = supp.size(); benchmark::DoNotOptimize(matches); benchmark::ClobberMemory(); } } -static void 
matchPreparedFramesFaster( - benchmark::State& state) { - std::vector srcOriginal = getSrcDescriptors(); - std::vector tarOriginal = getTarDescriptors(); - for (auto _ : state) { - state.PauseTiming(); - std::vector src = srcOriginal; - std::vector tar = tarOriginal; - state.ResumeTiming(); - auto v = gpc::inference::Forest::prepareSoAFrames(src, tar); - //auto matches = gpc::inference::Forest::matchPreparedFramesFaster(v.first, v.second); - auto matches = gpc::inference::Forest::matchParallelRadixPartitioning(v.first,v.second); - state.counters["matches"] = matches.first.size(); - //state.counters["candidates_t"] = timgP.mask.size(); - //state.counters["matches"] = supp.size(); - benchmark::DoNotOptimize(matches); - benchmark::ClobberMemory(); - } -} -static void matchParallelRadixPartitioning( - benchmark::State& state) { - std::vector srcOriginal = getSrcDescriptors(); - std::vector tarOriginal = getTarDescriptors(); - for (auto _ : state) { - state.PauseTiming(); - std::vector src = srcOriginal; - std::vector tar = tarOriginal; - state.ResumeTiming(); - auto v = gpc::inference::Forest::prepareSoAFrames(src,tar); - auto matches = gpc::inference::Forest::matchParallelRadixPartitioning(v.first, v.second); - - state.counters["matches"] = matches.first.size(); - //state.counters["candidates_t"] = timgP.mask.size(); - //state.counters["matches"] = supp.size(); - benchmark::DoNotOptimize(matches); - benchmark::ClobberMemory(); - } -} -static void matchBlockedBloom( - benchmark::State& state) { - std::vector srcOriginal = getSrcDescriptors(); - std::vector tarOriginal = getTarDescriptors(); - - for (auto _ : state) { - state.PauseTiming(); - std::vector src = srcOriginal; - std::vector tar = tarOriginal; - state.ResumeTiming(); - auto v = gpc::inference::Forest::prepareSoAFrames(src,tar); - auto matches = gpc::inference::Forest::matchBlockedBloom(v.first, v.second); - - state.counters["matches"] = matches.first.size(); - benchmark::DoNotOptimize(matches); - 
benchmark::ClobberMemory(); - } -} -static void matchAdaptive( - benchmark::State& state) { - std::vector srcOriginal = getSrcDescriptors(); - std::vector tarOriginal = getTarDescriptors(); - for (auto _ : state) { - state.PauseTiming(); - std::vector src = srcOriginal; - std::vector tar = tarOriginal; - state.ResumeTiming(); - auto v = gpc::inference::Forest::prepareSoAFrames(src,tar); - auto matches = gpc::inference::Forest::matchAdaptive(v.first, v.second); - - state.counters["matches"] = matches.first.size(); - benchmark::DoNotOptimize(matches); - benchmark::ClobberMemory(); - } -} -/* -static void matchAdaptiveNeon( - benchmark::State& state) { - std::vector src, tar; - src = generate_pareto_ids(NUM_ELEMENTS, 1000.0, 42); // 1M IDs with mean ~1000 - tar = generate_pareto_ids(NUM_ELEMENTS, 1001.0, 42); // 1M IDs with mean ~1000 - - for (auto _ : state) { - auto v = gpc::inference::Forest::prepareSoAFrames(src, tar); - auto matches = gpc::inference::Forest::matchAdaptiveNeon(v.first, v.second); - - state.counters["matches"] = matches.first.size(); - //state.counters["candidates_t"] = timgP.mask.size(); - //state.counters["matches"] = supp.size(); - benchmark::DoNotOptimize(matches); - benchmark::ClobberMemory(); - } -} -*/ -static void matchAdaptivePersistent( - benchmark::State& state) { - std::vector srcOriginal = getSrcDescriptors(); - std::vector tarOriginal = getTarDescriptors(); - gpc::inference::SoAFramePersistent srcFrame, tarFrame; - srcFrame.preallocate(srcOriginal.size()); // size known - tarFrame.preallocate(tarOriginal.size()); - for (auto _ : state) { - state.PauseTiming(); - std::vector src = srcOriginal; - std::vector tar = tarOriginal; - state.ResumeTiming(); - - gpc::inference::Forest::prepareSoAFramesPersistent(src, tar, srcFrame, tarFrame); - auto matches = gpc::inference::Forest::matchAdaptivePersistent(srcFrame, tarFrame); - - state.counters["matches"] = matches.first.size(); - benchmark::DoNotOptimize(matches); - 
benchmark::ClobberMemory(); - } -} -static void matchPipelinedBranchless( - benchmark::State& state) { - std::vector srcOriginal = getSrcDescriptors(); - std::vector tarOriginal = getTarDescriptors(); - gpc::inference::SoAFramePersistent srcFrame, tarFrame; - srcFrame.preallocate(srcOriginal.size()); // size known - tarFrame.preallocate(tarOriginal.size()); - for (auto _ : state) { - state.PauseTiming(); - std::vector src = srcOriginal; - std::vector tar = tarOriginal; - state.ResumeTiming(); - - gpc::inference::Forest::prepareSoAFramesPersistent(src, tar, srcFrame, tarFrame); - auto matches = gpc::inference::Forest::matchPipelinedBranchless(srcFrame, tarFrame); - benchmark::DoNotOptimize(matches); - benchmark::ClobberMemory(); - } -} -static void matchPipelinedBranchlessPreallocate( - benchmark::State& state) { - std::vector srcOriginal = getSrcDescriptors(); - std::vector tarOriginal = getTarDescriptors(); - gpc::inference::SoAFramePersistent srcFrame, tarFrame; - srcFrame.preallocate(srcOriginal.size()); // size known - tarFrame.preallocate(tarOriginal.size()); - std::vector resultSrc, resultTar; - resultSrc.reserve(srcOriginal.size()/100); - resultTar.reserve(tarOriginal.size()/100); - for (auto _ : state) { - state.PauseTiming(); - resultSrc.clear(); - resultTar.clear(); - std::vector src = srcOriginal; - std::vector tar = tarOriginal; - state.ResumeTiming(); - - // 1. Measure Prepare - // 2M: 5.7ms, 20M: 57ms - gpc::inference::Forest::prepareSoAFramesPersistent(src, tar, srcFrame, tarFrame); - - // 2. 
Measure Match - // 2M: 5.3ms , 20M: 53ms - gpc::inference::Forest::matchPipelinedBranchlessPreallocate(srcFrame, tarFrame, resultSrc, resultTar); - - state.counters["matches"] = resultSrc.size(); - - benchmark::DoNotOptimize(resultSrc); - benchmark::DoNotOptimize(resultTar); - benchmark::ClobberMemory(); - } -} - -static void matchPipelinedBranchlessPreallocateSingleSlab( - benchmark::State& state) { - std::vector srcOriginal = getSrcDescriptors(); - std::vector tarOriginal = getTarDescriptors(); - - gpc::inference::SoAFramePersistentSingleSlab srcFrame, tarFrame; - srcFrame.preallocate(srcOriginal.size()); // size known - tarFrame.preallocate(tarOriginal.size()); - std::vector resultSrc, resultTar; - resultSrc.reserve(srcOriginal.size()/10); - resultTar.reserve(tarOriginal.size()/10); - for (auto _ : state) { - state.PauseTiming(); - resultSrc.clear(); - resultTar.clear(); - // 1. Measure Prepare - // 2M: 5.7ms, 20M: 57ms - std::vector src = srcOriginal; - std::vector tar = tarOriginal; - state.ResumeTiming(); - gpc::inference::Forest::prepareSoAFramesPersistentSingleSlab(src, tar, srcFrame, tarFrame); - - // 2. 
Measure Match - // 2M: 5.3ms , 20M: 53ms - gpc::inference::Forest::matchPipelinedBranchlessPreallocateSingleSlab(srcFrame, tarFrame, resultSrc, resultTar); - - state.counters["matches"] = resultSrc.size(); - - benchmark::DoNotOptimize(resultSrc); - benchmark::DoNotOptimize(resultTar); - benchmark::ClobberMemory(); - } -} -static void matchPipelinedBranchlessPreallocateSingleSlabUnordered( - benchmark::State& state) { - std::vector srcOriginal = getSrcDescriptors(); - std::vector tarOriginal = getTarDescriptors(); - - gpc::inference::SoAFramePersistentSingleSlab srcFrame, tarFrame; - srcFrame.preallocate(srcOriginal.size()); // size known - tarFrame.preallocate(tarOriginal.size()); - std::vector resultSrc, resultTar; - resultSrc.reserve(srcOriginal.size()/10); - resultTar.reserve(tarOriginal.size()/10); - for (auto _ : state) { - state.PauseTiming(); - resultSrc.clear(); - resultTar.clear(); - // 1. Measure Prepare - // 2M: 5.7ms, 20M: 57ms - std::vector src = srcOriginal; - std::vector tar = tarOriginal; - state.ResumeTiming(); - gpc::inference::Forest::prepareSoAFramesPersistentSingleSlabUnordered(src, tar, srcFrame, tarFrame); - - // 2. 
Measure Match - // 2M: 5.3ms , 20M: 53ms - gpc::inference::Forest::matchPipelinedBranchlessPreallocateSingleSlabUnordered(srcFrame, tarFrame, resultSrc, resultTar); - - state.counters["matches"] = resultSrc.size(); - - benchmark::DoNotOptimize(resultSrc); - benchmark::DoNotOptimize(resultTar); - benchmark::ClobberMemory(); - } -} BENCHMARK(matchBySorting) ->Unit(benchmark::kMillisecond); -BENCHMARK(matchByHashing) - ->Unit(benchmark::kMillisecond); -BENCHMARK(matchByHashingNaive) - ->Unit(benchmark::kMillisecond); -BENCHMARK(matchPreparedFrames) - ->Unit(benchmark::kMillisecond); -BENCHMARK(matchPreparedFramesFaster) - ->Unit(benchmark::kMillisecond); -BENCHMARK(matchParallelRadixPartitioning) - ->Unit(benchmark::kMillisecond); -BENCHMARK(matchBlockedBloom) - ->Unit(benchmark::kMillisecond); -BENCHMARK(matchAdaptive) - ->Unit(benchmark::kMillisecond); -/* -BENCHMARK(matchAdaptiveNeon) - ->Unit(benchmark::kMillisecond); -*/ -BENCHMARK(matchAdaptivePersistent) - ->Unit(benchmark::kMillisecond); - -BENCHMARK(matchPipelinedBranchless) - ->Unit(benchmark::kMillisecond); -BENCHMARK(matchPipelinedBranchlessPreallocate) - ->Unit(benchmark::kMillisecond); -BENCHMARK(matchPipelinedBranchlessPreallocateSingleSlab) - ->Unit(benchmark::kMillisecond); -BENCHMARK(matchPipelinedBranchlessPreallocateSingleSlabUnordered) - ->Unit(benchmark::kMillisecond); BENCHMARK_MAIN(); diff --git a/benchmarks/sobel_bench.cpp b/benchmarks/sobel_bench.cpp index 5c26d89..490a861 100644 --- a/benchmarks/sobel_bench.cpp +++ b/benchmarks/sobel_bench.cpp @@ -40,7 +40,7 @@ static void BM_SobelNaive(benchmark::State& state) { state.SetLabel("naive"); for (auto _ : state) { - ndb::sobelNaive(in.data(), out.data(), w, h, 1); + ndb::sobelNaive(in.data(), out.data(), w, h, 50); // Ensure the compiler doesn't skip the work benchmark::DoNotOptimize(out.data()); diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp index 5ebcbf0..87e0582 100644 --- a/lib/gpc/forest.cpp +++ b/lib/gpc/forest.cpp @@ -58,1099 +58,6 
@@ namespace gpc { namespace inference { -void Forest::prepareSoAFramesPersistentSingleSlabUnordered( - std::vector& srcStates, - std::vector& tarStates, - SoAFramePersistentSingleSlab& srcFrame, - SoAFramePersistentSingleSlab& tarFrame) { - - uint32_t srcCounts[256] = {0}, tarCounts[256] = {0}; - for (const auto& s : srcStates) { - srcCounts[s.state & 0xFF]++; - } - for (const auto& t : tarStates) { - tarCounts[t.state & 0xFF]++; - } - - StateIdx* sP = srcFrame.slab.data(); - StateIdx* tP = tarFrame.slab.data(); - for (int i = 0; i < 256; ++i) { - srcFrame.bucketData[i] = sP; - srcFrame.bucketSizes[i] = srcCounts[i]; - tarFrame.bucketData[i] = tP; - tarFrame.bucketSizes[i] = tarCounts[i]; - sP += srcCounts[i]; tP += tarCounts[i]; - } - - uint32_t sW[256] = {0}, tW[256] = {0}; - // FIX: Split into two independent loops - for (uint32_t i = 0; i < (uint32_t)srcStates.size(); ++i) { - uint64_t sv = srcStates[i].state; - srcFrame.bucketData[sv & 0xFF][sW[sv & 0xFF]++] = {sv, i}; - } - for (uint32_t i = 0; i < (uint32_t)tarStates.size(); ++i) { - uint64_t tv = tarStates[i].state; - tarFrame.bucketData[tv & 0xFF][tW[tv & 0xFF]++] = {tv, i}; - } -} -void Forest::matchPipelinedBranchlessPreallocateSingleSlabUnordered( - SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar, - std::vector& outS, std::vector& outT) { - - struct Slot { - uint64_t key; - uint32_t idx; - uint32_t gen; - uint32_t count; - uint32_t outIdx; // FIX: Track where the match was written in the output vectors - }; - static std::vector table(16384, {0, 0, 0, 0, 0}); - static uint32_t currentGen = 1; - - for (int b = 0; b < 256; ++b) { - StateIdx* sData = src.bucketData[b]; - uint32_t sSize = src.bucketSizes[b]; - if (sSize == 0) continue; - - const uint32_t mask = (sSize < 1000) ? 2047 : 16383; - const uint32_t shift = (sSize < 1000) ? 
53 : 50; - currentGen++; - - for (uint32_t i = 0; i < sSize; ++i) { - uint64_t k = sData[i].state; - uint32_t h = (k * 11400714819323198485llu) >> shift; - h &= mask; - while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask; - if (table[h].gen != currentGen) table[h] = {k, sData[i].index, currentGen, 1, 0}; - else table[h].count++; - } - - StateIdx* tData = tar.bucketData[b]; - uint32_t tSize = tar.bucketSizes[b]; - for (uint32_t i = 0; i < tSize; ++i) { - uint64_t k = tData[i].state; - uint32_t h = (k * 11400714819323198485llu) >> shift; - h &= mask; - while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask; - - if (table[h].gen == currentGen && table[h].key == k) { - if (table[h].count == 1) { - // Unique source, first target match - table[h].outIdx = outS.size(); // Remember the index - outS.push_back(table[h].idx); - outT.push_back(tData[i].index); - table[h].count = 0xFFFFFFFF; // Mark as matched once - } else if (table[h].count == 0xFFFFFFFF) { - // Duplicate target found! Invalidate the previously written match. 
- outS[table[h].outIdx] = 0xFFFFFFFF; - outT[table[h].outIdx] = 0xFFFFFFFF; - table[h].count = 0xEEEEEEEE; // Mark as ruined - } - } - } - } - - // FIX: Final compaction pass to remove invalidated matches (sentinels) - uint32_t validCount = 0; - for (size_t i = 0; i < outS.size(); ++i) { - if (outS[i] != 0xFFFFFFFF) { - outS[validCount] = outS[i]; - outT[validCount] = outT[i]; - validCount++; - } - } - outS.resize(validCount); - outT.resize(validCount); -} -void Forest::prepareSoAFramesPersistentSingleSlab( - std::vector& srcStates, - std::vector& tarStates, - SoAFramePersistentSingleSlab& srcFrame, - SoAFramePersistentSingleSlab& tarFrame) { - - uint32_t srcCounts[256] = {0}, tarCounts[256] = {0}; - for (const auto& s : srcStates) srcCounts[s.state & 0xFF]++; - for (const auto& t : tarStates) tarCounts[t.state & 0xFF]++; - - StateIdx* sP = srcFrame.slab.data(); - StateIdx* tP = tarFrame.slab.data(); - for (int i = 0; i < 256; ++i) { - srcFrame.bucketData[i] = sP; - srcFrame.bucketSizes[i] = srcCounts[i]; - tarFrame.bucketData[i] = tP; - tarFrame.bucketSizes[i] = tarCounts[i]; - sP += srcCounts[i]; tP += tarCounts[i]; - } - - uint32_t sW[256] = {0}, tW[256] = {0}; - for (uint32_t i = 0; i < (uint32_t)srcStates.size(); ++i) { - uint64_t sv = srcStates[i].state; - uint64_t tv = tarStates[i].state; - srcFrame.bucketData[sv & 0xFF][sW[sv & 0xFF]++] = {sv, i}; - tarFrame.bucketData[tv & 0xFF][tW[tv & 0xFF]++] = {tv, i}; - } -} -void Forest::prepareSoAFramesPersistent( - std::vector& srcStates, - std::vector& tarStates, - SoAFramePersistent& srcFrame, - SoAFramePersistent& tarFrame) { - assert(srcStates.size() == tarStates.size()); - assert(srcStates.size() <= 256 * 16384); // limit for max unique items in our table design -/* - // This is only slightly slower than the bit below. - const uint32_t BUCKET_COUNT = 256; - const uint64_t BUCKET_MASK = 0xFF; - - // 1. 
Histogram (To find bucket boundaries) - uint32_t srcCounts[BUCKET_COUNT] = {0}; - uint32_t tarCounts[BUCKET_COUNT] = {0}; - for (const auto& s : srcStates) srcCounts[s.state & BUCKET_MASK]++; - for (const auto& t : tarStates) tarCounts[t.state & BUCKET_MASK]++; - - // 2. Setup Bucket Pointers into the Slab - // We treat the slab like a custom allocator - uint64_t* srcPtr = srcFrame.statesSlab.data(); - uint32_t* srcIdxPtr = srcFrame.indicesSlab.data(); - uint64_t* tarPtr = tarFrame.statesSlab.data(); - uint32_t* tarIdxPtr = tarFrame.indicesSlab.data(); - - for (uint32_t i = 0; i < BUCKET_COUNT; ++i) { - srcFrame.bucketStates[i] = srcPtr; - srcFrame.bucketIndices[i] = srcIdxPtr; - srcFrame.bucketSizes[i] = srcCounts[i]; - - tarFrame.bucketStates[i] = tarPtr; - tarFrame.bucketIndices[i] = tarIdxPtr; - tarFrame.bucketSizes[i] = tarCounts[i]; - - srcPtr += srcCounts[i]; - srcIdxPtr += srcCounts[i]; - tarPtr += tarCounts[i]; - tarIdxPtr += tarCounts[i]; - } - - // 3. The "Pure Scatter" (No push_back, no resize, no zeroing) - uint32_t srcWriteIdx[BUCKET_COUNT] = {0}; - uint32_t tarWriteIdx[BUCKET_COUNT] = {0}; - - for (uint32_t i = 0; i < (uint32_t)srcStates.size(); ++i) { - uint64_t s = srcStates[i].state; - uint32_t b = s & BUCKET_MASK; - uint32_t pos = srcWriteIdx[b]++; - srcFrame.bucketStates[b][pos] = s; - srcFrame.bucketIndices[b][pos] = i; - } - - for (uint32_t i = 0; i < (uint32_t)tarStates.size(); ++i) { - uint64_t s = tarStates[i].state; - uint32_t b = s & BUCKET_MASK; - uint32_t pos = tarWriteIdx[b]++; - tarFrame.bucketStates[b][pos] = s; - tarFrame.bucketIndices[b][pos] = i; - } - */ - const uint32_t BUCKET_COUNT = 256; - const uint64_t BUCKET_MASK = 0xFF; - - uint32_t srcCounts[BUCKET_COUNT] = {0}; - uint32_t tarCounts[BUCKET_COUNT] = {0}; - - // 1. 
Fused Histogram Pass (Assuming equal sizes as per your note) - const uint32_t totalSize = (uint32_t)srcStates.size(); - for (uint32_t i = 0; i < totalSize; ++i) { - srcCounts[srcStates[i].state & BUCKET_MASK]++; - tarCounts[tarStates[i].state & BUCKET_MASK]++; - } - - // 2. Setup Bucket Pointers (Unchanged, this is fast) - uint64_t* sP = srcFrame.statesSlab.data(); - uint32_t* sI = srcFrame.indicesSlab.data(); - uint64_t* tP = tarFrame.statesSlab.data(); - uint32_t* tI = tarFrame.indicesSlab.data(); - - for (uint32_t i = 0; i < BUCKET_COUNT; ++i) { - srcFrame.bucketStates[i] = sP; - srcFrame.bucketIndices[i] = sI; - srcFrame.bucketSizes[i] = srcCounts[i]; - tarFrame.bucketStates[i] = tP; - tarFrame.bucketIndices[i] = tI; - tarFrame.bucketSizes[i] = tarCounts[i]; - sP += srcCounts[i]; sI += srcCounts[i]; - tP += tarCounts[i]; tI += tarCounts[i]; - } - - // 3. Optimized Fused Scatter - uint32_t srcWriteIdx[BUCKET_COUNT] = {0}; - uint32_t tarWriteIdx[BUCKET_COUNT] = {0}; - - // Unroll by 2 to keep the M3's execution ports saturated - uint32_t i = 0; - for (; i + 1 < totalSize; i += 2) { - // Source pair - uint64_t s0 = srcStates[i].state; - uint64_t s1 = srcStates[i+1].state; - uint32_t bS0 = s0 & BUCKET_MASK; - uint32_t bS1 = s1 & BUCKET_MASK; - - srcFrame.bucketStates[bS0][srcWriteIdx[bS0]++] = s0; - srcFrame.bucketIndices[bS0][srcWriteIdx[bS0]-1] = i; - srcFrame.bucketStates[bS1][srcWriteIdx[bS1]++] = s1; - srcFrame.bucketIndices[bS1][srcWriteIdx[bS1]-1] = i+1; - - // Target pair - uint64_t t0 = tarStates[i].state; - uint64_t t1 = tarStates[i+1].state; - uint32_t bT0 = t0 & BUCKET_MASK; - uint32_t bT1 = t1 & BUCKET_MASK; - - tarFrame.bucketStates[bT0][tarWriteIdx[bT0]++] = t0; - tarFrame.bucketIndices[bT0][tarWriteIdx[bT0]-1] = i; - tarFrame.bucketStates[bT1][tarWriteIdx[bT1]++] = t1; - tarFrame.bucketIndices[bT1][tarWriteIdx[bT1]-1] = i+1; - } - - // Handle remainder - for (; i < totalSize; ++i) { - uint64_t s = srcStates[i].state; - uint32_t bS = s & BUCKET_MASK; 
- srcFrame.bucketStates[bS][srcWriteIdx[bS]++] = s; - srcFrame.bucketIndices[bS][srcWriteIdx[bS]-1] = i; - - uint64_t t = tarStates[i].state; - uint32_t bT = t & BUCKET_MASK; - tarFrame.bucketStates[bT][tarWriteIdx[bT]++] = t; - tarFrame.bucketIndices[bT][tarWriteIdx[bT]-1] = i; - } -} - - // Here we did allocation within the prepare. we can move that part out -std::pair Forest::prepareSoAFrames( - std::vector& srcStates, - std::vector& tarStates) { - SoAFrame srcFrame, tarFrame; - srcFrame.reserve(srcStates.size()); - tarFrame.reserve(tarStates.size()); - - const uint64_t MASK = 0xFF; - - // Distribute into buckets based on the last 8 bits of the state - for (uint32_t i = 0; i < srcStates.size(); ++i) { - uint64_t s = srcStates[i].state; - srcFrame.states[s & MASK].push_back(s); - srcFrame.indices[s & MASK].push_back(i); - } - - for (uint32_t i = 0; i < tarStates.size(); ++i) { - uint64_t s = tarStates[i].state; - tarFrame.states[s & MASK].push_back(s); - tarFrame.indices[s & MASK].push_back(i); - } - - return {srcFrame, tarFrame}; -} -void Forest::matchPipelinedBranchlessPreallocateSingleSlab( - SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar, - std::vector& outS, std::vector& outT) { - - struct Slot { - uint64_t key; // The 64-bit Descriptor/State ID - uint32_t idx; // The original global index in the Source array - uint32_t gen; // The "Generation" ID (replaces memset/clear) - uint32_t count; // The match state (0=empty, 1=unique, >1=dup, 0xFF..=matched) - }; - static std::vector table(16384, {0, 0, 0, 0}); - static uint32_t currentGen = 1; - - for (int b = 0; b < 256; ++b) { - StateIdx* sData = src.bucketData[b]; - uint32_t sSize = src.bucketSizes[b]; - if (sSize == 0) continue; - - const uint32_t mask = (sSize < 1000) ? 2047 : 16383; - const uint32_t shift = (sSize < 1000) ? 
53 : 50; - currentGen++; - - for (uint32_t i = 0; i < sSize; ++i) { - uint64_t k = sData[i].state; - uint32_t h = (k * 11400714819323198485llu) >> shift; - h &= mask; - while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask; - if (table[h].gen != currentGen) table[h] = {k, sData[i].index, currentGen, 1}; - else table[h].count++; - } - - StateIdx* tData = tar.bucketData[b]; - uint32_t tSize = tar.bucketSizes[b]; - for (uint32_t i = 0; i < tSize; ++i) { - uint64_t k = tData[i].state; - uint32_t h = (k * 11400714819323198485llu) >> shift; - h &= mask; - while (table[h].gen == currentGen && table[h].key != k) h = (h + 1) & mask; - - if (table[h].gen == currentGen && table[h].key == k) { - if (table[h].count == 1) { - outS.push_back(table[h].idx); - outT.push_back(tData[i].index); - table[h].count = 0xFFFFFFFF; - } else if (table[h].count == 0xFFFFFFFF) { - outS.pop_back(); outT.pop_back(); - table[h].count = 0xEEEEEEEE; - } - } - } - } -} -/* -std::pair, std::vector> Forest::matchAdaptiveNeon( - SoAFrame& src, - SoAFrame& tar) { - - std::pair, std::vector> result; - result.first.reserve(10000); - result.second.reserve(10000); - - // Slot is exactly 32 bytes. 2 Slots = 64 bytes (1 Cache Line). - struct alignas(16) Slot { - uint64_t key; - uint32_t idx; - uint32_t gen; - uint32_t count; - uint32_t padding; - }; - - static uint32_t currentGen = 1; - static std::vector table(8192, {0, 0, 0, 0, 0}); - - for (int b = 0; b < 256; ++b) { - const auto& sStates = src.states[b]; - const auto& sIdxs = src.indices[b]; - if (sStates.empty()) continue; - - const uint32_t mask = (sStates.size() < 500) ? 
1023 : 8191; - currentGen++; - - // --- PART 1: SOURCE FILL (Keep Scalar as it's usually not the bottleneck) --- - for (size_t i = 0; i < sStates.size(); ++i) { - uint64_t k = sStates[i]; - uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); - h &= mask; - - while (table[h].gen == currentGen && table[h].key != k) { - h = (h + 1) & mask; - } - - if (table[h].gen != currentGen) { - table[h] = {k, sIdxs[i], currentGen, 1, 0}; - } else { - table[h].count++; - } - } - - const auto& tStates = tar.states[b]; - const auto& tIdxs = tar.indices[b]; - - // --- PART 2: TARGET MATCH (NEON Vectorized Window) --- - uint64x2_t genVec = vdupq_n_u64((uint64_t)currentGen << 32); // Gen is at offset 12 in slot - - for (size_t i = 0; i < tStates.size(); ++i) { - uint64_t k = tStates[i]; - uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); - h &= mask; - - uint64x2_t targetKeyV = vdupq_n_u64(k); - bool found = false; - - // Check 2 slots at a time (One Cache Line) - // This loop usually terminates in the first iteration (h and h+1) - while (true) { - // Load keys from Slot H and Slot H+1 - // We use vld2 to pick the 'key' field which is the first 8 bytes of each 32-byte slot - // For simplicity and speed on M3, we'll just do direct pointer access: - uint64_t k0 = table[h].key; - uint64_t k1 = table[(h + 1) & mask].key; - uint32_t g0 = table[h].gen; - uint32_t g1 = table[(h + 1) & mask].gen; - - uint64x2_t keysV = {k0, k1}; - uint32x2_t gensV = {g0, g1}; - - // Compare keys - uint64x2_t keyMatch = vceqq_u64(keysV, targetKeyV); - // Compare generations - uint32x2_t genMatch = vceq_u32(gensV, vdup_n_u32(currentGen)); - - // Check lane 0 - if (vgetq_lane_u64(keyMatch, 0) && vget_lane_u32(genMatch, 0)) { - if (table[h].count == 1) { - result.first.push_back(table[h].idx); - result.second.push_back(tIdxs[i]); - table[h].count = 0xFFFFFFFF; - } else if (table[h].count == 0xFFFFFFFF) { - result.first.pop_back(); result.second.pop_back(); - table[h].count = 0xEEEEEEEE; - } - found = 
true; break; - } - - // Check lane 1 - uint32_t nextH = (h + 1) & mask; - if (vgetq_lane_u64(keyMatch, 1) && vget_lane_u32(genMatch, 1)) { - if (table[nextH].count == 1) { - result.first.push_back(table[nextH].idx); - result.second.push_back(tIdxs[i]); - table[nextH].count = 0xFFFFFFFF; - } else if (table[nextH].count == 0xFFFFFFFF) { - result.first.pop_back(); result.second.pop_back(); - table[nextH].count = 0xEEEEEEEE; - } - found = true; break; - } - - // If neither matches and both are "current", we must keep probing - if (g0 == currentGen && g1 == currentGen) { - h = (h + 2) & mask; - } else { - // One of them is an empty slot (gen != currentGen), stop searching - break; - } - } - } - } - return result; -} -*/ -void Forest::matchPipelinedBranchlessPreallocate( - SoAFramePersistent& src, - SoAFramePersistent& tar, - std::vector& resultSrc, - std::vector& resultTar) { - - struct Slot { - uint64_t key; - uint32_t idx; - uint32_t gen; - uint32_t count; - }; - - static uint32_t currentGen = 1; - // Increased table size slightly to 16k to further reduce Pareto collisions - static std::vector table(16384, {0, 0, 0, 0}); - - for (int b = 0; b < 256; ++b) { - uint64_t* sStates = src.bucketStates[b]; - uint32_t* sIdxs = src.bucketIndices[b]; - uint32_t sSize = src.bucketSizes[b]; - - if (sSize == 0) continue; - - // Adaptive Mask: 2k for small, 16k for large - const uint32_t mask = (sSize < 1000) ? 2047 : 16383; - const uint32_t shift = (sSize < 1000) ? (64 - 11) : (64 - 14); - currentGen++; - - // 1. Fill Table (Source) - for (size_t i = 0; i < sSize; ++i) { - uint64_t k = sStates[i]; - uint32_t h = (k * 11400714819323198485llu) >> shift; - h &= mask; - - // Branchless-ish Probe: Most IDs are unique, so this loop - // is predicted "not taken" after the first check. 
- while (table[h].gen == currentGen && table[h].key != k) { - h = (h + 1) & mask; - } - - if (table[h].gen != currentGen) { - table[h] = {k, sIdxs[i], currentGen, 1}; - } else { - table[h].count++; - } - } - - // 2. Intersect (Target) with Software Pipelining - uint64_t* tStates = tar.bucketStates[b]; - uint32_t* tIdxs = tar.bucketIndices[b]; - uint32_t tSize = tar.bucketSizes[b]; - - for (size_t i = 0; i < tSize; ++i) { - // Manual prefetch of the state 16 elements ahead to stay in L1 - if (i + 16 < tSize) { - __builtin_prefetch(&tStates[i + 16], 0, 3); - } - - uint64_t k = tStates[i]; - uint32_t h = (k * 11400714819323198485llu) >> shift; - h &= mask; - - // Probe logic - while (table[h].gen == currentGen && table[h].key != k) { - h = (h + 1) & mask; - } - - if (table[h].gen == currentGen && table[h].key == k) { - const uint32_t cnt = table[h].count; - if (cnt == 1) { - resultSrc.push_back(table[h].idx); - resultTar.push_back(tIdxs[i]); - table[h].count = 0xFFFFFFFF; - } else if (cnt == 0xFFFFFFFF) { - // Pareto multi-match removal logic - resultSrc.pop_back(); - resultTar.pop_back(); - table[h].count = 0xEEEEEEEE; - } - } - } - } -} -std::pair, std::vector> Forest::matchPipelinedBranchless( - SoAFramePersistent& src, - SoAFramePersistent& tar) { - - std::pair, std::vector> result; - // For 100M items, we might find more matches; - // adjusting reserve to prevent mid-run reallocations. 
- result.first.reserve(src.statesSlab.size() / 100); - result.second.reserve(src.statesSlab.size() / 100); - - struct Slot { - uint64_t key; - uint32_t idx; - uint32_t gen; - uint32_t count; - }; - - static uint32_t currentGen = 1; - // Increased table size slightly to 16k to further reduce Pareto collisions - static std::vector table(16384, {0, 0, 0, 0}); - - for (int b = 0; b < 256; ++b) { - uint64_t* sStates = src.bucketStates[b]; - uint32_t* sIdxs = src.bucketIndices[b]; - uint32_t sSize = src.bucketSizes[b]; - - if (sSize == 0) continue; - - // Adaptive Mask: 2k for small, 16k for large - const uint32_t mask = (sSize < 1000) ? 2047 : 16383; - const uint32_t shift = (sSize < 1000) ? (64 - 11) : (64 - 14); - currentGen++; - - // 1. Fill Table (Source) - for (size_t i = 0; i < sSize; ++i) { - uint64_t k = sStates[i]; - uint32_t h = (k * 11400714819323198485llu) >> shift; - h &= mask; - - // Branchless-ish Probe: Most IDs are unique, so this loop - // is predicted "not taken" after the first check. - while (table[h].gen == currentGen && table[h].key != k) { - h = (h + 1) & mask; - } - - if (table[h].gen != currentGen) { - table[h] = {k, sIdxs[i], currentGen, 1}; - } else { - table[h].count++; - } - } - - // 2. 
Intersect (Target) with Software Pipelining - uint64_t* tStates = tar.bucketStates[b]; - uint32_t* tIdxs = tar.bucketIndices[b]; - uint32_t tSize = tar.bucketSizes[b]; - - for (size_t i = 0; i < tSize; ++i) { - // Manual prefetch of the state 16 elements ahead to stay in L1 - if (i + 16 < tSize) { - __builtin_prefetch(&tStates[i + 16], 0, 3); - } - - uint64_t k = tStates[i]; - uint32_t h = (k * 11400714819323198485llu) >> shift; - h &= mask; - - // Probe logic - while (table[h].gen == currentGen && table[h].key != k) { - h = (h + 1) & mask; - } - - if (table[h].gen == currentGen && table[h].key == k) { - const uint32_t cnt = table[h].count; - if (cnt == 1) { - result.first.push_back(table[h].idx); - result.second.push_back(tIdxs[i]); - table[h].count = 0xFFFFFFFF; - } else if (cnt == 0xFFFFFFFF) { - // Pareto multi-match removal logic - result.first.pop_back(); - result.second.pop_back(); - table[h].count = 0xEEEEEEEE; - } - } - } - } - return result; -} -std::pair, std::vector> Forest::matchAdaptivePersistent( - SoAFramePersistent& src, - SoAFramePersistent& tar) { - - std::pair, std::vector> result; - result.first.reserve(10000); - result.second.reserve(10000); - - struct Slot { - uint64_t key; - uint32_t idx; - uint32_t gen; // Generation counter - uint32_t count; // 1=SrcUnique, 0xFFFFFFFF=Matched, etc. - }; - - static uint32_t currentGen = 1; - static std::vector table(8192, {0, 0, 0, 0}); - - for (int b = 0; b < 256; ++b) { - uint64_t* sStates = src.bucketStates[b]; - uint32_t* sIdxs = src.bucketIndices[b]; - uint32_t sSize = src.bucketSizes[b]; - - if (sSize == 0) continue; - - const uint32_t mask = (sSize < 500) ? 1023 : 8191; - currentGen++; - - // 1. 
Fill Table - for (size_t i = 0; i < sSize; ++i) { - uint64_t k = sStates[i]; - uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); - h &= mask; - - while (table[h].gen == currentGen && table[h].key != k) { - h = (h + 1) & mask; - } - - if (table[h].gen != currentGen) { - table[h] = {k, sIdxs[i], currentGen, 1}; - } else { - table[h].count++; - } - } - - // 2. Intersect - uint64_t* tStates = tar.bucketStates[b]; - uint32_t* tIdxs = tar.bucketIndices[b]; - uint32_t tSize = tar.bucketSizes[b]; - - for (size_t i = 0; i < tSize; ++i) { - uint64_t k = tStates[i]; - uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); - h &= mask; - - while (table[h].gen == currentGen && table[h].key != k) { - h = (h + 1) & mask; - } - - if (table[h].gen == currentGen && table[h].key == k) { - if (table[h].count == 1) { - result.first.push_back(table[h].idx); - result.second.push_back(tIdxs[i]); - table[h].count = 0xFFFFFFFF; - } else if (table[h].count == 0xFFFFFFFF) { - result.first.pop_back(); - result.second.pop_back(); - table[h].count = 0xEEEEEEEE; - } - } - } - } - return result; -} - -std::pair, std::vector> Forest::matchAdaptive( - SoAFrame& src, - SoAFrame& tar) { - - std::pair, std::vector> result; - result.first.reserve(10000); - result.second.reserve(10000); - - struct Slot { - uint64_t key; - uint32_t idx; - uint32_t gen; // Generation counter - uint32_t count; // 1=SrcUnique, 0xFFFFFFFF=Matched, etc. - }; - - // Global generation for this call - uint32_t currentGen = 1; - std::vector table(8192, {0, 0, 0, 0}); - - for (int b = 0; b < 256; ++b) { - const auto& sStates = src.states[b]; - const auto& sIdxs = src.indices[b]; - if (sStates.empty()) continue; - - // Adaptive Table Mask: Use smaller range for tiny buckets - const uint32_t mask = (sStates.size() < 500) ? 1023 : 8191; - currentGen++; - - // 1. 
Fill Table - for (size_t i = 0; i < sStates.size(); ++i) { - // Prefetch an element roughly 16 iterations ahead (adjust based on testing) - /* - * This didn't help anymore. - * if (i + 16 < sStates.size()) { - __builtin_prefetch(&sStates[i + 16], 0, 3); - __builtin_prefetch(&sIdxs[i + 16], 0, 3); - }*/ - uint64_t k = sStates[i]; - uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); - h &= mask; - - // Probe: Valid if gen matches AND key is different - while (table[h].gen == currentGen && table[h].key != k) { - h = (h + 1) & mask; - } - - if (table[h].gen != currentGen) { - table[h] = {k, sIdxs[i], currentGen, 1}; - } else { - table[h].count++; // Duplicate in Source - } - } - - // 2. Intersect - const auto& tStates = tar.states[b]; - const auto& tIdxs = tar.indices[b]; - for (size_t i = 0; i < tStates.size(); ++i) { - uint64_t k = tStates[i]; - uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); - h &= mask; - - while (table[h].gen == currentGen && table[h].key != k) { - h = (h + 1) & mask; - } - - if (table[h].gen == currentGen && table[h].key == k) { - if (table[h].count == 1) { - result.first.push_back(table[h].idx); - result.second.push_back(tIdxs[i]); - table[h].count = 0xFFFFFFFF; - } else if (table[h].count == 0xFFFFFFFF) { - result.first.pop_back(); - result.second.pop_back(); - table[h].count = 0xEEEEEEEE; - } - } - } - } - return result; -} -std::pair, std::vector> Forest::matchBlockedBloom( - SoAFrame& src, - SoAFrame& tar) { - - std::pair, std::vector> result; - result.first.reserve(10000); - result.second.reserve(10000); - - struct Slot { - uint64_t key; - uint32_t idx; - uint32_t count; - }; - - const uint32_t TABLE_SIZE = 8192; - const uint32_t HASH_MASK = TABLE_SIZE - 1; - std::vector table(TABLE_SIZE); - - // A 512-bit Bloom Filter fits in exactly one Cache Line (64 bytes). - // We use 8 x 64-bit integers to represent the 512 bits. 
- uint64_t bloom[8]; - - for (int b = 0; b < 256; ++b) { - std::fill(table.begin(), table.end(), Slot{0, 0, 0}); - std::memset(bloom, 0, sizeof(bloom)); - - const auto& sStates = src.states[b]; - const auto& sIdxs = src.indices[b]; - const auto& tStates = tar.states[b]; - const auto& tIdxs = tar.indices[b]; - - // 1. Fill Table + Bloom Filter - for (size_t i = 0; i < sStates.size(); ++i) { - uint64_t k = sStates[i]; - - // Set Bloom bit: use a different hash or shift for the bloom index - // We'll use bits from the key to pick one of 512 bits - uint32_t bHash = (k ^ (k >> 32)); - bloom[(bHash >> 6) & 7] |= (1ull << (bHash & 63)); - - uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); - h &= HASH_MASK; - - while (table[h].count > 0 && table[h].key != k) { - h = (h + 1) & HASH_MASK; - } - - table[h].key = k; - table[h].idx = sIdxs[i]; - table[h].count++; - } - - // 2. Intersection with Bloom Filter Gate - for (size_t i = 0; i < tStates.size(); ++i) { - uint64_t k = tStates[i]; - - // --- BLOOM FILTER GATE --- - uint32_t bHash = (k ^ (k >> 32)); - if (!(bloom[(bHash >> 6) & 7] & (1ull << (bHash & 63)))) { - continue; // 100% certainly not in Source. Skip hash probe! 
- } - // ------------------------- - - uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); - h &= HASH_MASK; - - while (table[h].count > 0 && table[h].key != k) { - h = (h + 1) & HASH_MASK; - } - - if (table[h].key == k) { - if (table[h].count == 1) { - result.first.push_back(table[h].idx); - result.second.push_back(tIdxs[i]); - table[h].count = 0xFFFFFFFF; - } else if (table[h].count == 0xFFFFFFFF) { - result.first.pop_back(); - result.second.pop_back(); - table[h].count = 0xEEEEEEEE; - } - } - } - } - return result; -} -std::pair, std::vector> Forest::matchParallelRadixPartitioning( - SoAFrame& src, - SoAFrame& tar) { - - std::pair, std::vector> result; - result.first.reserve(10000); - result.second.reserve(10000); - - const uint32_t TABLE_SIZE = 8192; - const uint32_t HASH_MASK = TABLE_SIZE - 1; - - // Aligned scratchpad to maximize L1/L2 cache efficiency - struct alignas(64) Slot { - uint64_t key; - uint32_t idx; - uint32_t count; - }; - std::vector table(TABLE_SIZE); - - for (int b = 0; b < 256; ++b) { - // 1. FAST CLEAR - // std::fill is optimized, but we only zero the 'count' to save cycles - for(auto& s : table) s.count = 0; - - const auto& sStates = src.states[b]; - const auto& sIdxs = src.indices[b]; - const size_t sSize = sStates.size(); - - // 2. 
PIPELINED FILL (Unrolled x4 for ILP) - // We process 4 items at once to hide memory latency - size_t i = 0; - for (; i + 3 < sSize; i += 4) { - for (int k = 0; k < 4; ++k) { - uint64_t key = sStates[i + k]; - uint32_t h = (key * 11400714819323198485llu) >> (64 - 13); - h &= HASH_MASK; - - while (table[h].count > 0 && table[h].key != key) h = (h + 1) & HASH_MASK; - - table[h].key = key; - table[h].idx = sIdxs[i + k]; - table[h].count++; - } - } - // Handle remainder - for (; i < sSize; ++i) { - uint64_t key = sStates[i]; - uint32_t h = (key * 11400714819323198485llu) >> (64 - 13); - h &= HASH_MASK; - while (table[h].count > 0 && table[h].key != key) h = (h + 1) & HASH_MASK; - table[h].key = key; table[h].idx = sIdxs[i]; table[h].count++; - } - - // 3. OPTIMISTIC INTERSECTION - const auto& tStates = tar.states[b]; - const auto& tIdxs = tar.indices[b]; - const size_t tSize = tStates.size(); - - for (size_t j = 0; j < tSize; ++j) { - uint64_t key = tStates[j]; - uint32_t h = (key * 11400714819323198485llu) >> (64 - 13); - h &= HASH_MASK; - - while (table[h].count > 0 && table[h].key != key) h = (h + 1) & HASH_MASK; - - if (table[h].key == key) { - if (table[h].count == 1) { - result.first.push_back(table[h].idx); - result.second.push_back(tIdxs[j]); - table[h].count = 0xFFFFFFFF; // Mark as Matched - } else if (table[h].count == 0xFFFFFFFF) { - // Pareto duplicate found in Target: Roll back - result.first.pop_back(); - result.second.pop_back(); - table[h].count = 0xEEEEEEEE; // Mark as Permanent Duplicate - } - } - } - } - - return result; -} -std::pair, std::vector> Forest::matchPreparedFramesFaster( - SoAFrame& src, - SoAFrame& tar) { - - std::pair, std::vector> result; - result.first.reserve(10000); - result.second.reserve(10000); - - // Flat, cache-aligned slot structure - struct Slot { - uint64_t key; - uint32_t idx; - uint32_t count; - }; - - // 8192 slots = 128KB. This fits perfectly in your 4MB L2. 
- // We use a power-of-two size to use bitwise AND instead of modulo %. - const uint32_t TABLE_SIZE = 8192; - const uint32_t HASH_MASK = TABLE_SIZE - 1; - std::vector table(TABLE_SIZE); - - for (int b = 0; b < 256; ++b) { - // FAST: std::fill is usually a vectorized memset. - std::fill(table.begin(), table.end(), Slot{0, 0, 0}); - - const auto& sStates = src.states[b]; - const auto& sIdxs = src.indices[b]; - const auto& tStates = tar.states[b]; - const auto& tIdxs = tar.indices[b]; - - // 1. Fill Table from Source - for (size_t i = 0; i < sStates.size(); ++i) { - uint64_t k = sStates[i]; - // Fibonacci Hashing (very fast for 64-bit keys) - uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); - h &= HASH_MASK; - - while (table[h].count > 0 && table[h].key != k) { - h = (h + 1) & HASH_MASK; - } - - table[h].key = k; - table[h].idx = sIdxs[i]; - table[h].count++; - } - - // 2. Secondary Uniqueness Check + Intersection - // We reuse the 'count' field: - // 1 = Unique in Src - // >1 = Duplicate in Src - // 0 = Already Matched (prevents Target duplicates) - for (size_t i = 0; i < tStates.size(); ++i) { - uint64_t k = tStates[i]; - uint32_t h = (k * 11400714819323198485llu) >> (64 - 13); - h &= HASH_MASK; - - while (table[h].count > 0 && table[h].key != k) { - h = (h + 1) & HASH_MASK; - } - - // We need to know if 'k' is unique in Target too. - // A quick way is to check if it appears again in the target bucket. - // For Pareto, we can use a "tombstone" logic: - if (table[h].key == k) { - if (table[h].count == 1) { - // This is the first time we see it in Target - result.first.push_back(table[h].idx); - result.second.push_back(tIdxs[i]); - table[h].count = 0xFFFFFFFF; // Mark as "Matched once" - } else if (table[h].count == 0xFFFFFFFF) { - // Oh no, this is a Target duplicate! - // We must remove the last added match. 
- result.first.pop_back(); - result.second.pop_back(); - table[h].count = 0xEEEEEEEE; // Mark as "Permanent Duplicate" - } - } - } - } - return result; -} -std::pair, std::vector> Forest::matchPreparedFrames( SoAFrame& src, SoAFrame& tar) { - - // Initialize the pair of vectors - std::pair, std::vector> result; - - // Heuristic: start with a reasonable reserve (e.g., 5% of average bucket size * 256) - size_t initialReserve = (src.states[0].size() + tar.states[0].size()) * 6; - result.first.reserve(initialReserve); - result.second.reserve(initialReserve); - - // Local structures for bucket-level uniqueness - struct SrcInfo { uint32_t idx; bool isDup; }; - std::unordered_map bucketSrc; - std::unordered_map bucketTar; - - for (int b = 0; b < 256; ++b) { - bucketSrc.clear(); - bucketTar.clear(); - - const auto& sStates = src.states[b]; - const auto& sIdxs = src.indices[b]; - const auto& tStates = tar.states[b]; - const auto& tIdxs = tar.indices[b]; - - // 1. Process Source: Mark unique vs duplicates - for (size_t i = 0; i < sStates.size(); ++i) { - auto [it, inserted] = bucketSrc.try_emplace(sStates[i], SrcInfo{sIdxs[i], false}); - if (!inserted) it->second.isDup = true; - } - - // 2. Process Target: Mark unique vs duplicates - for (size_t i = 0; i < tStates.size(); ++i) { - auto [it, inserted] = bucketTar.try_emplace(tStates[i], false); - if (!inserted) it->second = true; // Mark as duplicate - } - - // 3. Intersect unique-only IDs - for (size_t i = 0; i < tStates.size(); ++i) { - uint64_t id = tStates[i]; - - // Check if unique in Target - if (bucketTar[id] == false) { - auto it = bucketSrc.find(id); - // Check if exists in Source AND is unique there - if (it != bucketSrc.end() && it->second.isDup == false) { - result.first.push_back(it->second.idx); - result.second.push_back(tIdxs[i]); - } - } - } - } - - return result; -} - /** * @brief Computes sparse matches on a pair of rectified and smoothed * images. 
Here the src and tar images refer to the left and right images, @@ -1194,7 +101,7 @@ std::vector Forest::depthPriorFast( for (auto& q : statesTar) q.srcDescr = false; ndb::Hashmatch hm( - 214673, // statesSrc.size() + statesTar.size() , + 214673, statesSrc.size() + statesTar.size()); std::vector> corr; for (auto& q : statesSrc) hm.insert(q); @@ -1225,7 +132,6 @@ std::vector Forest::findCorrespondences( ++i, unique = false; if (unique) { - // emulates std::lowerbound behavior for arrays for (; j < tarStates.size() - 1; ++j) { if (!(tarStates[j] < srcStates[i])) break; } @@ -1239,299 +145,7 @@ std::vector Forest::findCorrespondences( } return corr; } -std::vector Forest::findCorrespondencesHashNaive( - std::vector& srcStates, - std::vector& tarStates) { - - std::vector corr; - struct DescriptorHasher { - std::size_t operator()(const ndb::Descriptor& d) const { - // Just return the state since it's already a unique-ish 64-bit int - return static_cast(d.state); - } - }; - // 1. Count frequencies in Source - std::unordered_map srcCounts; - std::unordered_map tarCounts; - for (const auto& d : srcStates) { - srcCounts[d]++; - } - - // 2. Count frequencies in Target - for (const auto& d : tarStates) { - tarCounts[d]++; - } - - // 3. Match only if the descriptor is unique in both (count == 1) - // We iterate through srcStates to maintain a similar "order" or - // simply to find potential matches. - for (const auto& srcDesc : srcStates) { - // Is it unique in Source? - if (srcCounts[srcDesc] == 1) { - // Does it exist and is it unique in Target? - if (tarCounts.count(srcDesc) && tarCounts[srcDesc] == 1) { - - // We need the actual target object to get the 'point' - // In a naive way, we just go find it. 
- for (const auto& tarDesc : tarStates) { - if (tarDesc == srcDesc) { - corr.push_back(ndb::Correspondence(srcDesc.point, tarDesc.point)); - break; - } - } - } - } - } - - return corr; -} - -// State machine for our IDs -enum class State : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 }; - -#include -#include -#include - -std::vector Forest::findCorrespondencesHash( - std::vector& srcStates, - std::vector& tarStates) { - - // Tracking states: 0 = Unseen, 1 = SeenOnce, 2 = Duplicate - enum class Occurence : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 }; - - // 1. Map Source IDs: State -> {OccurenceLevel, OriginalIndex} - // Pre-allocating prevents expensive rehashes during the loop - std::unordered_map> srcMap; - srcMap.reserve(srcStates.size()); - - for (uint32_t i = 0; i < srcStates.size(); ++i) { - auto& entry = srcMap[srcStates[i].state]; - if (entry.first == Occurence::Unseen) { - entry = {Occurence::SeenOnce, i}; - } else { - entry.first = Occurence::Duplicate; - } - } - - // 2. Map Target IDs: State -> OccurenceLevel - std::unordered_map tarMap; - tarMap.reserve(tarStates.size()); - - for (uint32_t j = 0; j < tarStates.size(); ++j) { - auto& occ = tarMap[tarStates[j].state]; - if (occ == Occurence::Unseen) { - occ = Occurence::SeenOnce; - } else { - occ = Occurence::Duplicate; - } - } - - // 3. 
Intersect unique pairs - std::vector corr; - // Heuristic: Reserve 20% of the smaller set size for the results - corr.reserve(std::min(srcStates.size(), tarStates.size()) / 5); - - for (uint32_t j = 0; j < tarStates.size(); ++j) { - uint64_t currentID = tarStates[j].state; - - // Condition: Must be unique in Target AND unique in Source - if (tarMap[currentID] == Occurence::SeenOnce) { - auto it = srcMap.find(currentID); - if (it != srcMap.end() && it->second.first == Occurence::SeenOnce) { - // Correspondence(Point from Source, Point from Target) - corr.push_back(ndb::Correspondence( - srcStates[it->second.second].point, - tarStates[j].point - )); - } - } - } - - return corr; -} -#include -#include -#include - -// A lightweight structure to avoid moving heavy Descriptor objects -struct KeyIndex { - uint64_t state; - uint32_t index; -}; -#include -#include -#include - -std::vector Forest::findCorrespondencesTurbo( - std::vector& srcStates, - std::vector& tarStates) { - - const int BUCKETS = 256; - const uint64_t MASK = 0xFF; - - // --- STEP 1: Linear Partitioning (Radix Pass) --- - // We use a single flat buffer to avoid 256 separate vector allocations - std::vector srcBuffer(srcStates.size()); - std::vector tarBuffer(tarStates.size()); - std::array srcCounts = {0}, tarCounts = {0}; - std::array srcOffsets, tarOffsets; - - for (const auto& s : srcStates) srcCounts[s.state & MASK]++; - for (const auto& t : tarStates) tarCounts[t.state & MASK]++; - - srcOffsets[0] = tarOffsets[0] = 0; - for (int i = 1; i < BUCKETS; ++i) { - srcOffsets[i] = srcOffsets[i - 1] + srcCounts[i - 1]; - tarOffsets[i] = tarOffsets[i - 1] + tarCounts[i - 1]; - } - auto srcCursors = srcOffsets; - auto tarCursors = tarOffsets; - - for (uint32_t i = 0; i < srcStates.size(); ++i) { - srcBuffer[srcCursors[srcStates[i].state & MASK]++] = {srcStates[i].state, i}; - } - for (uint32_t i = 0; i < tarStates.size(); ++i) { - tarBuffer[tarCursors[tarStates[i].state & MASK]++] = {tarStates[i].state, i}; - } 
- - // --- STEP 2: In-Cache Hashing --- - std::vector corr; - corr.reserve(std::min(srcStates.size(), tarStates.size()) / 8); - - // Using a tiny fixed-size hash table for each bucket to stay in L1/L2 cache - // State: 0 = Unseen, 1 = SeenOnce, 2 = Duplicate - struct LocalVal { uint32_t index; uint8_t count; }; - - // We reuse this map across buckets to avoid reallocating - // A simple open-addressed hash map for the bucket - std::unordered_map bucketMap; - bucketMap.reserve(srcStates.size() / BUCKETS * 2); - - for (int b = 0; b < BUCKETS; ++b) { - bucketMap.clear(); - - // Load Source bucket into local cache-friendly map - size_t srcStart = srcOffsets[b]; - size_t srcEnd = srcStart + srcCounts[b]; - for (size_t i = srcStart; i < srcEnd; ++i) { - auto& entry = bucketMap[srcBuffer[i].state]; - entry.index = srcBuffer[i].index; - entry.count = (entry.count == 0) ? 1 : 2; - } - - // Intersect with Target bucket - size_t tarStart = tarOffsets[b]; - size_t tarEnd = tarStart + tarCounts[b]; - - // Secondary map to ensure target-side uniqueness - std::unordered_map tarUniqueness; - for (size_t i = tarStart; i < tarEnd; ++i) { - auto& count = tarUniqueness[tarBuffer[i].state]; - count = (count == 0) ? 1 : 2; - } - - for (size_t i = tarStart; i < tarEnd; ++i) { - uint64_t id = tarBuffer[i].state; - if (tarUniqueness[id] == 1) { - auto it = bucketMap.find(id); - if (it != bucketMap.end() && it->second.count == 1) { - corr.push_back(ndb::Correspondence( - srcStates[it->second.index].point, - tarStates[tarBuffer[i].index].point - )); - } - } - } - } - - return corr; -} -std::vector Forest::findCorrespondencesHashingRadix( - std::vector& srcStates, - std::vector& tarStates) { - - const int NUM_BUCKETS = 256; - const uint64_t MASK = 0xFF; - - // 1. 
Partition Source into Buckets - std::vector srcBuckets[NUM_BUCKETS]; - for (int i = 0; i < NUM_BUCKETS; ++i) srcBuckets[i].reserve(srcStates.size() / NUM_BUCKETS * 1.2); - - for (uint32_t i = 0; i < srcStates.size(); ++i) { - srcBuckets[srcStates[i].state & MASK].push_back({srcStates[i].state, i}); - } - - // 2. Partition Target into Buckets - std::vector tarBuckets[NUM_BUCKETS]; - for (int i = 0; i < NUM_BUCKETS; ++i) tarBuckets[i].reserve(tarStates.size() / NUM_BUCKETS * 1.2); - - for (uint32_t i = 0; i < tarStates.size(); ++i) { - tarBuckets[tarStates[i].state & MASK].push_back(tarStates[i].state); - } - - std::vector corr; - corr.reserve(std::min(srcStates.size(), tarStates.size()) / 5); - - // 3. Process each bucket pair - // This part can be easily parallelized with #pragma omp parallel for - for (int b = 0; b < NUM_BUCKETS; ++b) { - if (srcBuckets[b].empty() || tarBuckets[b].empty()) continue; - - // Small local maps fit in L1/L2 Cache - // Using a simple frequency map for the local bucket - enum class Occ : uint8_t { Unseen = 0, SeenOnce = 1, Duplicate = 2 }; - - struct LocalEntry { - Occ occ = Occ::Unseen; - uint32_t idx = 0; - }; - - // We use a flat hash map here. For simplicity in standard C++, - // std::unordered_map is used, but even it is faster here - // because it stays in cache. - std::unordered_map localSrc; - localSrc.reserve(srcBuckets[b].size()); - - for (auto& ki : srcBuckets[b]) { - auto& entry = localSrc[ki.state]; - if (entry.occ == Occ::Unseen) { - entry = {Occ::SeenOnce, ki.index}; - } else { - entry.occ = Occ::Duplicate; - } - } - - std::unordered_map localTar; - localTar.reserve(tarBuckets[b].size()); - for (uint64_t state : tarBuckets[b]) { - auto& occ = localTar[state]; - occ = (occ == Occ::Unseen) ? Occ::SeenOnce : Occ::Duplicate; - } - - // Intersect within the bucket - // Since we are inside a bucket, we iterate the target indices - // but we need to find the target point. 
- // To be fast, we'll re-scan the original tarStates for this bucket's IDs - for (uint32_t j = 0; j < tarStates.size(); ++j) { - uint64_t s = tarStates[j].state; - if ((s & MASK) == b) { // Only process IDs belonging to this bucket - if (localTar[s] == Occ::SeenOnce) { - auto it = localSrc.find(s); - if (it != localSrc.end() && it->second.occ == Occ::SeenOnce) { - corr.push_back(ndb::Correspondence( - srcStates[it->second.idx].point, - tarStates[j].point - )); - } - } - } - } - } - - return corr; -} /** * @brief Evaluates a given forest mask on an image and returns the * descriptors diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp index 9e45600..6463aec 100644 --- a/lib/gpc/forest.hpp +++ b/lib/gpc/forest.hpp @@ -178,48 +178,6 @@ struct MatchStats { double prec, rec, timeProp, timeMatch; int numInlier, numStates, numMatches; }; -struct SoAFrame { - // 256 Buckets to ensure each chunk fits in L2/L3 cache - std::vector states[256]; - std::vector indices[256]; - - void reserve(size_t total_size) { - for(int i=0; i<256; ++i) { - states[i].reserve(total_size / size_t(256 * 1.2)); - indices[i].reserve(total_size / size_t(256 * 1.2)); - } - } -}; -struct SoAFramePersistent { - // Persistent memory blocks - std::vector statesSlab; - std::vector indicesSlab; - - // Pointers into the slab for each bucket - uint64_t* bucketStates[256]; - uint32_t* bucketIndices[256]; - uint32_t bucketSizes[256]; - - void preallocate(size_t total_size) { - statesSlab.assign(total_size, 0); - indicesSlab.assign(total_size, 0); - } -}; -struct StateIdx { - uint64_t state; - uint32_t index; -}; - -struct SoAFramePersistentSingleSlab { - std::vector slab; - StateIdx* bucketData[256]; - uint32_t bucketSizes[256]; - - void preallocate(size_t total_size) { - slab.assign(total_size, {0, 0}); - } -}; - class Forest { public: /** @@ -241,83 +199,6 @@ class Forest { static std::vector findCorrespondences( std::vector& srcStates, std::vector& tarStates); - static std::vector 
findCorrespondencesHashNaive( - std::vector& srcStates, - std::vector& tarStates); - static std::vector findCorrespondencesHash( - std::vector& srcStates, - std::vector& tarStates); - - static std::vector findCorrespondencesHashingRadix( - std::vector& srcStates, - std::vector& tarStates); - - static std::vector findCorrespondencesTurbo( - std::vector& srcStates, - std::vector& tarStates); - - - static std::pair prepareSoAFrames( - std::vector& srcStates, - std::vector& tarStates); - - static void prepareSoAFramesPersistent( - std::vector& srcStates, - std::vector& tarStates, - SoAFramePersistent& srcFrame, - SoAFramePersistent& tarFrame); -static void prepareSoAFramesPersistentSingleSlab( - std::vector& srcStates, - std::vector& tarStates, - SoAFramePersistentSingleSlab& srcFrame, - SoAFramePersistentSingleSlab& tarFrame); - - -static std::pair, std::vector> matchPreparedFrames( SoAFrame& src, SoAFrame& tar); -static std::pair, std::vector> matchPreparedFramesFaster( SoAFrame& src, SoAFrame& tar); - -static std::pair, std::vector> matchParallelRadixPartitioning( - SoAFrame& src, - SoAFrame& tar) ; -static std::pair, std::vector> matchBlockedBloom( - SoAFrame& src, - SoAFrame& tar) ; -static std::pair, std::vector> matchAdaptive( - SoAFrame& src, - SoAFrame& tar); -static std::pair, std::vector> matchAdaptivePersistent( - SoAFramePersistent& src, - SoAFramePersistent& tar); -static std::pair, std::vector> matchPipelinedBranchless( - SoAFramePersistent& src, - SoAFramePersistent& tar); -static void matchPipelinedBranchlessPreallocate( - SoAFramePersistent& src, - SoAFramePersistent& tar, - std::vector& resultSrc, - std::vector& resultTar); - -/* -static std::pair, std::vector> matchAdaptiveNeon( - SoAFrame& src, - SoAFrame& tar); -*/ -static void matchPipelinedBranchlessPreallocateSingleSlab( - SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar, - std::vector& outS, std::vector& outT); - - - -static void 
prepareSoAFramesPersistentSingleSlabUnordered( - std::vector& srcStates, - std::vector& tarStates, - SoAFramePersistentSingleSlab& srcFrame, - SoAFramePersistentSingleSlab& tarFrame); -static void matchPipelinedBranchlessPreallocateSingleSlabUnordered( - SoAFramePersistentSingleSlab& src, SoAFramePersistentSingleSlab& tar, - std::vector& outS, std::vector& outT); - - /** * @brief Evaluates a given forest mask on an image and returns the * descriptors diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp index 605daa2..714fbb6 100644 --- a/lib/gpc/kernels/box.cpp +++ b/lib/gpc/kernels/box.cpp @@ -166,7 +166,6 @@ void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) { void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { assert(width % 16 == 0 && "width must be multiple of 16!"); #if defined(__ARM_NEON) || defined(__aarch64__) - // Force use of our new Highway kernel on Mac testing::box_hwy(in, blurred, width, height); #else #if HWY_TARGET == HWY_AVX2 diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp index 2817622..531b3ed 100644 --- a/lib/gpc/kernels/sobel.cpp +++ b/lib/gpc/kernels/sobel.cpp @@ -161,7 +161,6 @@ void sobel(uint8_t* in, int numThreads) { assert(width % 16 == 0 && "width must be multiple of 16!"); #if defined(__ARM_NEON) || defined(__aarch64__) - // Force use of our new Highway kernel on Mac sobelNaive(in, blurred, width, height, threshold); //testing::sobel_hwy(in, blurred, width, height, threshold); // not exact! 
#else diff --git a/samples/sparsematch.cpp b/samples/sparsematch.cpp index 3b94d9f..57864e8 100644 --- a/samples/sparsematch.cpp +++ b/samples/sparsematch.cpp @@ -3,7 +3,30 @@ #include "gpc/forest.hpp" using namespace std; - +std::vector gpcFilterDense(uint8_t* in, + const std::vector& fastmask, + int width, + int height) { + uint32_t tmp; + uint32_t usableW = width - 26; + uint32_t usableH = height - 26; + std::vector out(usableW * usableH); + int j = 0; + for (int y=13;y *(in + idx + fastmask[i + 1])) + tmp++; // set this test's result to 1 + } + out[j] = ndb::Descriptor(ndb::Point(x, y), tmp); + j++; + } + } + return out; +} int main(int argc, char** argv) { std::string forestPath = "../forests/defaultZeroForest.txt"; std::string leftImgPath = "../data/middlebury/im0.png"; @@ -67,11 +90,17 @@ int main(int argc, char** argv) { std::cout << "Number of matches: " << supp.size() << std::endl; std::cout << "Preprocessing time: " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl; std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms" << std::endl; + /* std::vector statesSrc = forest.evalFastMaskOnSubsetSSE( simgP.smooth, simgP.grad, simgP.mask, fm, inferencesettings); std::vector statesTar = forest.evalFastMaskOnSubsetSSE( timgP.smooth, timgP.grad, timgP.mask, fm, inferencesettings); - ndb::Descriptor::serialize("statesSrc.txt", statesSrc); - ndb::Descriptor::serialize("statesTar.txt", statesTar); + */ + + std::vector statesSrc = gpcFilterDense(simgP.smooth.data(), fm.mask, simgP.smooth.cols(), simgP.smooth.rows()); + std::vector statesTar = gpcFilterDense(timgP.smooth.data(), fm.mask, timgP.smooth.cols(), timgP.smooth.rows()); + + ndb::Descriptor::serialize("statesSrcLargeS.txt", statesSrc); + ndb::Descriptor::serialize("statesTarLargeS.txt", statesTar); } diff --git a/tests/test_single_matching.cpp b/tests/test_single_matching.cpp index 2893b01..f7724b9 100644 --- a/tests/test_single_matching.cpp +++ b/tests/test_single_matching.cpp @@ 
-61,6 +61,7 @@ std::vector getTarDescriptors() { } +/* TEST(A,B) { std::vector srcOriginal = getSrcDescriptors(); std::vector tarOriginal = getTarDescriptors(); @@ -69,8 +70,6 @@ TEST(A,B) { std::vector srcAlt = srcOriginal; std::vector tarAlt = tarOriginal; - // Baseline - // To write a test for this we'd actually need to get the ids of the sources back, not just the final matches. std::vector matches = gpc::inference::Forest::findCorrespondences(srcBaseline, tarBaseline); @@ -91,3 +90,4 @@ TEST(A,B) { EXPECT_EQ(matches.size(), resultSrc.size()); EXPECT_EQ(matches.size(), resultTar.size()); } +*/ From e8ed2f3c3797deed9789ef33289882d44c1cf0b3 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Tue, 7 Apr 2026 07:45:34 +0200 Subject: [PATCH 32/36] add target --- samples/target.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 samples/target.cpp diff --git a/samples/target.cpp b/samples/target.cpp new file mode 100644 index 0000000..4f9a1bc --- /dev/null +++ b/samples/target.cpp @@ -0,0 +1,17 @@ +#include +#include +int main() { + // This is evaluated at compile-time + std::cout << "Compiled for: " << hwy::TargetName(HWY_TARGET) << std::endl; + + // If you need logic based on the arch: +#if HWY_TARGET == HWY_AVX2 + std::cout << "Logic: Using 256-bit AVX2 paths." << std::endl; +#elif HWY_TARGET == HWY_NEON + std::cout << "Logic: Using 128-bit NEON paths." << std::endl; +#elif HWY_TARGET == HWY_SSE4 + std::cout << "Logic: Using 128-bit SSE4 paths." << std::endl; +#else + std::cout << "Logic: Using Scalar fallback." 
<< std::endl; +#endif +} From ee46a5b9c5fe1841c8fb12747b93756470a85934 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Tue, 7 Apr 2026 07:49:09 +0200 Subject: [PATCH 33/36] no div --- lib/gpc/kernels/sobel_hwy.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp index d1b331f..fe93031 100644 --- a/lib/gpc/kernels/sobel_hwy.cpp +++ b/lib/gpc/kernels/sobel_hwy.cpp @@ -182,7 +182,7 @@ namespace testing { //#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) { //ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold); - HWY_STATIC_DISPATCH(SobelKernel)(in, blurred, width, height, threshold); + HWY_STATIC_DISPATCH(SobelKernelNoDiv)(in, blurred, width, height, threshold); } //#endif } From cbddf6b986183b71cfcbd636d35beea8fbf4ca82 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Tue, 7 Apr 2026 08:46:39 +0200 Subject: [PATCH 34/36] approximate division by 9 with fixed point multiplication for comparison with Hwy implementation. 
--- CMakeLists.txt | 1 + lib/gpc/kernels/sobel.cpp | 7 +++++-- samples/target.cpp | 11 ++++------- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 57b44ae..72f760c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,6 +6,7 @@ project(openGPC CXX) set (REQ_CPP11_FEATURES cxx_strong_enums cxx_auto_type) if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE) + add_compile_options(-O3 -funroll-loops) endif() set(CMAKE_CXX_STANDARD 17) diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp index 531b3ed..7867cd3 100644 --- a/lib/gpc/kernels/sobel.cpp +++ b/lib/gpc/kernels/sobel.cpp @@ -59,9 +59,12 @@ void sobelNaive( // boundary) (unoptimized) for (int iy = 1; iy < height - 1; iy++) { for (int ix = 0; ix < width; ix++) { - int sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9; - int sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9; + // Approximate division by 9 with fixed-point multiplication (2^16/9 = 7282) + int16_t sum_x = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33); + int16_t sum_y = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33); + int sx = (static_cast(sum_x) * 7282) >> 16; + int sy = (static_cast(sum_y) * 7282) >> 16; int val = sx * sx + sy * sy; *optr = val > thresholdSq ? 255 : 0; diff --git a/samples/target.cpp b/samples/target.cpp index 4f9a1bc..6c03ab7 100644 --- a/samples/target.cpp +++ b/samples/target.cpp @@ -1,17 +1,14 @@ #include #include int main() { - // This is evaluated at compile-time std::cout << "Compiled for: " << hwy::TargetName(HWY_TARGET) << std::endl; - - // If you need logic based on the arch: #if HWY_TARGET == HWY_AVX2 - std::cout << "Logic: Using 256-bit AVX2 paths." << std::endl; + std::cout << "Using 256-bit AVX2 paths." << std::endl; #elif HWY_TARGET == HWY_NEON - std::cout << "Logic: Using 128-bit NEON paths." << std::endl; + std::cout << "Using 128-bit NEON paths." 
<< std::endl; #elif HWY_TARGET == HWY_SSE4 - std::cout << "Logic: Using 128-bit SSE4 paths." << std::endl; + std::cout << "Using 128-bit SSE4 paths." << std::endl; #else - std::cout << "Logic: Using Scalar fallback." << std::endl; + std::cout << "Using Scalar fallback." << std::endl; #endif } From bd27dfaaf9c5b1cd402c9af0287828072d129e6d Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Tue, 7 Apr 2026 08:57:22 +0200 Subject: [PATCH 35/36] formatting --- format_code.sh | 2 +- lib/gpc/Feature.hpp | 11 +- lib/gpc/Fern.hpp | 17 +-- lib/gpc/buffer.hpp | 41 +++---- lib/gpc/feature.cpp | 49 ++++---- lib/gpc/fern.cpp | 36 +++--- lib/gpc/forest.cpp | 92 +++++++------- lib/gpc/forest.hpp | 8 +- lib/gpc/inference.hpp | 4 +- lib/gpc/kernels/box.cpp | 69 +++++++---- lib/gpc/kernels/box.hpp | 28 ++--- lib/gpc/kernels/box_hwy.cpp | 108 ++++++++++++----- lib/gpc/kernels/box_hwy.hpp | 12 +- lib/gpc/kernels/census.cpp | 5 +- lib/gpc/kernels/census.hpp | 3 +- lib/gpc/kernels/gpc.cpp | 66 +++++----- lib/gpc/kernels/gpc.hpp | 21 ++-- lib/gpc/kernels/gpc_hwy.cpp | 113 ++++++++++-------- lib/gpc/kernels/gpc_hwy.hpp | 12 +- lib/gpc/kernels/sobel.cpp | 86 ++++++++----- lib/gpc/kernels/sobel.hpp | 13 +- lib/gpc/kernels/sobel_hwy.cpp | 219 +++++++++++++++++++++++----------- lib/gpc/kernels/sobel_hwy.hpp | 13 +- lib/gpc/kernels/utils.cpp | 14 +-- lib/gpc/kernels/utils.hpp | 8 +- samples/sparsematch.cpp | 35 +++--- samples/target.cpp | 1 + 27 files changed, 622 insertions(+), 464 deletions(-) diff --git a/format_code.sh b/format_code.sh index 0eaf149..16d9cb5 100755 --- a/format_code.sh +++ b/format_code.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash set -euo pipefail -EXPECTED_VERSION="21.1.5" +EXPECTED_VERSION="21.1.8" root_folder=$(git rev-parse --show-toplevel) change_in_place=false diff --git a/lib/gpc/Feature.hpp b/lib/gpc/Feature.hpp index 8a8b55c..c2064b4 100644 --- a/lib/gpc/Feature.hpp +++ b/lib/gpc/Feature.hpp @@ -96,10 +96,10 @@ class Feature { * @param[in] trip The triplet */ void 
getDecisions(bool& ref, - bool& pos, - bool& neg, - params& params, - const GPCPatchTriplet& trip); + bool& pos, + bool& neg, + params& params, + const GPCPatchTriplet& trip); Feature(); /** @@ -138,8 +138,7 @@ class Feature { * @param path The path where we'd like to store the training data * in binary form. */ - void storeAllTriplets(std::vector& data, - std::string path); + void storeAllTriplets(std::vector& data, std::string path); /** * @brief Read triplets of training data from a binary file * written by the storeAllTriplets method. diff --git a/lib/gpc/Fern.hpp b/lib/gpc/Fern.hpp index d9554fd..68c6a1f 100644 --- a/lib/gpc/Fern.hpp +++ b/lib/gpc/Fern.hpp @@ -181,7 +181,7 @@ OptimizerSettings TauOptimizer(int taulo, */ OptimizerSettings ZeroOptimizer(int numResamples, bool onlyScoreNonSplitSamples, - double w1) ; + double w1); struct FernSettings { const int maxDepth; const int scale; @@ -227,7 +227,7 @@ class Fern { OptimizerSettings optsetting, int scoreUntilLevel, splitStats& s); - /** + /** * @brief Mark those samples in the set as "split" if they have been * correctly classified(ref=pos and pos!=neg) with the parameter * set in params @@ -238,7 +238,7 @@ class Fern { */ void markSplitSamples(std::vector& data, std::vector& params, - int numParams) ; + int numParams); /** * @brief Reset the mark on the training samples on whether they have been * split correctly or not Since we do not operate on copies of the training @@ -247,7 +247,7 @@ class Fern { * @param data */ void resetMarkOnSamples(std::vector& data); - + /** * @brief Train a fern given a set of training data and some optimizer * settings @@ -256,8 +256,8 @@ class Fern { * @param optsetting the optimizer settings */ void train(std::vector& trainingSamples, - OptimizerSettings optsetting) ; - + OptimizerSettings optsetting); + /** * @brief Returns the decision of the first five levels of the ferns * @@ -284,7 +284,10 @@ class Fern { * * @return */ -inline std::vector FernFactory(int num_S, int 
num_M, int num_L, int maxDepth) { +inline std::vector FernFactory(int num_S, + int num_M, + int num_L, + int maxDepth) { std::vector ferns; for (int i = 0; i < num_S; i++) ferns.push_back(Fern(FernSettings(maxDepth, 2))); diff --git a/lib/gpc/buffer.hpp b/lib/gpc/buffer.hpp index 903a507..0b58119 100644 --- a/lib/gpc/buffer.hpp +++ b/lib/gpc/buffer.hpp @@ -33,12 +33,12 @@ #include #include -#include -#include -#include #include -#include +#include #include +#include +#include +#include using namespace std; @@ -84,17 +84,18 @@ struct Descriptor { bool operator<(const Descriptor& d) const { return state < d.state; } bool operator<=(const Descriptor& d) const { return state <= d.state; } int operator%(const int& d) const { return state % d; } - static void serialize(const std::string& filename, const std::vector& data) { + static void serialize(const std::string& filename, + const std::vector& data) { std::ofstream outFile(filename); if (!outFile.is_open()) { - std::cerr << "Error opening file for writing: " << filename << std::endl; + std::cerr << "Error opening file for writing: " << filename + << std::endl; return; } for (const auto& desc : data) { - outFile << desc.point.x << "," - << desc.point.y << "," - << desc.state << "\n"; + outFile << desc.point.x << "," << desc.point.y << "," << desc.state + << "\n"; } outFile.close(); } @@ -102,11 +103,13 @@ struct Descriptor { /** * Deserializes a CSV file back into a vector of Descriptors. 
*/ - static std::vector deserialize(const std::string& filename, bool srcDescr) { + static std::vector deserialize(const std::string& filename, + bool srcDescr) { std::vector result; std::ifstream inFile(filename); if (!inFile.is_open()) { - std::cerr << "Error opening file for reading: " << filename << std::endl; + std::cerr << "Error opening file for reading: " << filename + << std::endl; return result; } @@ -118,18 +121,16 @@ struct Descriptor { std::string x_str, y_str, state_str; // Split by comma - if (std::getline(ss, x_str, ',') && - std::getline(ss, y_str, ',') && + if (std::getline(ss, x_str, ',') && std::getline(ss, y_str, ',') && std::getline(ss, state_str, ',')) { - Descriptor d; d.point.x = std::stod(x_str); d.point.y = std::stod(y_str); d.state = std::stoull(state_str); - d.srcDescr = srcDescr; - - //if (d.point.y > 200 && d.point.y < 400) - result.push_back(d); + d.srcDescr = srcDescr; + + // if (d.point.y > 200 && d.point.y < 400) + result.push_back(d); } } inFile.close(); @@ -1024,8 +1025,8 @@ inline Buffer getDisparityVisualization( } return dispVis; } -inline Buffer getDisparityVisualization(ndb::Buffer& srcImg, - std::vector& support) { +inline Buffer getDisparityVisualization( + ndb::Buffer& srcImg, std::vector& support) { float min_disparity = 0; float max_disparity = 128; Buffer dispVis(Eigen::Vector2i(srcImg.width, srcImg.rows())); diff --git a/lib/gpc/feature.cpp b/lib/gpc/feature.cpp index 529970c..9e1322f 100644 --- a/lib/gpc/feature.cpp +++ b/lib/gpc/feature.cpp @@ -36,9 +36,9 @@ #include #include //for log2 #include +#include #include #include -#include #include #include #include @@ -51,19 +51,16 @@ using namespace std; namespace gpc { namespace training { void Feature::getDecisions(bool& ref, - bool& pos, - bool& neg, - params& params, - const GPCPatchTriplet& trip) { - ref = - ((int)trip.ref.feature(params.i) - (int)trip.ref.feature(params.j) < - params.tau); - pos = - ((int)trip.pos.feature(params.i) - 
(int)trip.pos.feature(params.j) < - params.tau); - neg = - ((int)trip.neg.feature(params.i) - (int)trip.neg.feature(params.j) < - params.tau); + bool& pos, + bool& neg, + params& params, + const GPCPatchTriplet& trip) { + ref = ((int)trip.ref.feature(params.i) - (int)trip.ref.feature(params.j) < + params.tau); + pos = ((int)trip.pos.feature(params.i) - (int)trip.pos.feature(params.j) < + params.tau); + neg = ((int)trip.neg.feature(params.i) - (int)trip.neg.feature(params.j) < + params.tau); } Feature::Feature() { @@ -77,7 +74,7 @@ Feature::Feature() { } void Feature::sampleHyperplane(int scale, params& params) { if (scale == 2) { - params.i = params.j; // s.t. they regenerate each iteration + params.i = params.j; // s.t. they regenerate each iteration while (params.i == params.j) { // i and j need to be distinct int i = randIJ7(rng); int j = randIJ7(rng); @@ -90,7 +87,7 @@ void Feature::sampleHyperplane(int scale, params& params) { params.j = 280 + (params.jx + 3) + 27 * (params.jy + 3); } } else if (scale == 1) { - params.i = params.j; // s.t. they regenerate each iteration + params.i = params.j; // s.t. they regenerate each iteration while (params.i == params.j) { // i and j need to be distinct int i = randIJ17(rng); int j = randIJ17(rng); @@ -103,7 +100,7 @@ void Feature::sampleHyperplane(int scale, params& params) { params.j = 140 + (params.jx + 8) + 27 * (params.jy + 8); } } else if (scale == 0) { - params.i = params.j; // s.t. they regenerate each iteration + params.i = params.j; // s.t. 
they regenerate each iteration while (params.i == params.j) { // i and j need to be distinct params.i = randIJ27(rng); params.j = randIJ27(rng); @@ -119,11 +116,11 @@ void Feature::sampleHyperplane(int scale, params& params) { params.tau = randTAU(rng); } void Feature::extractAllTriplets(ndb::Buffer& bwL, - ndb::Buffer& bwR, - std::vector& ref, - std::vector& pos, - std::vector& neg, - std::vector& triplets) { + ndb::Buffer& bwR, + std::vector& ref, + std::vector& pos, + std::vector& neg, + std::vector& triplets) { ndb::Buffer LL(bwL.rows(), bwL.cols()); LL.width = bwL.width; ndb::box(bwL.data(), LL.data(), bwL.cols(), bwL.rows(), 1); @@ -174,7 +171,7 @@ void Feature::extractAllTriplets(ndb::Buffer& bwL, } void Feature::storeAllTriplets(std::vector& data, - std::string path) { + std::string path) { ofstream fout; fout.open(path, ios::binary | ios::out); for (auto& triplet : data) { @@ -184,13 +181,13 @@ void Feature::storeAllTriplets(std::vector& data, } fout.close(); } -std::vector Feature::loadAllTriplets(std::string path) { +std::vector Feature::loadAllTriplets( + std::string path) { std::vector data; std::ifstream in(path, std::ifstream::ate | std::ifstream::binary); uint32_t filesize = in.tellg(); if (filesize % ((27 * 27) * 3)) { - cout << "ERR: File is not a training set of this feature type" - << endl; + cout << "ERR: File is not a training set of this feature type" << endl; cout << "FS: " << filesize << endl; return data; } diff --git a/lib/gpc/fern.cpp b/lib/gpc/fern.cpp index a171218..5a2632c 100644 --- a/lib/gpc/fern.cpp +++ b/lib/gpc/fern.cpp @@ -31,6 +31,8 @@ // The Global Patch Collider // Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet // Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch) +#include "gpc/Fern.hpp" + #include #include #include @@ -38,7 +40,6 @@ #include #include "gpc/Feature.hpp" -#include "gpc/Fern.hpp" using namespace std; namespace gpc { @@ -57,11 +58,11 @@ OptimizerSettings ZeroOptimizer(int 
numResamples, return OptimizerSettings(0, 1, numResamples, onlyScoreNonSplitSamples, w1); } void Fern::evalSplit(std::vector& data, - std::vector& params, - FernSettings fernsetting, - OptimizerSettings optsetting, - int scoreUntilLevel, - splitStats& s) { + std::vector& params, + FernSettings fernsetting, + OptimizerSettings optsetting, + int scoreUntilLevel, + splitStats& s) { s.tp = 0; s.fn = 0; s.fp = 0; @@ -80,8 +81,7 @@ void Fern::evalSplit(std::vector& data, bool refDec, posDec, negDec; // Decisions need to be added into a codeword - Feature.getDecisions( - refDec, posDec, negDec, params[i], triplet); + Feature.getDecisions(refDec, posDec, negDec, params[i], triplet); if (refDec) ref++; if (posDec) pos++; if (negDec) neg++; @@ -118,8 +118,8 @@ void Fern::evalSplit(std::vector& data, s.convcomb = (1. - w2) * s.prec + w2 * s.rec; } void Fern::markSplitSamples(std::vector& data, - std::vector& params, - int numParams) { + std::vector& params, + int numParams) { for (auto& triplet : data) { // Evaluate triplet on all given parameters uint64_t ref = 0, pos = 0, neg = 0; @@ -129,8 +129,7 @@ void Fern::markSplitSamples(std::vector& data, neg <<= 1; // shift by one bool refDec, posDec, negDec; - Feature.getDecisions( - refDec, posDec, negDec, params[i], triplet); + Feature.getDecisions(refDec, posDec, negDec, params[i], triplet); if (refDec) ref++; if (posDec) pos++; if (negDec) neg++; @@ -147,7 +146,7 @@ void Fern::resetMarkOnSamples(std::vector& data) { } void Fern::train(std::vector& trainingSamples, - OptimizerSettings optsetting) { + OptimizerSettings optsetting) { splitStats stats; float maxScore = 0.f; SplitParams_t bestParams; @@ -155,9 +154,9 @@ void Fern::train(std::vector& trainingSamples, fernparams.resize(fernsettings.maxDepth); cout << setw(7) << "Level" << setw(10) << "Prec" << setw(10) << "Rec" - << setw(10) << "Har" << setw(8) << "Tot" << setw(8) << "TP" - << setw(8) << "FP" << setw(8) << "FN" << setw(6) << "scale" - << setw(5) << "tau" << setw(5) 
<< "i" << setw(5) << "j" << endl; + << setw(10) << "Har" << setw(8) << "Tot" << setw(8) << "TP" << setw(8) + << "FP" << setw(8) << "FN" << setw(6) << "scale" << setw(5) << "tau" + << setw(5) << "i" << setw(5) << "j" << endl; if (optsetting.onlyScoreNonSplitSamples_) resetMarkOnSamples(trainingSamples); for (int level = 0; level < fernsettings.maxDepth; level++) { @@ -166,8 +165,7 @@ void Fern::train(std::vector& trainingSamples, // Samples a hyperplane in the requested scale Feature.sampleHyperplane(fernsettings.scale, fernparams[level]); // Iterates over a small range of tau (intercept) - for (int tau = optsetting.taulo_; tau < optsetting.tauhi_; - tau++) { + for (int tau = optsetting.taulo_; tau < optsetting.tauhi_; tau++) { fernparams[level].tau = tau; // Score hyperplane set we have so far evalSplit(trainingSamples, @@ -202,7 +200,5 @@ std::vector Fern::getParameters() { return fernparams; } int Fern::getScale() { return fernsettings.scale; } - - } // namespace training } // namespace gpc diff --git a/lib/gpc/forest.cpp b/lib/gpc/forest.cpp index 87e0582..940eb33 100644 --- a/lib/gpc/forest.cpp +++ b/lib/gpc/forest.cpp @@ -32,7 +32,7 @@ // Shenlong Wang, Sean Ryan Fanello, Christoph Rhemann, Shahram Izadi, Pushmeet // Kohli CVPR 2016 Code Author: Niklaus Bamert (bamertn@ethz.ch) #include -//#include +// #include #include #include #include @@ -43,32 +43,32 @@ #include // GPC includes +#include + #include "gpc/Feature.hpp" #include "gpc/SintelOpticalFlow.hpp" #include "gpc/SintelStereo.hpp" #include "gpc/buffer.hpp" -#include "gpc/kernels/sobel.hpp" +#include "gpc/forest.hpp" +#include "gpc/hashmatch.hpp" #include "gpc/kernels/box.hpp" #include "gpc/kernels/gpc.hpp" +#include "gpc/kernels/sobel.hpp" #include "gpc/kernels/utils.hpp" -#include "gpc/hashmatch.hpp" -#include "gpc/forest.hpp" -#include - namespace gpc { namespace inference { - /** - * @brief Computes sparse matches on a pair of rectified and smoothed - * images. 
Here the src and tar images refer to the left and right images, - * respectively. - * - * @param src Preprocessed source(left) image - * @param tar Preprocessed target(right) image - * @param fastmask forest mask of relative integer offsets. - * - * @return - */ +/** + * @brief Computes sparse matches on a pair of rectified and smoothed + * images. Here the src and tar images refer to the left and right images, + * respectively. + * + * @param src Preprocessed source(left) image + * @param tar Preprocessed target(right) image + * @param fastmask forest mask of relative integer offsets. + * + * @return + */ std::vector Forest::depthPriorFast( PreprocessedImage& src, PreprocessedImage& tar, @@ -87,12 +87,13 @@ std::vector Forest::depthPriorFast( } // Use sort method for matching if (settings.useHashtable_ == false) { - t0 = sysTick(); + t0 = sysTick(); std::vector corr = findCorrespondences(statesSrc, statesTar); - t1 = sysTick(); - std::cout << "findCorrespondences (without allocation): " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl; - std::cout << "length src: " << statesSrc.size() << std::endl; + t1 = sysTick(); + std::cout << "findCorrespondences (without allocation): " + << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl; + std::cout << "length src: " << statesSrc.size() << std::endl; return corr; } // Use hashtable matching @@ -100,9 +101,8 @@ std::vector Forest::depthPriorFast( for (auto& q : statesSrc) q.srcDescr = true; for (auto& q : statesTar) q.srcDescr = false; - ndb::Hashmatch hm( - 214673, - statesSrc.size() + statesTar.size()); + ndb::Hashmatch hm(214673, + statesSrc.size() + statesTar.size()); std::vector> corr; for (auto& q : statesSrc) hm.insert(q); for (auto& q : statesTar) hm.insert(q); @@ -110,8 +110,7 @@ std::vector Forest::depthPriorFast( // Store vertices in a format that is more convenient for us: std::vector corr2; for (auto& e : corr) { - corr2.push_back( - ndb::Correspondence(e.first.point, e.second.point)); + 
corr2.push_back(ndb::Correspondence(e.first.point, e.second.point)); } return corr2; @@ -207,7 +206,7 @@ std::vector Forest::evalFastMaskOnSubsetSSE( * @return the preprocessed image */ PreprocessedImage Forest::preprocessImage(ndb::Buffer& img, - InferenceSettings settings) { + InferenceSettings settings) { assert((settings.gradientThreshold_ >= 0 && settings.gradientThreshold_ <= 255) && "gradientThreshold needs to be within 0...255"); @@ -221,11 +220,11 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer& img, img.cols(), img.rows(), settings.numThreads_); - //4.2 *10^-5 ms + // 4.2 *10^-5 ms smooth.clearBoundary(); ndb::Buffer grad(img.rows(), img.cols()); grad.width = img.width; - //4.2*10-5ms (unclear how) + // 4.2*10-5ms (unclear how) ndb::sobel(img.data(), grad.data(), img.cols(), @@ -265,21 +264,20 @@ PreprocessedImage Forest::preprocessImage(ndb::Buffer& img, * @return Set of correspondences (ptSrc, ptTar) where * ptSrc and ptTar are points in the source and target images, respectively. 
*/ -std::vector Forest::stereoMatch(PreprocessedImage& simg, - PreprocessedImage& timg, - FilterMask& forestmask, - InferenceSettings settings) { +std::vector Forest::stereoMatch( + PreprocessedImage& simg, + PreprocessedImage& timg, + FilterMask& forestmask, + InferenceSettings settings) { // make sure the delivered mask matches the image dimensions - assert( - (forestmask.width == simg.smooth.cols() && - forestmask.height == simg.smooth.rows()) && - "Source Image: dimension does not fit dimension of supplied forest " - "mask"); - assert( - (forestmask.width == timg.smooth.cols() && - forestmask.height == simg.smooth.rows()) && - "Targe Image: dimension does not fit dimension of supplied forest " - "mask"); + assert((forestmask.width == simg.smooth.cols() && + forestmask.height == simg.smooth.rows()) && + "Source Image: dimension does not fit dimension of supplied forest " + "mask"); + assert((forestmask.width == timg.smooth.cols() && + forestmask.height == simg.smooth.rows()) && + "Targe Image: dimension does not fit dimension of supplied forest " + "mask"); bool m_debug = false; // Match std::vector corr = @@ -303,9 +301,9 @@ std::vector Forest::stereoMatch(PreprocessedImage& simg, * of a point in the left image and d the disparity. 
*/ std::vector Forest::rectifiedMatch(PreprocessedImage& simg, - PreprocessedImage& timg, - FilterMask& forestmask, - InferenceSettings settings) { + PreprocessedImage& timg, + FilterMask& forestmask, + InferenceSettings settings) { // Do matching std::vector corr = stereoMatch(simg, timg, forestmask, settings); @@ -379,4 +377,4 @@ FilterMask Forest::readForest(std::string path, int width, int height) { } } // namespace inference -} +} // namespace gpc diff --git a/lib/gpc/forest.hpp b/lib/gpc/forest.hpp index 6463aec..6226b83 100644 --- a/lib/gpc/forest.hpp +++ b/lib/gpc/forest.hpp @@ -61,7 +61,7 @@ inline std::chrono::high_resolution_clock::time_point sysTick() { return std::chrono::high_resolution_clock::now(); } inline float tickToMs(std::chrono::high_resolution_clock::time_point t0, - std::chrono::high_resolution_clock::time_point t1) { + std::chrono::high_resolution_clock::time_point t1) { return std::abs( 1000. * std::chrono::duration_cast>(t1 - t0) @@ -216,7 +216,7 @@ class Forest { std::vector& idx, FilterMask& fastmask, InferenceSettings& settings); - + /** * @brief Preprocesses an image. (smooth, binary sobel image and gradient * pixel indices) @@ -228,7 +228,7 @@ class Forest { */ PreprocessedImage preprocessImage(ndb::Buffer& img, InferenceSettings settings); - /** + /** * @brief Finds matches between two stereo images based on a given forest * mask. * @@ -261,7 +261,7 @@ class Forest { PreprocessedImage& timg, FilterMask& forestmask, InferenceSettings settings); - + /** * @brief Reads text-based forest format and returns a mask for a given * image size. 
diff --git a/lib/gpc/inference.hpp b/lib/gpc/inference.hpp index e074290..df1c840 100644 --- a/lib/gpc/inference.hpp +++ b/lib/gpc/inference.hpp @@ -48,11 +48,11 @@ #include "gpc/SintelOpticalFlow.hpp" #include "gpc/SintelStereo.hpp" #include "gpc/buffer.hpp" +#include "gpc/hashmatch.hpp" #include "gpc/kernels/box.hpp" -#include "gpc/kernels/sobel.hpp" #include "gpc/kernels/gpc.hpp" +#include "gpc/kernels/sobel.hpp" #include "gpc/kernels/utils.hpp" -#include "gpc/hashmatch.hpp" /** * @brief The inference class of the GPC forest diff --git a/lib/gpc/kernels/box.cpp b/lib/gpc/kernels/box.cpp index 714fbb6..5b51b60 100644 --- a/lib/gpc/kernels/box.cpp +++ b/lib/gpc/kernels/box.cpp @@ -30,11 +30,13 @@ // Code Author: Niklaus Bamert (bamertn@ethz.ch) #include "gpc/kernels/box.hpp" -#include "gpc/kernels/utils.hpp" + #include + +#include "gpc/kernels/utils.hpp" namespace ndb { -namespace testing { - void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height); +namespace testing { +void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height); } void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { assert(width % 16 == 0 && "width must be multiple of 16!"); @@ -83,12 +85,12 @@ void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height) { void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) { int start = 1; int end = height - 3; - + int x, y; - __m128i one_third = _mm_set1_epi16(21846); // 2^16/3 + 1 - - __m128i *dst0 = (__m128i*)(blurred + width * start); - __m128i *dst1 = (__m128i*)(blurred + width * (start + 1)); + __m128i one_third = _mm_set1_epi16(21846); // 2^16/3 + 1 + + __m128i* dst0 = (__m128i*)(blurred + width * start); + __m128i* dst1 = (__m128i*)(blurred + width * (start + 1)); for (y = start; y < end; y += 2) { const uint8_t *row0, *row1, *row2, *row3; @@ -111,8 +113,10 @@ void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) { unpack8to16(s00, a00, b00); unpack8to16(s01, a01, b01); unpack8to16(s02, 
a02, b02); - ra00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + ra00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); // Row 1 Processing s00 = _mm_loadu_si128((__m128i*)(row1 - 1)); @@ -121,8 +125,10 @@ void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) { unpack8to16(s00, a00, b00); unpack8to16(s01, a01, b01); unpack8to16(s02, a02, b02); - ra01 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb01 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + ra01 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb01 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); // Row 2 Processing s00 = _mm_loadu_si128((__m128i*)(row2 - 1)); @@ -131,12 +137,16 @@ void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) { unpack8to16(s00, a00, b00); unpack8to16(s01, a01, b01); unpack8to16(s02, a02, b02); - ra02 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb02 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + ra02 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb02 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); // Accumulate rows 0, 1, 2 for dst0 - tmp0 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), one_third); - tmp1 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), one_third); + tmp0 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(ra00, ra01), ra02), one_third); + tmp1 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(rb00, rb01), rb02), one_third); pack16to8(tmp0, tmp1, res); _mm_store_si128(dst0++, res); @@ -147,16 +157,23 @@ void 
boxSSE(uint8_t* in, uint8_t* blurred, int width, int height) { unpack8to16(s00, a00, b00); unpack8to16(s01, a01, b01); unpack8to16(s02, a02, b02); - ra00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); - rb00 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); + ra00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(a00, a01), a02), one_third); + rb00 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(b00, b01), b02), one_third); // Accumulate rows 1, 2, 3 for dst1 - tmp0 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(ra01, ra02), ra00), one_third); - tmp1 = _mm_mulhi_epi16(_mm_adds_epi16(_mm_adds_epi16(rb01, rb02), rb00), one_third); + tmp0 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(ra01, ra02), ra00), one_third); + tmp1 = _mm_mulhi_epi16( + _mm_adds_epi16(_mm_adds_epi16(rb01, rb02), rb00), one_third); pack16to8(tmp0, tmp1, res); _mm_store_si128(dst1++, res); - row0 += 16; row1 += 16; row2 += 16; row3 += 16; + row0 += 16; + row1 += 16; + row2 += 16; + row3 += 16; } dst0 += width / 16; dst1 += width / 16; @@ -168,11 +185,11 @@ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads) { #if defined(__ARM_NEON) || defined(__aarch64__) testing::box_hwy(in, blurred, width, height); #else - #if HWY_TARGET == HWY_AVX2 - boxSSE(in, blurred, width, height); - #else - boxNaive(in, blurred, width, height); - #endif +#if HWY_TARGET == HWY_AVX2 + boxSSE(in, blurred, width, height); +#else + boxNaive(in, blurred, width, height); +#endif #endif } } // namespace ndb diff --git a/lib/gpc/kernels/box.hpp b/lib/gpc/kernels/box.hpp index eef0b3d..b00dc88 100644 --- a/lib/gpc/kernels/box.hpp +++ b/lib/gpc/kernels/box.hpp @@ -48,25 +48,21 @@ namespace ndb { void boxNaive(uint8_t* in, uint8_t* blurred, int width, int height); /** - * @brief boxfilter using SSE2 instructions. 
Loosely based on - * https://www.ignorantus.com/box_sse2/, published under - * the https://creativecommons.org/publicdomain/zero/1.0/ licence. - * - * @param in input image - * @param blurred The blurred - * @param[in] width The width - * @param[in] height The height - * @param[in] numThreads number of threads to use - */ + * @brief boxfilter using SSE2 instructions. Loosely based on + * https://www.ignorantus.com/box_sse2/, published under + * the https://creativecommons.org/publicdomain/zero/1.0/ licence. + * + * @param in input image + * @param blurred The blurred + * @param[in] width The width + * @param[in] height The height + * @param[in] numThreads number of threads to use + */ void box(uint8_t* in, uint8_t* blurred, int width, int height, int numThreads); #if HWY_TARGET == HWY_AVX2 -void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height); +void boxSSE(uint8_t* in, uint8_t* blurred, int width, int height); #endif -} +} // namespace ndb #endif - - - - diff --git a/lib/gpc/kernels/box_hwy.cpp b/lib/gpc/kernels/box_hwy.cpp index 3cc2736..ba2f5dd 100644 --- a/lib/gpc/kernels/box_hwy.cpp +++ b/lib/gpc/kernels/box_hwy.cpp @@ -1,17 +1,19 @@ -#define HWY_TARGET HWY_NEON +#define HWY_TARGET HWY_NEON #include -HWY_BEFORE_NAMESPACE(); +HWY_BEFORE_NAMESPACE(); namespace ndb { namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; - -void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, int width, int height) { +void BoxKernel(const uint8_t* HWY_RESTRICT in, + uint8_t* HWY_RESTRICT blurred, + int width, + int height) { const hn::ScalableTag d8; const hn::Half d8_h; const hn::Rebind d16; - + const size_t N = hn::Lanes(d8); const auto divisor = hn::Set(d16, (uint16_t)7282); @@ -20,46 +22,90 @@ void BoxKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT blurred, in const uint8_t* r1 = in + y * width; const uint8_t* r2 = in + (y + 1) * width; const uint8_t* r3 = in + (y + 2) * width; - + uint8_t* out0 = blurred + y * width + 
1; uint8_t* out1 = blurred + (y + 1) * width + 1; for (int x = 0; x < width; x += N) { - auto v00 = hn::LoadU(d8, r0+x); auto v01 = hn::LoadU(d8, r0+x+1); auto v02 = hn::LoadU(d8, r0+x+2); - auto v10 = hn::LoadU(d8, r1+x); auto v11 = hn::LoadU(d8, r1+x+1); auto v12 = hn::LoadU(d8, r1+x+2); - auto v20 = hn::LoadU(d8, r2+x); auto v21 = hn::LoadU(d8, r2+x+1); auto v22 = hn::LoadU(d8, r2+x+2); - auto v30 = hn::LoadU(d8, r3+x); auto v31 = hn::LoadU(d8, r3+x+1); auto v32 = hn::LoadU(d8, r3+x+2); + auto v00 = hn::LoadU(d8, r0 + x); + auto v01 = hn::LoadU(d8, r0 + x + 1); + auto v02 = hn::LoadU(d8, r0 + x + 2); + auto v10 = hn::LoadU(d8, r1 + x); + auto v11 = hn::LoadU(d8, r1 + x + 1); + auto v12 = hn::LoadU(d8, r1 + x + 2); + auto v20 = hn::LoadU(d8, r2 + x); + auto v21 = hn::LoadU(d8, r2 + x + 1); + auto v22 = hn::LoadU(d8, r2 + x + 2); + auto v30 = hn::LoadU(d8, r3 + x); + auto v31 = hn::LoadU(d8, r3 + x + 1); + auto v32 = hn::LoadU(d8, r3 + x + 2); // Lower Half Math - auto s1_lo = hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v11)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v10)), hn::PromoteTo(d16, hn::LowerHalf(v12)))); - auto s2_lo = hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v21)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v20)), hn::PromoteTo(d16, hn::LowerHalf(v22)))); - - auto row0_lo = hn::Add(hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v01)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v00)), hn::PromoteTo(d16, hn::LowerHalf(v02)))), hn::Add(s1_lo, s2_lo)); - auto row1_lo = hn::Add(hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v30)), hn::PromoteTo(d16, hn::LowerHalf(v32)))), hn::Add(s1_lo, s2_lo)); + auto s1_lo = + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v11)), + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v10)), + hn::PromoteTo(d16, hn::LowerHalf(v12)))); + auto s2_lo = + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v21)), + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v20)), + hn::PromoteTo(d16, hn::LowerHalf(v22)))); + + auto row0_lo = 
hn::Add( + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v01)), + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v00)), + hn::PromoteTo(d16, hn::LowerHalf(v02)))), + hn::Add(s1_lo, s2_lo)); + auto row1_lo = hn::Add( + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v31)), + hn::Add(hn::PromoteTo(d16, hn::LowerHalf(v30)), + hn::PromoteTo(d16, hn::LowerHalf(v32)))), + hn::Add(s1_lo, s2_lo)); // Upper Half Math - auto s1_hi = hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v10)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)))); - auto s2_hi = hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v20)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v22)))); - - auto row0_hi = hn::Add(hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v01)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v00)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v02)))), hn::Add(s1_hi, s2_hi)); - auto row1_hi = hn::Add(hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)), hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v30)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)))), hn::Add(s1_hi, s2_hi)); + auto s1_hi = + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v10)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)))); + auto s2_hi = + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v20)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v22)))); - hn::StoreU(hn::OrderedDemote2To(d8, hn::MulHigh(row0_lo, divisor), hn::MulHigh(row0_hi, divisor)), d8, out0 + x); - hn::StoreU(hn::OrderedDemote2To(d8, hn::MulHigh(row1_lo, divisor), hn::MulHigh(row1_hi, divisor)), d8, out1 + x); + auto row0_hi = hn::Add( + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v01)), + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v00)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v02)))), + hn::Add(s1_hi, s2_hi)); + auto row1_hi = hn::Add( + 
hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)), + hn::Add(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v30)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)))), + hn::Add(s1_hi, s2_hi)); + + hn::StoreU(hn::OrderedDemote2To(d8, + hn::MulHigh(row0_lo, divisor), + hn::MulHigh(row0_hi, divisor)), + d8, + out0 + x); + hn::StoreU(hn::OrderedDemote2To(d8, + hn::MulHigh(row1_lo, divisor), + hn::MulHigh(row1_hi, divisor)), + d8, + out1 + x); } } } -} // namespace HWY_NAMESPACE -} // namespace ndb +} // namespace HWY_NAMESPACE +} // namespace ndb HWY_AFTER_NAMESPACE(); namespace ndb { namespace testing { -//#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON - void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) { - //ndb::N_NEON::BoxKernel(in, blurred, width, height); - HWY_STATIC_DISPATCH(BoxKernel)(in, blurred, width, height); - } -//#endif -} +// #if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON +void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height) { + // ndb::N_NEON::BoxKernel(in, blurred, width, height); + HWY_STATIC_DISPATCH(BoxKernel)(in, blurred, width, height); } +// #endif +} // namespace testing +} // namespace ndb diff --git a/lib/gpc/kernels/box_hwy.hpp b/lib/gpc/kernels/box_hwy.hpp index 6c256b0..ae39d71 100644 --- a/lib/gpc/kernels/box_hwy.hpp +++ b/lib/gpc/kernels/box_hwy.hpp @@ -1,4 +1,4 @@ -#ifndef __NDB__KERNEL_BOX_HWY +#ifndef __NDB__KERNEL_BOX_HWY #define __NDB__KERNEL_BOX_HWY #include @@ -6,12 +6,12 @@ namespace ndb { namespace testing { - /** - * Entry point for benchmarking the MulHigh (approximate) version. - */ - void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height); +/** + * Entry point for benchmarking the MulHigh (approximate) version. 
+ */ +void box_hwy(uint8_t* in, uint8_t* blurred, int width, int height); -} +} // namespace testing } // namespace ndb diff --git a/lib/gpc/kernels/census.cpp b/lib/gpc/kernels/census.cpp index 6235b06..8b265cc 100644 --- a/lib/gpc/kernels/census.cpp +++ b/lib/gpc/kernels/census.cpp @@ -28,8 +28,9 @@ // POSSIBILITY OF SUCH DAMAGE. // // Code Author: Niklaus Bamert (bamertn@ethz.ch) -#include #include "gpc/kernels/census.hpp" + +#include void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height) { uint32_t val; uint32_t* dst; @@ -200,5 +201,3 @@ void census5x5(uint8_t* in, uint32_t* census, int width, int height) { #endif } // census5x5 - - diff --git a/lib/gpc/kernels/census.hpp b/lib/gpc/kernels/census.hpp index 8353a4e..054a45f 100644 --- a/lib/gpc/kernels/census.hpp +++ b/lib/gpc/kernels/census.hpp @@ -45,7 +45,6 @@ namespace ndb { */ void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height); - /** * @brief 5x5 dense census transform of input image. binary codes are returned * as a 32bit image @@ -57,5 +56,5 @@ void census5x5Naive(uint8_t* in, uint32_t* census, int width, int height); */ void census5x5(uint8_t* in, uint32_t* census, int width, int height); -} +} // namespace ndb #endif diff --git a/lib/gpc/kernels/gpc.cpp b/lib/gpc/kernels/gpc.cpp index 62ffa3e..7bdf97b 100644 --- a/lib/gpc/kernels/gpc.cpp +++ b/lib/gpc/kernels/gpc.cpp @@ -28,8 +28,9 @@ // POSSIBILITY OF SUCH DAMAGE. 
// // Code Author: Niklaus Bamert (bamertn@ethz.ch) -#include #include "gpc/kernels/gpc.hpp" + +#include namespace ndb { void gpcFilterNaive(uint8_t* in, const uint8_t* grad, @@ -76,8 +77,7 @@ void gpcFilterTauNaive(uint8_t* in, gpc[k] = tmp; j++; } -} - +} #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) bool isAllZeros(__m128i xmm) { @@ -85,13 +85,13 @@ bool isAllZeros(__m128i xmm) { 0xFFFF; } void gpcFilterSSE(uint8_t* in, - const uint8_t* grad, - uint32_t* gpc, - std::vector fastmask, - std::vector& idx, - int width, - int height) { - const int start = 13; + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector& idx, + int width, + int height) { + const int start = 13; const int end = height - 15; __m128i zero = _mm_set1_epi8(0); __m128i one = _mm_set1_epi8(1); @@ -118,8 +118,7 @@ void gpcFilterSSE(uint8_t* in, for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { out[k] |= _mm_and_si128( _mm_cmpgt_epu8( - _mm_lddqu_si128( - (__m128i*)(center + fastmask[i])), + _mm_lddqu_si128((__m128i*)(center + fastmask[i])), _mm_lddqu_si128( (__m128i*)(center + fastmask[i + 1]))), bitMask); @@ -153,30 +152,30 @@ void gpcFilter(uint8_t* in, std::vector fastmask, std::vector& idx, int width, - int height){ + int height) { assert(width % 16 == 0 && "width must be multiple of 16!"); #if defined(__ARM_NEON) || defined(__aarch64__) // Replace with call to highway gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height); #else - #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) - gpcFilterSSE(in, grad, gpc, fastmask, idx, width, height); - #else - gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height); +#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) + gpcFilterSSE(in, grad, gpc, fastmask, idx, width, height); +#else + gpcFilterNaive(in, grad, gpc, fastmask, idx, width, height); #endif #endif } #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) void gpcFilterTauSSE(uint8_t* in, - const uint8_t* grad, - uint32_t* gpc, - std::vector fastmask, - std::vector 
tau, - std::vector& idx, - int width, - int height){ - const int start = 13; + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector tau, + std::vector& idx, + int width, + int height) { + const int start = 13; const int end = height - 15; __m128i zero = _mm_set1_epi8(0); __m128i one = _mm_set1_epi8(1); @@ -203,8 +202,7 @@ void gpcFilterTauSSE(uint8_t* in, for (uint8_t i = 0; i < fastmask.size() && i < 64; i += 2) { out[k] |= _mm_and_si128( _mm_cmpgt_epu8( - _mm_lddqu_si128( - (__m128i*)(center + fastmask[i])), + _mm_lddqu_si128((__m128i*)(center + fastmask[i])), _mm_subs_epi8( _mm_lddqu_si128( (__m128i*)(center + fastmask[i + 1])), @@ -243,19 +241,17 @@ void gpcFilterTau(uint8_t* in, std::vector tau, std::vector& idx, int width, - int height){ + int height) { assert(width % 16 == 0 && "width must be multiple of 16!"); #if defined(__ARM_NEON) || defined(__aarch64__) // Replace with call to highway gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height); #else - #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) - gpcFilterTauSSE(in, grad, gpc, fastmask, tau, idx, width, height); - #else - gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height); +#if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) + gpcFilterTauSSE(in, grad, gpc, fastmask, tau, idx, width, height); +#else + gpcFilterTauNaive(in, grad, gpc, fastmask, tau, idx, width, height); #endif #endif - } -} // namespace ndb - +} // namespace ndb diff --git a/lib/gpc/kernels/gpc.hpp b/lib/gpc/kernels/gpc.hpp index 49db7ae..bdd4bf4 100644 --- a/lib/gpc/kernels/gpc.hpp +++ b/lib/gpc/kernels/gpc.hpp @@ -58,7 +58,6 @@ void gpcFilter(uint8_t* in, int width, int height); - /** * @brief Applies a gpc filter defined by the pixel-difference tests in * fastmask. 
Naive implementation @@ -131,24 +130,22 @@ void gpcFilterTauNaive(uint8_t* in, #if (HWY_ARCH_X86) && (HWY_TARGET == HWY_AVX2) bool isAllZeros(__m128i xmm); void gpcFilterTauSSE(uint8_t* in, + const uint8_t* grad, + uint32_t* gpc, + std::vector fastmask, + std::vector tau, + std::vector& idx, + int width, + int height); +void gpcFilterSSE(uint8_t* in, const uint8_t* grad, uint32_t* gpc, std::vector fastmask, - std::vector tau, std::vector& idx, int width, int height); -void gpcFilterSSE(uint8_t* in, - const uint8_t* grad, - uint32_t* gpc, - std::vector fastmask, - std::vector& idx, - int width, - int height); - #endif - -} +} // namespace ndb #endif diff --git a/lib/gpc/kernels/gpc_hwy.cpp b/lib/gpc/kernels/gpc_hwy.cpp index 21dae2d..87b0f20 100644 --- a/lib/gpc/kernels/gpc_hwy.cpp +++ b/lib/gpc/kernels/gpc_hwy.cpp @@ -1,27 +1,27 @@ -//#define HWY_TARGET HWY_NEON +// #define HWY_TARGET HWY_NEON #include "gpc_hwy.hpp" -HWY_BEFORE_NAMESPACE(); +HWY_BEFORE_NAMESPACE(); namespace ndb { namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; -//dense! +// dense! #include namespace hn = hwy::HWY_NAMESPACE; // Dense Version void GPCKernel(const uint8_t* HWY_RESTRICT in, - const uint8_t* HWY_RESTRICT grad, + const uint8_t* HWY_RESTRICT grad, uint32_t* HWY_RESTRICT gpc, const std::vector& fastmask, - const std::vector& tau, - int width, int height) { - + const std::vector& tau, + int width, + int height) { const hn::ScalableTag d8; const hn::ScalableTag d32; const size_t N = hn::Lanes(d8); - + const int border = 13; const auto v_zero8 = hn::Zero(d8); const auto v_one8 = hn::Set(d8, 1); @@ -36,76 +36,80 @@ void GPCKernel(const uint8_t* HWY_RESTRICT in, // We use four 8-bit registers to build the 32 bits. // This keeps the entire hot-loop in 8-bit space. 
- auto v_acc0 = hn::Zero(d8); // Bits 0-7 - auto v_acc1 = hn::Zero(d8); // Bits 8-15 - auto v_acc2 = hn::Zero(d8); // Bits 16-23 - auto v_acc3 = hn::Zero(d8); // Bits 24-31 + auto v_acc0 = hn::Zero(d8); // Bits 0-7 + auto v_acc1 = hn::Zero(d8); // Bits 8-15 + auto v_acc2 = hn::Zero(d8); // Bits 16-23 + auto v_acc3 = hn::Zero(d8); // Bits 24-31 // Pass 1: Bits 0-7 for (int i = 0; i < 16; i += 2) { v_acc0 = hn::Add(v_acc0, v_acc0); - auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), - hn::LoadU(d8, in + k + fm[i+1])); + auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), + hn::LoadU(d8, in + k + fm[i + 1])); v_acc0 = hn::Or(v_acc0, hn::IfThenElse(mask, v_one8, v_zero8)); } // Pass 2: Bits 8-15 for (int i = 16; i < 32; i += 2) { v_acc1 = hn::Add(v_acc1, v_acc1); - auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), - hn::LoadU(d8, in + k + fm[i+1])); + auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), + hn::LoadU(d8, in + k + fm[i + 1])); v_acc1 = hn::Or(v_acc1, hn::IfThenElse(mask, v_one8, v_zero8)); } // Pass 3: Bits 16-23 for (int i = 32; i < 48; i += 2) { v_acc2 = hn::Add(v_acc2, v_acc2); - auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), - hn::LoadU(d8, in + k + fm[i+1])); + auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), + hn::LoadU(d8, in + k + fm[i + 1])); v_acc2 = hn::Or(v_acc2, hn::IfThenElse(mask, v_one8, v_zero8)); } // Pass 4: Bits 24-31 for (int i = 48; i < 64; i += 2) { v_acc3 = hn::Add(v_acc3, v_acc3); - auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), - hn::LoadU(d8, in + k + fm[i+1])); + auto mask = hn::Gt(hn::LoadU(d8, in + k + fm[i]), + hn::LoadU(d8, in + k + fm[i + 1])); v_acc3 = hn::Or(v_acc3, hn::IfThenElse(mask, v_one8, v_zero8)); } - // Final Assembly: Promote the four 8-bit chunks into 32-bit results. - // We use PromoteUpper/Lower to widen the data. - // N is the number of 8-bit lanes. We need to store N/4 results in d32. 
- - // To be perfectly safe across all Highway targets, we extract and combine: + // Final Assembly: Promote the four 8-bit chunks into 32-bit + // results. We use PromoteUpper/Lower to widen the data. N is the + // number of 8-bit lanes. We need to store N/4 results in d32. + + // To be perfectly safe across all Highway targets, we extract and + // combine: for (size_t lane = 0; lane < N; ++lane) { - uint32_t final_val = (uint32_t(hn::ExtractLane(v_acc0, lane)) << 24) | - (uint32_t(hn::ExtractLane(v_acc1, lane)) << 16) | - (uint32_t(hn::ExtractLane(v_acc2, lane)) << 8) | - (uint32_t(hn::ExtractLane(v_acc3, lane))); + uint32_t final_val = + (uint32_t(hn::ExtractLane(v_acc0, lane)) << 24) | + (uint32_t(hn::ExtractLane(v_acc1, lane)) << 16) | + (uint32_t(hn::ExtractLane(v_acc2, lane)) << 8) | + (uint32_t(hn::ExtractLane(v_acc3, lane))); row_out[x + lane] = final_val; } } } } void GPCKerneli(const uint8_t* HWY_RESTRICT in, - const uint8_t* HWY_RESTRICT grad, - uint32_t* HWY_RESTRICT gpc, - const std::vector& fastmask, - const std::vector& tau, - int width, int height) { + const uint8_t* HWY_RESTRICT grad, + uint32_t* HWY_RESTRICT gpc, + const std::vector& fastmask, + const std::vector& tau, + int width, + int height) { // We use the ScalableTag, but we will "Narrow" our view manually const hn::ScalableTag d32; - const hn::Rebind d8_n; // Same number of lanes as d32 - - const size_t N = hn::Lanes(d32); + const hn::Rebind + d8_n; // Same number of lanes as d32 + + const size_t N = hn::Lanes(d32); const auto v_zero = hn::Zero(d32); const bool use_tau = !tau.empty(); for (int y = 0; y < height; ++y) { for (int x = 0; x < width; x += N) { const uint8_t* centerGrad = grad + y * width + x; - + // 1. 
Load the gradient bytes for the current N lanes auto v_grad = hn::LoadU(d8_n, centerGrad); @@ -117,12 +121,14 @@ void GPCKerneli(const uint8_t* HWY_RESTRICT in, auto v_tmp = hn::Zero(d32); for (size_t i = 0; i < fastmask.size(); i += 2) { - v_tmp = hn::ShiftLeft<1>(v_tmp); + v_tmp = hn::ShiftLeft<1>(v_tmp); // 3. The "Promotion" that actually works on all platforms: // Promote N lanes of uint8 to N lanes of uint32 - auto v1 = hn::PromoteTo(d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i])); - auto v2 = hn::PromoteTo(d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i + 1])); + auto v1 = hn::PromoteTo( + d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i])); + auto v2 = hn::PromoteTo( + d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i + 1])); hn::Mask mask; if (use_tau) { @@ -132,7 +138,8 @@ void GPCKerneli(const uint8_t* HWY_RESTRICT in, mask = hn::Gt(v1, v2); } - v_tmp = hn::Add(v_tmp, hn::IfThenElse(mask, hn::Set(d32, 1), v_zero)); + v_tmp = hn::Add(v_tmp, + hn::IfThenElse(mask, hn::Set(d32, 1), v_zero)); } hn::StoreU(v_tmp, d32, gpc + y * width + x); @@ -140,18 +147,20 @@ void GPCKerneli(const uint8_t* HWY_RESTRICT in, } } -} // namespace HWY_NAMESPACE -} // namespace ndb +} // namespace HWY_NAMESPACE +} // namespace ndb HWY_AFTER_NAMESPACE(); namespace ndb { namespace testing { - void gpc_hwy(uint8_t* in, uint8_t* grad, uint32_t* HWY_RESTRICT gpc, - const std::vector& fastmask, - const std::vector& tau, int width, int height) { - - HWY_STATIC_DISPATCH(GPCKernel)(in, grad, gpc, fastmask, tau, width, height); - - } -} +void gpc_hwy(uint8_t* in, + uint8_t* grad, + uint32_t* HWY_RESTRICT gpc, + const std::vector& fastmask, + const std::vector& tau, + int width, + int height) { + HWY_STATIC_DISPATCH(GPCKernel)(in, grad, gpc, fastmask, tau, width, height); } +} // namespace testing +} // namespace ndb diff --git a/lib/gpc/kernels/gpc_hwy.hpp b/lib/gpc/kernels/gpc_hwy.hpp index f49d05a..8a83751 100644 --- a/lib/gpc/kernels/gpc_hwy.hpp +++ 
b/lib/gpc/kernels/gpc_hwy.hpp @@ -1,14 +1,20 @@ -#ifndef __NDB__KERNEL_GPC_HWY +#ifndef __NDB__KERNEL_GPC_HWY #define __NDB__KERNEL_GPC_HWY #include + #include namespace ndb { namespace testing { - void gpc_hwy(uint8_t* in, uint8_t* grad, uint32_t* HWY_RESTRICT gpc, const std::vector& fastmask, const std::vector& tau, int width, int height); - +void gpc_hwy(uint8_t* in, + uint8_t* grad, + uint32_t* HWY_RESTRICT gpc, + const std::vector& fastmask, + const std::vector& tau, + int width, + int height); } diff --git a/lib/gpc/kernels/sobel.cpp b/lib/gpc/kernels/sobel.cpp index 7867cd3..becaf50 100644 --- a/lib/gpc/kernels/sobel.cpp +++ b/lib/gpc/kernels/sobel.cpp @@ -28,12 +28,15 @@ // POSSIBILITY OF SUCH DAMAGE. // // Code Author: Niklaus Bamert (bamertn@ethz.ch) -#include #include "gpc/kernels/sobel.hpp" + +#include + #include "gpc/kernels/utils.hpp" namespace ndb { -namespace testing { - void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold); +namespace testing { +void sobel_hwy( + uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold); } void sobelNaive( uint8_t* in, uint8_t* gradient, int width, int height, uint8_t threshold) { @@ -59,7 +62,8 @@ void sobelNaive( // boundary) (unoptimized) for (int iy = 1; iy < height - 1; iy++) { for (int ix = 0; ix < width; ix++) { - // Approximate division by 9 with fixed-point multiplication (2^16/9 = 7282) + // Approximate division by 9 with fixed-point multiplication (2^16/9 + // = 7282) int16_t sum_x = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33); int16_t sum_y = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33); @@ -81,23 +85,25 @@ void sobelNaive( } } } -//#ifdef _INTRINSICS_SSE +// #ifdef _INTRINSICS_SSE #if HWY_TARGET == HWY_AVX2 #include -void sobelSSE(const uint8_t* in, uint8_t* blurred, - int width, int start, int end, - uint8_t threshold) { - +void sobelSSE(const uint8_t* in, + uint8_t* blurred, + int width, + int start, + int end, + uint8_t threshold) { __m128i 
zero = _mm_setzero_si128(); - __m128i one_ninth = _mm_set1_epi16(7282); // 2^16/9 + __m128i one_ninth = _mm_set1_epi16(7282); // 2^16/9 __m128i binThres = _mm_set1_epi16(threshold * threshold); for (int y = start; y < end; y++) { const uint8_t* row1 = in + y * width; const uint8_t* row0 = row1 - width; const uint8_t* row2 = row1 + width; - + // Output destination for this specific row __m128i* dst = (__m128i*)(blurred + y * width + 1); @@ -109,23 +115,31 @@ void sobelSSE(const uint8_t* in, uint8_t* blurred, // Load and unpack 3x3 neighborhood (excluding center a11/b11) unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x - 1)), a00, b00); - unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x)), a01, b01); + unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x)), a01, b01); unpack8to16(_mm_loadu_si128((__m128i*)(row0 + x + 1)), a02, b02); unpack8to16(_mm_loadu_si128((__m128i*)(row1 + x - 1)), a10, b10); unpack8to16(_mm_loadu_si128((__m128i*)(row1 + x + 1)), a12, b12); unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x - 1)), a20, b20); - unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x)), a21, b21); + unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x)), a21, b21); unpack8to16(_mm_loadu_si128((__m128i*)(row2 + x + 1)), a22, b22); // --- SX Calculation --- // Left col (1,2,1) - raA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a00, a20), _mm_add_epi16(a10, a10)), one_ninth); - rbA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b00, b20), _mm_add_epi16(b10, b10)), one_ninth); + raA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(a00, a20), _mm_add_epi16(a10, a10)), + one_ninth); + rbA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(b00, b20), _mm_add_epi16(b10, b10)), + one_ninth); // Right col (-1,-2,-1) - raB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a02, a22), _mm_add_epi16(a12, a12)), one_ninth); - rbB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b02, b22), _mm_add_epi16(b12, b12)), one_ninth); + raB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(a02, a22), _mm_add_epi16(a12, 
a12)), + one_ninth); + rbB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(b02, b22), _mm_add_epi16(b12, b12)), + one_ninth); tmpa = _mm_sub_epi16(raA, raB); tmpb = _mm_sub_epi16(rbA, rbB); @@ -134,11 +148,19 @@ void sobelSSE(const uint8_t* in, uint8_t* blurred, // --- SY Calculation --- // Top row (1,2,1) - raA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a00, a02), _mm_add_epi16(a01, a01)), one_ninth); - rbA = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b00, b02), _mm_add_epi16(b01, b01)), one_ninth); + raA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(a00, a02), _mm_add_epi16(a01, a01)), + one_ninth); + rbA = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(b00, b02), _mm_add_epi16(b01, b01)), + one_ninth); // Bottom row (-1,-2,-1) - raB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(a20, a22), _mm_add_epi16(a21, a21)), one_ninth); - rbB = _mm_mulhi_epi16(_mm_add_epi16(_mm_add_epi16(b20, b22), _mm_add_epi16(b21, b21)), one_ninth); + raB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(a20, a22), _mm_add_epi16(a21, a21)), + one_ninth); + rbB = _mm_mulhi_epi16( + _mm_add_epi16(_mm_add_epi16(b20, b22), _mm_add_epi16(b21, b21)), + one_ninth); tmpa = _mm_sub_epi16(raA, raB); tmpb = _mm_sub_epi16(rbA, rbB); @@ -147,8 +169,10 @@ void sobelSSE(const uint8_t* in, uint8_t* blurred, // --- Thresholding and Packing --- pack16to8( - _mm_unpacklo_epi8(_mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), zero), - _mm_unpacklo_epi8(_mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), zero), + _mm_unpacklo_epi8( + _mm_cmpgt_epi16(_mm_adds_epi16(sxa, sya), binThres), zero), + _mm_unpacklo_epi8( + _mm_cmpgt_epi16(_mm_adds_epi16(sxb, syb), binThres), zero), res); _mm_storeu_si128(dst++, res); @@ -164,14 +188,14 @@ void sobel(uint8_t* in, int numThreads) { assert(width % 16 == 0 && "width must be multiple of 16!"); #if defined(__ARM_NEON) || defined(__aarch64__) - sobelNaive(in, blurred, width, height, threshold); - //testing::sobel_hwy(in, blurred, width, height, threshold); // 
not exact! + sobelNaive(in, blurred, width, height, threshold); + // testing::sobel_hwy(in, blurred, width, height, threshold); // not exact! +#else +#ifndef _INTRINSICS_SSE + sobelNaive(in, blurred, width, height, threshold); #else - #ifndef _INTRINSICS_SSE - sobelNaive(in, blurred, width, height, threshold); - #else - sobelSSE(in, blurred, width, 1, height - 1, threshold); - #endif + sobelSSE(in, blurred, width, 1, height - 1, threshold); +#endif #endif } -} // namespace ndb +} // namespace ndb diff --git a/lib/gpc/kernels/sobel.hpp b/lib/gpc/kernels/sobel.hpp index c14b950..312a70c 100644 --- a/lib/gpc/kernels/sobel.hpp +++ b/lib/gpc/kernels/sobel.hpp @@ -35,10 +35,13 @@ namespace ndb { #if HWY_TARGET == HWY_AVX2 -void sobelSSE(const uint8_t* in, uint8_t* blurred, - int width, int start, int end, - uint8_t threshold); - +void sobelSSE(const uint8_t* in, + uint8_t* blurred, + int width, + int start, + int end, + uint8_t threshold); + #endif /** * @brief Naive 3x3 sobel filter implementation @@ -70,5 +73,5 @@ void sobel(uint8_t* in, int height, uint8_t threshold, int numThreads); -} +} // namespace ndb #endif diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp index fe93031..14ad593 100644 --- a/lib/gpc/kernels/sobel_hwy.cpp +++ b/lib/gpc/kernels/sobel_hwy.cpp @@ -1,20 +1,23 @@ -//#define HWY_TARGET HWY_NEON +// #define HWY_TARGET HWY_NEON #include -HWY_BEFORE_NAMESPACE(); +HWY_BEFORE_NAMESPACE(); namespace ndb { namespace HWY_NAMESPACE { namespace hn = hwy::HWY_NAMESPACE; -void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, - int width, int height, uint8_t threshold) { +void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in, + uint8_t* HWY_RESTRICT gradient, + int width, + int height, + uint8_t threshold) { const hn::ScalableTag d8; - const hn::Half d8_h; + const hn::Half d8_h; const hn::Rebind d16; // d32 has half the lanes of d16 const hn::Rebind> d32; const size_t N = hn::Lanes(d8); - const auto vDivMult = 
hn::Set(d16, (int16_t)7282); + const auto vDivMult = hn::Set(d16, (int16_t)7282); const auto vThreshSq = hn::Set(d32, (int32_t)threshold * threshold); const auto v255_16 = hn::Set(d16, (int16_t)255); const auto v255_8 = hn::Set(d8, (uint8_t)255); @@ -27,44 +30,75 @@ void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT grad uint8_t* out = gradient + y * width + 1; for (int x = 0; x < width; x += N) { - auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2); - auto v21 = hn::LoadU(d8, r1 + x); auto v23 = hn::LoadU(d8, r1 + x + 2); - auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2); + auto v11 = hn::LoadU(d8, r0 + x); + auto v12 = hn::LoadU(d8, r0 + x + 1); + auto v13 = hn::LoadU(d8, r0 + x + 2); + auto v21 = hn::LoadU(d8, r1 + x); + auto v23 = hn::LoadU(d8, r1 + x + 2); + auto v31 = hn::LoadU(d8, r2 + x); + auto v32 = hn::LoadU(d8, r2 + x + 1); + auto v33 = hn::LoadU(d8, r2 + x + 2); // Helper to process 8 pixels into a 16-bit mask-like result - auto process_half = [&](auto p11, auto p12, auto p13, auto p21, auto p23, auto p31, auto p32, auto p33) { + auto process_half = [&](auto p11, + auto p12, + auto p13, + auto p21, + auto p23, + auto p31, + auto p32, + auto p33) { // Sobel derivatives in 16-bit - auto sx16 = hn::MulHigh(hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), - hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))), vDivMult); - auto sy16 = hn::MulHigh(hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), - hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))), vDivMult); + auto sx16 = hn::MulHigh( + hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), + hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))), + vDivMult); + auto sy16 = hn::MulHigh( + hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), + hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))), + vDivMult); // Magnitude squared in 32-bit auto sx_lo = hn::PromoteLowerTo(d32, 
sx16); auto sy_lo = hn::PromoteLowerTo(d32, sy16); - auto mag_lo = hn::Add(hn::Mul(sx_lo, sx_lo), hn::Mul(sy_lo, sy_lo)); + auto mag_lo = + hn::Add(hn::Mul(sx_lo, sx_lo), hn::Mul(sy_lo, sy_lo)); auto sx_hi = hn::PromoteUpperTo(d32, sx16); auto sy_hi = hn::PromoteUpperTo(d32, sy16); - auto mag_hi = hn::Add(hn::Mul(sx_hi, sx_hi), hn::Mul(sy_hi, sy_hi)); + auto mag_hi = + hn::Add(hn::Mul(sx_hi, sx_hi), hn::Mul(sy_hi, sy_hi)); - // Comparison in 32-bit, returning 16-bit values (0 or 255) to avoid mask issues - auto m_lo = hn::IfThenElse(hn::Gt(mag_lo, vThreshSq), hn::Set(d32, 255), hn::Zero(d32)); - auto m_hi = hn::IfThenElse(hn::Gt(mag_hi, vThreshSq), hn::Set(d32, 255), hn::Zero(d32)); + // Comparison in 32-bit, returning 16-bit values (0 or 255) to + // avoid mask issues + auto m_lo = hn::IfThenElse(hn::Gt(mag_lo, vThreshSq), + hn::Set(d32, 255), + hn::Zero(d32)); + auto m_hi = hn::IfThenElse(hn::Gt(mag_hi, vThreshSq), + hn::Set(d32, 255), + hn::Zero(d32)); return hn::OrderedDemote2To(d16, m_lo, m_hi); }; // Process halves using standard Highway promotion - auto res_lo = process_half( - hn::PromoteLowerTo(d16, v11), hn::PromoteLowerTo(d16, v12), hn::PromoteLowerTo(d16, v13), - hn::PromoteLowerTo(d16, v21), hn::PromoteLowerTo(d16, v23), - hn::PromoteLowerTo(d16, v31), hn::PromoteLowerTo(d16, v32), hn::PromoteLowerTo(d16, v33)); + auto res_lo = process_half(hn::PromoteLowerTo(d16, v11), + hn::PromoteLowerTo(d16, v12), + hn::PromoteLowerTo(d16, v13), + hn::PromoteLowerTo(d16, v21), + hn::PromoteLowerTo(d16, v23), + hn::PromoteLowerTo(d16, v31), + hn::PromoteLowerTo(d16, v32), + hn::PromoteLowerTo(d16, v33)); - auto res_hi = process_half( - hn::PromoteUpperTo(d16, v11), hn::PromoteUpperTo(d16, v12), hn::PromoteUpperTo(d16, v13), - hn::PromoteUpperTo(d16, v21), hn::PromoteUpperTo(d16, v23), - hn::PromoteUpperTo(d16, v31), hn::PromoteUpperTo(d16, v32), hn::PromoteUpperTo(d16, v33)); + auto res_hi = process_half(hn::PromoteUpperTo(d16, v11), + hn::PromoteUpperTo(d16, v12), 
+ hn::PromoteUpperTo(d16, v13), + hn::PromoteUpperTo(d16, v21), + hn::PromoteUpperTo(d16, v23), + hn::PromoteUpperTo(d16, v31), + hn::PromoteUpperTo(d16, v32), + hn::PromoteUpperTo(d16, v33)); // Final store: 16-bit to 8-bit demotion auto final_val = hn::OrderedDemote2To(d8, res_lo, res_hi); @@ -72,10 +106,14 @@ void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT grad } } } -void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, - int width, int height, uint8_t threshold) { +void SobelKernel(const uint8_t* HWY_RESTRICT in, + uint8_t* HWY_RESTRICT gradient, + int width, + int height, + uint8_t threshold) { // We target 4 pixels at a time as our base 'Scalable' unit. - // This allows easy promotion from 8 -> 16 -> 32 bit while keeping lane counts identical. + // This allows easy promotion from 8 -> 16 -> 32 bit while keeping lane + // counts identical. const hn::FixedTag d8; const hn::FixedTag d16; const hn::FixedTag d32; @@ -97,18 +135,28 @@ void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, return hn::PromoteTo(d32, hn::PromoteTo(d16, hn::LoadU(d8, p))); }; - auto p11 = load32(r0 + x); auto p12 = load32(r0 + x + 1); auto p13 = load32(r0 + x + 2); - auto p21 = load32(r1 + x); auto p23 = load32(r1 + x + 2); - auto p31 = load32(r2 + x); auto p32 = load32(r2 + x + 1); auto p33 = load32(r2 + x + 2); + auto p11 = load32(r0 + x); + auto p12 = load32(r0 + x + 1); + auto p13 = load32(r0 + x + 2); + auto p21 = load32(r1 + x); + auto p23 = load32(r1 + x + 2); + auto p31 = load32(r2 + x); + auto p32 = load32(r2 + x + 1); + auto p33 = load32(r2 + x + 2); + + // Note:: Division is very slow - we use it for now to match exactly + // with the naive non simd-implementation sx = (*p11 + *p31 + 2 * + // *p21 - *p13 - 2 * *p23 - *p33) / 9; + auto sx = + hn::Div(hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), + hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))), + vDiv); - // Note:: Division is very slow 
- we use it for now to match exactly with the naive non simd-implementation - // sx = (*p11 + *p31 + 2 * *p21 - *p13 - 2 * *p23 - *p33) / 9; - auto sx = hn::Div(hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), - hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))), vDiv); - // sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9; - auto sy = hn::Div(hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), - hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))), vDiv); + auto sy = + hn::Div(hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), + hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))), + vDiv); // int val = sx * sx + sy * sy; auto magSq = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy)); @@ -116,21 +164,24 @@ void SobelKernel(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, // *optr = val > thresholdSq ? 255 : 0; auto mask = hn::Gt(magSq, vThreshSq); auto res32 = hn::IfThenElse(mask, v255, v0); - + // Demote 32 -> 16 -> 8 auto res8 = hn::DemoteTo(d8, hn::DemoteTo(d16, res32)); hn::StoreU(res8, d8, out + x); } } } -void SobelKerneli(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient, - int width, int height, uint8_t threshold) { +void SobelKerneli(const uint8_t* HWY_RESTRICT in, + uint8_t* HWY_RESTRICT gradient, + int width, + int height, + uint8_t threshold) { const hn::ScalableTag d8; - const hn::Half d8_h; - const hn::Rebind d16; + const hn::Half d8_h; + const hn::Rebind d16; const size_t N = hn::Lanes(d8); - const auto divisor = hn::Set(d16, (int16_t)7282); + const auto divisor = hn::Set(d16, (int16_t)7282); const auto threshSq = hn::Set(d16, (int16_t)(threshold * threshold)); const auto v255 = hn::Set(d16, 255); const auto v0 = hn::Zero(d16); @@ -142,48 +193,72 @@ void SobelKerneli(const uint8_t* HWY_RESTRICT in, uint8_t* HWY_RESTRICT gradient uint8_t* out = gradient + y * width + 1; for (int x = 0; x < width; x += N) { - auto v11 = hn::LoadU(d8, r0 + x); auto v12 = hn::LoadU(d8, r0 + x + 1); auto v13 = hn::LoadU(d8, r0 + x + 2); - auto 
v21 = hn::LoadU(d8, r1 + x); auto v23 = hn::LoadU(d8, r1 + x + 2); - auto v31 = hn::LoadU(d8, r2 + x); auto v32 = hn::LoadU(d8, r2 + x + 1); auto v33 = hn::LoadU(d8, r2 + x + 2); + auto v11 = hn::LoadU(d8, r0 + x); + auto v12 = hn::LoadU(d8, r0 + x + 1); + auto v13 = hn::LoadU(d8, r0 + x + 2); + auto v21 = hn::LoadU(d8, r1 + x); + auto v23 = hn::LoadU(d8, r1 + x + 2); + auto v31 = hn::LoadU(d8, r2 + x); + auto v32 = hn::LoadU(d8, r2 + x + 1); + auto v33 = hn::LoadU(d8, r2 + x + 2); - auto process = [&](auto p11, auto p12, auto p13, auto p21, auto p23, auto p31, auto p32, auto p33) { - auto sx = hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), - hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))); + auto process = [&](auto p11, + auto p12, + auto p13, + auto p21, + auto p23, + auto p31, + auto p32, + auto p33) { + auto sx = + hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), + hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))); sx = hn::MulHigh(sx, divisor); - auto sy = hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), - hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))); + auto sy = + hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), + hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))); sy = hn::MulHigh(sy, divisor); auto mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy)); return hn::IfThenElse(hn::Gt(mag, threshSq), v255, v0); }; // Process Lower Half - auto res_lo = process( - hn::PromoteTo(d16, hn::LowerHalf(v11)), hn::PromoteTo(d16, hn::LowerHalf(v12)), hn::PromoteTo(d16, hn::LowerHalf(v13)), - hn::PromoteTo(d16, hn::LowerHalf(v21)), hn::PromoteTo(d16, hn::LowerHalf(v23)), - hn::PromoteTo(d16, hn::LowerHalf(v31)), hn::PromoteTo(d16, hn::LowerHalf(v32)), hn::PromoteTo(d16, hn::LowerHalf(v33))); + auto res_lo = process(hn::PromoteTo(d16, hn::LowerHalf(v11)), + hn::PromoteTo(d16, hn::LowerHalf(v12)), + hn::PromoteTo(d16, hn::LowerHalf(v13)), + hn::PromoteTo(d16, hn::LowerHalf(v21)), + hn::PromoteTo(d16, hn::LowerHalf(v23)), + hn::PromoteTo(d16, 
hn::LowerHalf(v31)), + hn::PromoteTo(d16, hn::LowerHalf(v32)), + hn::PromoteTo(d16, hn::LowerHalf(v33))); - // Process Upper Half - auto res_hi = process( - hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v13)), - hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v23)), - hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)), hn::PromoteTo(d16, hn::UpperHalf(d8_h, v33))); + // Process Upper Half + auto res_hi = process(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v13)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v23)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)), + hn::PromoteTo(d16, hn::UpperHalf(d8_h, v33))); hn::StoreU(hn::OrderedDemote2To(d8, res_lo, res_hi), d8, out + x); } } } -} // namespace HWY_NAMESPACE -} // namespace ndb +} // namespace HWY_NAMESPACE +} // namespace ndb HWY_AFTER_NAMESPACE(); namespace ndb { namespace testing { -//#if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON - void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) { - //ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold); - HWY_STATIC_DISPATCH(SobelKernelNoDiv)(in, blurred, width, height, threshold); - } -//#endif -} +// #if defined(HWY_TARGET) && HWY_TARGET == HWY_NEON +void sobel_hwy( + uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold) { + // ndb::N_NEON::SobelKernel(in, blurred, width, height, threshold); + HWY_STATIC_DISPATCH(SobelKernelNoDiv)( + in, blurred, width, height, threshold); } +// #endif +} // namespace testing +} // namespace ndb diff --git a/lib/gpc/kernels/sobel_hwy.hpp b/lib/gpc/kernels/sobel_hwy.hpp index bc99199..7c5d4ce 100644 --- 
a/lib/gpc/kernels/sobel_hwy.hpp +++ b/lib/gpc/kernels/sobel_hwy.hpp @@ -1,4 +1,4 @@ -#ifndef __NDB__KERNEL_SOBEL_HWY +#ifndef __NDB__KERNEL_SOBEL_HWY #define __NDB__KERNEL_SOBEL_HWY #include @@ -6,11 +6,12 @@ namespace ndb { namespace testing { - /** - * Entry point for benchmarking the MulHigh (approximate) version. - */ - void sobel_hwy(uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold); -} +/** + * Entry point for benchmarking the MulHigh (approximate) version. + */ +void sobel_hwy( + uint8_t* in, uint8_t* blurred, int width, int height, uint8_t threshold); +} // namespace testing } // namespace ndb diff --git a/lib/gpc/kernels/utils.cpp b/lib/gpc/kernels/utils.cpp index ce920e8..1b1ab4b 100644 --- a/lib/gpc/kernels/utils.cpp +++ b/lib/gpc/kernels/utils.cpp @@ -28,18 +28,16 @@ // POSSIBILITY OF SUCH DAMAGE. // // Code Author: Niklaus Bamert (bamertn@ethz.ch) +#include "gpc/kernels/utils.hpp" + #include -#include #include -#include "gpc/kernels/utils.hpp" +#include using namespace std; namespace ndb { -void arr2ind(const unsigned char* a, - int n, - int* ind, - int* m) { +void arr2ind(const unsigned char* a, int n, int* ind, int* m) { #if HWY_TARGET == HWY_AVX2 int i, m0, k; __m256i msk; @@ -104,8 +102,4 @@ void parFor(std::function const& f, for (auto& t : threads) t.join(); } - - - - } // namespace ndb diff --git a/lib/gpc/kernels/utils.hpp b/lib/gpc/kernels/utils.hpp index 18227ba..3985c1e 100644 --- a/lib/gpc/kernels/utils.hpp +++ b/lib/gpc/kernels/utils.hpp @@ -31,9 +31,10 @@ #ifndef __NDB__KERNEL_UTILS #define __NDB__KERNEL_UTILS +#include + #include #include -#include #include "gpc/buffer.hpp" using namespace std; @@ -59,10 +60,7 @@ namespace ndb { * @param ind output array (indices into n of nonzero elements) * @param m number of elements in output */ -void arr2ind(const unsigned char* a, - int n, - int* ind, - int* m); +void arr2ind(const unsigned char* a, int n, int* ind, int* m); #if HWY_TARGET == HWY_AVX2 /** diff --git 
a/samples/sparsematch.cpp b/samples/sparsematch.cpp index 57864e8..b25a96f 100644 --- a/samples/sparsematch.cpp +++ b/samples/sparsematch.cpp @@ -1,21 +1,20 @@ -#include #include +#include + #include "gpc/forest.hpp" using namespace std; -std::vector gpcFilterDense(uint8_t* in, - const std::vector& fastmask, - int width, - int height) { +std::vector gpcFilterDense( + uint8_t* in, const std::vector& fastmask, int width, int height) { uint32_t tmp; uint32_t usableW = width - 26; uint32_t usableH = height - 26; std::vector out(usableW * usableH); int j = 0; - for (int y=13;y *(in + idx + fastmask[i + 1])) @@ -56,7 +55,8 @@ int main(int argc, char** argv) { gpc::inference::InferenceSettings inferencesettings = gpc::inference::InferenceSettings() .builder() - .gradientThreshold(1) // gradientthres 20: matching ~3ms, 2: matching: ~30ms. + .gradientThreshold( + 1) // gradientthres 20: matching ~3ms, 2: matching: ~30ms. .verticalTolerance( 0) // 0px tolerance for rectified epipolar matches .dispHigh(128) // limit disparities to 128 @@ -73,7 +73,6 @@ int main(int argc, char** argv) { gpc::inference::FilterMask fm = forest.readForest(forestPath, simg.cols(), simg.rows()); - gpc::inference::time_point t0 = gpc::inference::sysTick(); gpc::inference::PreprocessedImage simgP = @@ -86,10 +85,13 @@ int main(int argc, char** argv) { std::vector supp = forest.rectifiedMatch(simgP, timgP, fm, inferencesettings); gpc::inference::time_point t2 = gpc::inference::sysTick(); - std::cout << "Number of features(s,t): " << simgP.mask.size() << "," << timgP.mask.size() << std::endl; + std::cout << "Number of features(s,t): " << simgP.mask.size() << "," + << timgP.mask.size() << std::endl; std::cout << "Number of matches: " << supp.size() << std::endl; - std::cout << "Preprocessing time: " << gpc::inference::tickToMs(t1, t0) << " ms" << std::endl; - std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms" << std::endl; + std::cout << "Preprocessing time: " << 
gpc::inference::tickToMs(t1, t0) + << " ms" << std::endl; + std::cout << "Matching time: " << gpc::inference::tickToMs(t2, t1) << " ms" + << std::endl; /* std::vector statesSrc = forest.evalFastMaskOnSubsetSSE( simgP.smooth, simgP.grad, simgP.mask, fm, inferencesettings); @@ -97,10 +99,11 @@ int main(int argc, char** argv) { timgP.smooth, timgP.grad, timgP.mask, fm, inferencesettings); */ - std::vector statesSrc = gpcFilterDense(simgP.smooth.data(), fm.mask, simgP.smooth.cols(), simgP.smooth.rows()); - std::vector statesTar = gpcFilterDense(timgP.smooth.data(), fm.mask, timgP.smooth.cols(), timgP.smooth.rows()); + std::vector statesSrc = gpcFilterDense( + simgP.smooth.data(), fm.mask, simgP.smooth.cols(), simgP.smooth.rows()); + std::vector statesTar = gpcFilterDense( + timgP.smooth.data(), fm.mask, timgP.smooth.cols(), timgP.smooth.rows()); ndb::Descriptor::serialize("statesSrcLargeS.txt", statesSrc); ndb::Descriptor::serialize("statesTarLargeS.txt", statesTar); - } diff --git a/samples/target.cpp b/samples/target.cpp index 6c03ab7..f457f9d 100644 --- a/samples/target.cpp +++ b/samples/target.cpp @@ -1,4 +1,5 @@ #include + #include int main() { std::cout << "Compiled for: " << hwy::TargetName(HWY_TARGET) << std::endl; From 432cc9238937b6afa354e33c22b6045508d413c5 Mon Sep 17 00:00:00 2001 From: Nik Bamert Date: Tue, 7 Apr 2026 09:17:14 +0200 Subject: [PATCH 36/36] update approval test blob due to div /9 approximation in sobel filter --- CMakeLists.txt | 2 - benchmarks/box_bench.cpp | 4 - benchmarks/correspondence_bench.cpp | 1 - lib/gpc/kernels/gpc_hwy.cpp | 14 +- lib/gpc/kernels/sobel_hwy.cpp | 139 ------------------ ...e_matching.Approval.Inference.approved.txt | 2 +- tests/test_single_matching.cpp | 2 +- 7 files changed, 5 insertions(+), 159 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 72f760c..6d3a39e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,8 +48,6 @@ set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "" FORCE) # Force the library 
itself to build in Release mode set(CMAKE_BUILD_TYPE Release CACHE STRING "" FORCE) -#add_definitions(-DNDEBUG) - FetchContent_MakeAvailable(google_benchmark) FetchContent_MakeAvailable(google_benchmark) add_library(gpc_core diff --git a/benchmarks/box_bench.cpp b/benchmarks/box_bench.cpp index 7b7ff5c..f0aeeb8 100644 --- a/benchmarks/box_bench.cpp +++ b/benchmarks/box_bench.cpp @@ -7,11 +7,9 @@ static void BM_BoxHighway(benchmark::State& state) { std::vector in(w * h, 128); std::vector out(w * h, 0); state.SetLabel(hwy::TargetName(HWY_TARGET)); - // Warmup is handled automatically by the library for (auto _ : state) { ndb::testing::box_hwy(in.data(), out.data(), w, h); - // Ensure the compiler doesn't skip the work benchmark::DoNotOptimize(out.data()); benchmark::ClobberMemory(); } @@ -27,7 +25,6 @@ static void BM_BoxLegacySIMD(benchmark::State& state) { for (auto _ : state) { ndb::boxSSE(in.data(), out.data(), w, h); - // Ensure the compiler doesn't skip the work benchmark::DoNotOptimize(out.data()); benchmark::ClobberMemory(); } @@ -42,7 +39,6 @@ static void BM_BoxNaive(benchmark::State& state) { for (auto _ : state) { ndb::boxNaive(in.data(), out.data(), w, h); - // Ensure the compiler doesn't skip the work benchmark::DoNotOptimize(out.data()); benchmark::ClobberMemory(); } diff --git a/benchmarks/correspondence_bench.cpp b/benchmarks/correspondence_bench.cpp index 737d4dc..b847fb1 100644 --- a/benchmarks/correspondence_bench.cpp +++ b/benchmarks/correspondence_bench.cpp @@ -24,7 +24,6 @@ std::vector generate_pareto_ids(size_t count, double target_mea // 1e-9 epsilon prevents division by zero/infinity std::uniform_real_distribution dist(1e-9, 1.0); - // Alpha = 1.16 provides a classic "80/20" Pareto distribution const double alpha = 1.16; const double xm = target_mean * (alpha - 1.0) / alpha; diff --git a/lib/gpc/kernels/gpc_hwy.cpp b/lib/gpc/kernels/gpc_hwy.cpp index 87b0f20..d84710a 100644 --- a/lib/gpc/kernels/gpc_hwy.cpp +++ b/lib/gpc/kernels/gpc_hwy.cpp @@ 
-34,8 +34,6 @@ void GPCKernel(const uint8_t* HWY_RESTRICT in, for (int x = border; x <= width - border - (int)N; x += N) { const int k = row_base + x; - // We use four 8-bit registers to build the 32 bits. - // This keeps the entire hot-loop in 8-bit space. auto v_acc0 = hn::Zero(d8); // Bits 0-7 auto v_acc1 = hn::Zero(d8); // Bits 8-15 auto v_acc2 = hn::Zero(d8); // Bits 16-23 @@ -73,12 +71,7 @@ void GPCKernel(const uint8_t* HWY_RESTRICT in, v_acc3 = hn::Or(v_acc3, hn::IfThenElse(mask, v_one8, v_zero8)); } - // Final Assembly: Promote the four 8-bit chunks into 32-bit - // results. We use PromoteUpper/Lower to widen the data. N is the - // number of 8-bit lanes. We need to store N/4 results in d32. - - // To be perfectly safe across all Highway targets, we extract and - // combine: + //extract and combine: for (size_t lane = 0; lane < N; ++lane) { uint32_t final_val = (uint32_t(hn::ExtractLane(v_acc0, lane)) << 24) | @@ -110,10 +103,10 @@ void GPCKerneli(const uint8_t* HWY_RESTRICT in, for (int x = 0; x < width; x += N) { const uint8_t* centerGrad = grad + y * width + x; - // 1. Load the gradient bytes for the current N lanes + // Load the gradient bytes for the current N lanes auto v_grad = hn::LoadU(d8_n, centerGrad); - // 2. Promotion-free zero check + // Promotion-free zero check if (hn::AllTrue(d8_n, hn::Eq(v_grad, hn::Zero(d8_n)))) { continue; } @@ -123,7 +116,6 @@ void GPCKerneli(const uint8_t* HWY_RESTRICT in, for (size_t i = 0; i < fastmask.size(); i += 2) { v_tmp = hn::ShiftLeft<1>(v_tmp); - // 3. 
The "Promotion" that actually works on all platforms: // Promote N lanes of uint8 to N lanes of uint32 auto v1 = hn::PromoteTo( d32, hn::LoadU(d8_n, in + y * width + x + fastmask[i])); diff --git a/lib/gpc/kernels/sobel_hwy.cpp b/lib/gpc/kernels/sobel_hwy.cpp index 14ad593..493b52d 100644 --- a/lib/gpc/kernels/sobel_hwy.cpp +++ b/lib/gpc/kernels/sobel_hwy.cpp @@ -106,146 +106,7 @@ void SobelKernelNoDiv(const uint8_t* HWY_RESTRICT in, } } } -void SobelKernel(const uint8_t* HWY_RESTRICT in, - uint8_t* HWY_RESTRICT gradient, - int width, - int height, - uint8_t threshold) { - // We target 4 pixels at a time as our base 'Scalable' unit. - // This allows easy promotion from 8 -> 16 -> 32 bit while keeping lane - // counts identical. - const hn::FixedTag d8; - const hn::FixedTag d16; - const hn::FixedTag d32; - const auto vDiv = hn::Set(d32, 9); - const auto vThreshSq = hn::Set(d32, (int32_t)threshold * threshold); - const auto v255 = hn::Set(d32, 255); - const auto v0 = hn::Zero(d32); - - for (int y = 1; y < height - 1; ++y) { - const uint8_t* r0 = in + (y - 1) * width; - const uint8_t* r1 = in + y * width; - const uint8_t* r2 = in + (y + 1) * width; - uint8_t* out = gradient + y * width + 1; - - for (int x = 0; x < width; x += 4) { - // Load and promote immediately to 32-bit to match naive 'int' math - auto load32 = [&](const uint8_t* p) { - return hn::PromoteTo(d32, hn::PromoteTo(d16, hn::LoadU(d8, p))); - }; - - auto p11 = load32(r0 + x); - auto p12 = load32(r0 + x + 1); - auto p13 = load32(r0 + x + 2); - auto p21 = load32(r1 + x); - auto p23 = load32(r1 + x + 2); - auto p31 = load32(r2 + x); - auto p32 = load32(r2 + x + 1); - auto p33 = load32(r2 + x + 2); - - // Note:: Division is very slow - we use it for now to match exactly - // with the naive non simd-implementation sx = (*p11 + *p31 + 2 * - // *p21 - *p13 - 2 * *p23 - *p33) / 9; - auto sx = - hn::Div(hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), - hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))), - 
vDiv); - - // sy = (*p11 + *p13 + 2 * *p12 - *p31 - 2 * *p32 - *p33) / 9; - auto sy = - hn::Div(hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), - hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))), - vDiv); - - // int val = sx * sx + sy * sy; - auto magSq = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy)); - - // *optr = val > thresholdSq ? 255 : 0; - auto mask = hn::Gt(magSq, vThreshSq); - auto res32 = hn::IfThenElse(mask, v255, v0); - - // Demote 32 -> 16 -> 8 - auto res8 = hn::DemoteTo(d8, hn::DemoteTo(d16, res32)); - hn::StoreU(res8, d8, out + x); - } - } -} -void SobelKerneli(const uint8_t* HWY_RESTRICT in, - uint8_t* HWY_RESTRICT gradient, - int width, - int height, - uint8_t threshold) { - const hn::ScalableTag d8; - const hn::Half d8_h; - const hn::Rebind d16; - - const size_t N = hn::Lanes(d8); - const auto divisor = hn::Set(d16, (int16_t)7282); - const auto threshSq = hn::Set(d16, (int16_t)(threshold * threshold)); - const auto v255 = hn::Set(d16, 255); - const auto v0 = hn::Zero(d16); - - for (int y = 1; y < height - 1; ++y) { - const uint8_t* r0 = in + (y - 1) * width; - const uint8_t* r1 = in + y * width; - const uint8_t* r2 = in + (y + 1) * width; - uint8_t* out = gradient + y * width + 1; - - for (int x = 0; x < width; x += N) { - auto v11 = hn::LoadU(d8, r0 + x); - auto v12 = hn::LoadU(d8, r0 + x + 1); - auto v13 = hn::LoadU(d8, r0 + x + 2); - auto v21 = hn::LoadU(d8, r1 + x); - auto v23 = hn::LoadU(d8, r1 + x + 2); - auto v31 = hn::LoadU(d8, r2 + x); - auto v32 = hn::LoadU(d8, r2 + x + 1); - auto v33 = hn::LoadU(d8, r2 + x + 2); - - auto process = [&](auto p11, - auto p12, - auto p13, - auto p21, - auto p23, - auto p31, - auto p32, - auto p33) { - auto sx = - hn::Sub(hn::Add(hn::Add(p11, p31), hn::Add(p21, p21)), - hn::Add(hn::Add(p13, p33), hn::Add(p23, p23))); - sx = hn::MulHigh(sx, divisor); - auto sy = - hn::Sub(hn::Add(hn::Add(p11, p13), hn::Add(p12, p12)), - hn::Add(hn::Add(p31, p33), hn::Add(p32, p32))); - sy = hn::MulHigh(sy, divisor); - auto 
mag = hn::Add(hn::Mul(sx, sx), hn::Mul(sy, sy)); - return hn::IfThenElse(hn::Gt(mag, threshSq), v255, v0); - }; - - // Process Lower Half - auto res_lo = process(hn::PromoteTo(d16, hn::LowerHalf(v11)), - hn::PromoteTo(d16, hn::LowerHalf(v12)), - hn::PromoteTo(d16, hn::LowerHalf(v13)), - hn::PromoteTo(d16, hn::LowerHalf(v21)), - hn::PromoteTo(d16, hn::LowerHalf(v23)), - hn::PromoteTo(d16, hn::LowerHalf(v31)), - hn::PromoteTo(d16, hn::LowerHalf(v32)), - hn::PromoteTo(d16, hn::LowerHalf(v33))); - - // Process Upper Half - auto res_hi = process(hn::PromoteTo(d16, hn::UpperHalf(d8_h, v11)), - hn::PromoteTo(d16, hn::UpperHalf(d8_h, v12)), - hn::PromoteTo(d16, hn::UpperHalf(d8_h, v13)), - hn::PromoteTo(d16, hn::UpperHalf(d8_h, v21)), - hn::PromoteTo(d16, hn::UpperHalf(d8_h, v23)), - hn::PromoteTo(d16, hn::UpperHalf(d8_h, v31)), - hn::PromoteTo(d16, hn::UpperHalf(d8_h, v32)), - hn::PromoteTo(d16, hn::UpperHalf(d8_h, v33))); - - hn::StoreU(hn::OrderedDemote2To(d8, res_lo, res_hi), d8, out + x); - } - } -} } // namespace HWY_NAMESPACE } // namespace ndb HWY_AFTER_NAMESPACE(); diff --git a/tests/test_single_matching.Approval.Inference.approved.txt b/tests/test_single_matching.Approval.Inference.approved.txt index 294072e..d481bd2 100644 --- a/tests/test_single_matching.Approval.Inference.approved.txt +++ b/tests/test_single_matching.Approval.Inference.approved.txt @@ -1 +1 @@ -[(13, 569, 0), (13, 570, 0), (13, 658, -1), (13, 659, -1), (13, 660, -1), (13, 671, -1), (13, 690, -2), (14, 562, 1), (14, 571, 1), (14, 572, 1), (14, 573, 1), (14, 609, 0), (14, 792, -5), (14, 793, -5), (14, 794, -5), (14, 857, -5), (15, 566, 2), (15, 828, -5), (15, 868, -6), (15, 1006, -10), (16, 850, -5), (143, 76, 103), (215, 753, 102), (236, 336, 128), (236, 337, 128), (237, 340, 128), (237, 341, 128), (239, 347, 128), (239, 351, 128), (239, 352, 128), (239, 353, 128), (239, 356, 128), (240, 263, 67), (240, 351, 128), (240, 352, 128), (240, 354, 128), (240, 357, 128), (240, 359, 128), (240, 362, 
128), (241, 264, 68), (241, 362, 128), (241, 364, 128), (242, 364, 128), (242, 367, 128), (243, 267, 68), (243, 370, 128), (243, 371, 128), (243, 372, 127), (243, 373, 127), (243, 374, 127), (243, 375, 128), (243, 377, 128), (243, 421, 124), (244, 268, 69), (244, 377, 127), (244, 378, 127), (244, 380, 127), (244, 420, 124), (244, 421, 124), (244, 442, 127), (245, 270, 69), (245, 385, 127), (245, 386, 127), (245, 425, 124), (245, 439, 127), (246, 387, 128), (246, 388, 127), (246, 389, 127), (247, 442, 128), (247, 448, 128), (248, 374, 128), (248, 446, 128), (249, 276, 69), (250, 277, 69), (250, 412, 127), (250, 413, 126), (250, 456, 127), (252, 426, 127), (252, 932, 115), (254, 425, 126), (256, 287, 68), (257, 289, 68), (258, 290, 68), (259, 292, 69), (259, 421, 126), (262, 419, 127), (264, 418, 126), (266, 418, 6), (266, 550, 121), (267, 422, 126), (269, 286, 82), (270, 938, 119), (272, 582, 120), (272, 583, 120), (273, 843, 113), (275, 934, 124), (277, 560, 127), (278, 617, 119), (279, 570, 96), (279, 617, 119), (279, 620, 119), (279, 621, 119), (282, 623, -1), (283, 644, 118), (284, 637, 92), (285, 635, -2), (285, 642, 119), (287, 401, 40), (290, 557, -38), (291, 634, 125), (292, 643, 126), (292, 674, 118), (296, 570, -40), (296, 665, 5), (296, 715, 116), (296, 716, 116), (297, 674, 119), (298, 672, 120), (298, 673, 120), (299, 682, 119), (299, 723, 117), (300, 687, 119), (300, 688, 119), (300, 690, 118), (301, 580, -42), (301, 693, 119), (301, 698, 118), (301, 744, 115), (304, 587, -43), (304, 953, 125), (310, 600, -46), (310, 986, 128), (311, 506, 62), (312, 205, 127), (320, 840, 112), (321, 836, 112), (322, 237, 72), (322, 841, 111), (324, 843, 112), (324, 844, 112), (325, 838, 113), (325, 844, 112), (325, 845, 112), (326, 842, 113), (328, 228, 127), (329, 228, 127), (332, 845, 113), (333, 844, 114), (336, 719, 42), (338, 716, 38), (339, 235, 125), (339, 381, 99), (339, 718, 39), (344, 232, 52), (346, 455, -117), (346, 456, -116), (346, 554, -121), (354, 834, 
125), (355, 831, 126), (359, 223, 124), (368, 218, 126), (370, 392, -97), (377, 58, 124), (385, 315, 123), (387, 316, 124), (387, 384, 94), (388, 313, 119), (391, 312, 122), (395, 299, 122), (396, 54, 109), (396, 475, -24), (400, 918, -107), (404, 180, 123), (407, 453, -109), (408, 286, 119), (409, 208, 123), (412, 504, -44), (414, 340, 121), (415, 220, 123), (415, 221, 123), (417, 561, 87), (418, 218, 123), (420, 1035, -94), (425, 338, 118), (433, 53, 124), (434, 379, 117), (436, 379, 118), (437, 52, 128), (437, 205, -47), (437, 379, 118), (439, 378, 109), (440, 317, 120), (443, 285, 120), (443, 394, 125), (446, 414, 126), (448, 202, 121), (452, 330, 119), (452, 719, 64), (454, 199, 122), (455, 52, 126), (455, 723, 65), (456, 377, 118), (457, 199, 122), (463, 325, 120), (465, 330, 112), (470, 192, 121), (472, 330, 120), (473, 327, 118), (473, 372, 115), (477, 372, 112), (480, 272, 108), (480, 279, 119), (481, 170, 120), (481, 186, 120), (481, 221, 115), (482, 169, 120), (483, 185, 121), (486, 823, 63), (489, 181, 120), (492, 370, 115), (500, 312, 118), (504, 179, 119), (505, 172, 122), (508, 171, 120), (509, 171, 120), (510, 961, 126), (517, 169, 121), (517, 217, 118), (518, 401, 124), (519, 400, 122), (520, 400, 124), (523, 321, 115), (523, 407, 126), (525, 171, 118), (526, 272, 117), (527, 363, 127), (529, 315, 116), (532, 171, 119), (534, 360, 110), (539, 360, 112), (543, 297, 117), (552, 588, 27), (555, 262, 96), (555, 295, 117), (555, 391, 123), (556, 357, 112), (557, 291, 116), (558, 594, 27), (565, 161, 115), (570, 169, 116), (570, 170, 120), (572, 354, 111), (572, 398, 122), (573, 313, 114), (573, 611, 28), (574, 609, 29), (576, 394, 105), (577, 615, 29), (578, 352, 113), (579, 352, 110), (580, 697, -38), (581, 166, 112), (583, 354, 112), (585, 622, 31), (589, 310, 112), (591, 262, 113), (593, 352, 113), (595, 204, 115), (597, 68, 117), (597, 69, 117), (598, 64, 117), (598, 66, 117), (598, 67, 117), (598, 68, 117), (598, 69, 117), (598, 70, 117), (598, 71, 
117), (598, 72, 117), (598, 81, 117), (599, 73, 117), (599, 75, 117), (599, 86, 116), (599, 87, 116), (600, 83, 117), (600, 84, 117), (600, 85, 117), (600, 88, 117), (600, 94, 116), (600, 95, 116), (601, 92, 117), (601, 99, 116), (601, 100, 116), (601, 101, 116), (601, 102, 116), (602, 102, 116), (602, 106, 116), (602, 107, 116), (603, 151, 115), (603, 204, 114), (603, 205, 113), (604, 126, 115), (604, 146, 115), (604, 147, 115), (605, 146, 115), (605, 205, 113), (605, 348, 107), (606, 144, 115), (606, 146, 115), (606, 148, 115), (606, 150, 115), (606, 203, 114), (607, 150, 115), (608, 55, 118), (609, 204, 114), (610, 205, 114), (611, 205, 114), (611, 208, 114), (611, 209, 114), (611, 210, 114), (612, 150, 116), (612, 202, 114), (612, 205, 114), (612, 206, 114), (612, 208, 113), (612, 209, 113), (612, 345, 112), (613, 102, 117), (613, 202, 114), (613, 348, 111), (613, 652, -113), (614, 212, 114), (614, 305, 111), (614, 306, 111), (615, 103, 118), (615, 347, 111), (616, 154, 114), (616, 233, 113), (616, 234, 113), (616, 236, 113), (616, 250, 113), (616, 251, 113), (616, 303, 112), (616, 655, 34), (617, 132, 117), (617, 160, 116), (617, 239, 113), (617, 241, 113), (617, 242, 113), (617, 251, 113), (617, 347, 110), (618, 134, 117), (618, 137, 117), (618, 138, 117), (618, 250, 113), (619, 108, 118), (619, 109, 118), (619, 110, 118), (619, 129, 117), (619, 135, 117), (619, 136, 117), (619, 138, 117), (619, 147, 117), (619, 154, 116), (619, 253, 113), (620, 103, 118), (620, 110, 118), (620, 256, 113), (620, 258, 119), (620, 304, 112), (621, 111, 117), (621, 150, 116), (622, 129, 118), (622, 168, 117), (622, 308, 112), (622, 347, 111), (623, 129, 118), (623, 203, 115), (623, 303, 112), (623, 306, 111), (623, 309, 111), (623, 310, 111), (623, 346, 110), (623, 347, 111), (624, 130, 117), (624, 164, 116), (624, 205, 115), (624, 305, 112), (624, 308, 111), (624, 311, 112), (624, 388, 116), (625, 167, 117), (625, 169, 116), (625, 198, 116), (625, 205, 115), (625, 303, 111), 
(625, 316, 111), (625, 317, 111), (625, 318, 111), (625, 319, 111), (625, 339, 111), (625, 341, 111), (625, 342, 111), (625, 343, 111), (625, 389, 123), (626, 130, 117), (626, 138, 118), (626, 199, 115), (626, 200, 115), (626, 301, 112), (626, 326, 111), (626, 329, 111), (626, 330, 111), (626, 331, 111), (626, 332, 111), (626, 333, 111), (626, 339, 111), (626, 342, 111), (626, 343, 111), (627, 130, 117), (627, 141, 116), (627, 254, 115), (627, 337, 111), (627, 339, 111), (628, 50, 117), (628, 183, 116), (628, 184, 116), (628, 185, 116), (628, 254, 114), (628, 255, 114), (628, 302, 111), (628, 341, 111), (629, 97, 117), (629, 132, 118), (629, 188, 115), (629, 192, 116), (629, 193, 116), (629, 255, 114), (629, 341, 111), (630, 144, 117), (630, 193, 115), (630, 194, 115), (630, 302, 112), (630, 342, 111), (630, 344, 111), (630, 348, 110), (631, 134, 118), (631, 344, 111), (631, 348, 110), (632, 47, 118), (632, 50, 117), (632, 109, 119), (632, 196, 116), (632, 198, 116), (632, 202, 115), (632, 203, 115), (632, 205, 115), (632, 206, 115), (632, 342, 111), (633, 52, 118), (633, 53, 118), (633, 54, 118), (633, 206, 115), (633, 214, 114), (634, 53, 118), (634, 55, 118), (634, 64, 117), (635, 51, 118), (635, 76, 117), (635, 97, 117), (635, 112, 118), (635, 136, 116), (636, 61, 117), (636, 79, 117), (636, 97, 117), (636, 301, 113), (637, 75, 118), (637, 84, 115), (637, 305, 113), (637, 347, 112), (638, 347, 112), (638, 408, 126), (639, 83, 117), (639, 387, 117), (639, 408, 124), (640, 85, 117), (640, 89, 117), (640, 92, 117), (640, 277, 113), (640, 278, 113), (641, 85, 117), (641, 111, 119), (641, 347, 112), (642, 91, 114), (643, 144, 115), (643, 145, 115), (643, 343, 112), (643, 346, 111), (644, 87, 114), (644, 142, 115), (644, 342, 112), (645, 94, 115), (645, 112, 121), (645, 162, 115), (645, 343, 112), (646, 90, 115), (646, 143, 116), (646, 146, 116), (646, 148, 116), (646, 149, 116), (646, 166, 115), (646, 167, 115), (646, 347, 102), (647, 118, 121), (647, 151, 116), 
(647, 163, 116), (647, 164, 116), (647, 343, 111), (647, 364, 115), (648, 144, 116), (648, 364, 115), (648, 386, 124), (649, 92, 116), (649, 119, 121), (649, 170, 115), (650, 294, 113), (651, 93, 116), (652, 459, 84), (654, 318, 112), (655, 122, 121), (655, 318, 111), (655, 321, 112), (655, 322, 112), (655, 323, 112), (655, 334, 112), (655, 439, 87), (656, 228, 114), (656, 229, 114), (656, 233, 113), (656, 256, 113), (656, 321, 112), (656, 323, 112), (656, 332, 111), (656, 347, 111), (657, 125, 66), (657, 331, 111), (657, 347, 111), (658, 149, 118), (658, 244, 114), (659, 254, 113), (659, 255, 113), (659, 265, 113), (660, 261, 113), (661, 99, 116), (662, 126, 121), (662, 127, 121), (664, 152, 120), (665, 126, 123), (665, 311, 113), (666, 340, 111), (666, 341, 111), (666, 380, 117), (667, 119, 121), (667, 129, 121), (667, 331, 111), (667, 339, 112), (667, 340, 111), (667, 343, 112), (668, 328, 112), (668, 329, 112), (668, 330, 112), (668, 339, 112), (668, 343, 111), (668, 380, 118), (669, 342, 111), (669, 343, 111), (669, 344, 111), (670, 343, 111), (671, 343, 111), (672, 344, 111), (673, 103, 117), (673, 106, 118), (673, 120, 119), (674, 426, -43), (674, 432, 11), (675, 135, 122), (677, 136, 121), (677, 155, 122), (678, 156, 121), (679, 123, 120), (679, 352, 115), (680, 136, 122), (681, 157, 122), (682, 125, 119), (682, 347, 113), (682, 459, 11), (683, 459, 13), (684, 138, 123), (686, 346, 116), (688, 354, 114), (688, 379, 118), (689, 493, -77), (691, 356, 115), (691, 378, 116), (692, 455, -69), (693, 356, 114), (694, 164, 126), (695, 356, 114), (697, 119, 119), (698, 130, 118), (698, 390, 121), (698, 955, 88), (699, 970, 4), (699, 971, 4), (699, 972, 89), (700, 998, 109), (703, 122, 120), (705, 123, 120), (706, 354, 115), (707, 124, 120), (707, 354, 115), (708, 125, 120), (708, 149, 125), (709, 372, 116), (710, 126, 120), (711, 130, 120), (712, 127, 120), (712, 130, 120), (715, 128, 120), (716, 128, 120), (716, 174, 124), (718, 129, 120), (718, 175, 124), (719, 
134, 120), (720, 490, -69), (721, 131, 120), (723, 159, 125), (725, 133, 121), (726, 133, 121), (727, 158, 126), (727, 160, 126), (729, 135, 121), (730, 135, 121), (730, 159, 126), (730, 162, 126), (731, 136, 121), (734, 161, 126), (738, 1036, -31), (741, 139, 122), (742, 141, 122), (742, 164, 128), (742, 433, -93), (743, 638, -73), (744, 140, 121), (746, 166, 127), (747, 192, 126), (751, 171, 66), (755, 170, 127), (756, 173, 128), (757, 468, 122), (757, 479, 119), (760, 172, 128), (763, 174, 128), (764, 176, 65), (768, 369, 118), (772, 154, 125), (775, 443, -4), (777, 448, -3), (778, 457, 108), (778, 458, 108), (784, 185, 63), (793, 1033, -10), (794, 884, -29), (795, 1011, -44), (796, 400, -6), (796, 1025, 128), (797, 427, -8), (798, 165, 126), (798, 1024, -5), (800, 433, 45), (801, 880, -11), (802, 412, 119), (802, 771, 52), (802, 937, 94), (816, 224, 126), (816, 389, -93), (816, 393, -93), (816, 876, -77), (820, 934, -91), (823, 653, 95), (824, 423, -25), (824, 965, 66), (830, 936, 86), (833, 871, 62), (839, 769, -124), (845, 470, 83), (848, 185, 128), (850, 481, 47), (850, 864, -124), (851, 424, 2), (864, 762, -125), (865, 383, 6), (865, 761, -124), (868, 346, 53), (873, 476, 7), (884, 337, -6), (885, 371, -87), (885, 372, -87), (888, 586, 60), (889, 373, 45), (900, 759, -50), (902, 642, -3), (913, 389, 37), (916, 435, 38), (917, 842, -107), (919, 704, 58), (927, 561, -120), (940, 695, 54), (944, 840, -85), (957, 691, 52), (963, 269, 54), (963, 935, 116), (964, 952, -89), (965, 274, 54), (965, 275, 54), (965, 988, -94), (965, 989, -94), (966, 828, -90), (966, 983, -92), (966, 984, -92), (966, 985, -92), (966, 986, -92), (970, 667, -48), (972, 689, 56), (975, 397, 118), (990, 1000, -97), (997, 813, -85), (1007, 769, 53), (1010, 812, -61), (1018, 392, 38), (1026, 867, 91), (1057, 665, -59), (1085, 971, -84), (1087, 382, 112), (1119, 787, 40), (1128, 1008, -1), (1129, 973, 72), (1129, 974, 72), (1130, 989, -2), (1132, 642, 47), (1150, 639, 52), (1157, 635, 48), 
(1173, 631, 46), (1174, 461, 93), (1197, 804, -98), (1202, 624, -55), (1214, 500, -60), (1223, 618, 52), (1236, 494, -77), (1238, 929, 81), (1258, 916, 88), (1261, 606, 45), (1266, 684, 66), (1270, 603, 47), (1276, 685, 58), (1281, 998, 110), (1281, 999, 110), (1281, 1000, 112), (1288, 928, 120), (1288, 929, 120), (1296, 78, 107), (1297, 86, 107), (1297, 100, 109), (1297, 101, 109), (1297, 103, 109), (1297, 104, 109), (1298, 75, 108), (1298, 102, 109), (1298, 105, 108), (1298, 106, 108), (1299, 75, 108), (1300, 69, 109), (1301, 68, 108), (1302, 69, 109), (1302, 75, 109), (1303, 73, 108), (1304, 50, 107), (1304, 65, 108), (1304, 76, 108), (1307, 507, 77), (1322, 666, 65), (1325, 588, 47), (1341, 197, 123), (1341, 719, 39), (1341, 720, 39), (1354, 344, 97), (1355, 752, 38), (1355, 753, 38), (1356, 343, 98), (1358, 335, 96), (1360, 335, 97), (1360, 336, 97), (1364, 336, 98), (1367, 561, -53), (1371, 461, 125), (1372, 459, 126), (1380, 337, 99), (1383, 335, 111), (1403, 715, 83), (1408, 820, 110), (1412, 825, 114), (1417, 856, 116), (1418, 570, 107), (1426, 910, 20), (1427, 890, 120), (1428, 889, 120), (1437, 942, 125), (1437, 943, 125), (1443, 963, 124), (1453, 896, 123), (1455, 417, 101), (1456, 305, 99), (1456, 308, 99), (1456, 416, 102), (1457, 306, 100), (1457, 416, 102), (1457, 917, 119), (1458, 897, 125), (1461, 477, 115), (1464, 410, 98), (1467, 334, 103), (1469, 407, 98), (1469, 440, 104), (1469, 441, 103), (1469, 443, 103), (1514, 460, 104), (1519, 897, 124), (1519, 899, 124), (1520, 897, 124), (1526, 929, 124), (1528, 899, 122), (1529, 912, 125), (1529, 913, 124), (1529, 916, 125), (1531, 839, 124), (1536, 969, 128), (1539, 962, 127), (1540, 964, 127), (1540, 965, 127), (1561, 1000, 128), (1584, 863, 49), (1686, 1017, -39), (1712, 979, 90), (1713, 978, 86), (1766, 971, 128), (1769, 966, 128), (1778, 834, -14), (1853, 970, 126), (1885, 905, 89), (1897, 935, 125), (1902, 171, 1), (1903, 67, 1), (1903, 291, 1), (1903, 292, 1), (1904, 260, 1), (1904, 261, 1), 
(1905, 329, 1)] +[(13, 570, 0), (13, 658, -1), (13, 659, -1), (13, 660, -1), (13, 690, -2), (14, 562, 1), (14, 571, 1), (14, 572, 1), (14, 573, 1), (14, 609, 0), (14, 792, -5), (14, 857, -5), (15, 566, 2), (15, 868, -6), (15, 1006, -10), (16, 850, -5), (143, 76, 103), (150, 736, 102), (165, 543, 81), (165, 546, 80), (175, 982, 128), (178, 546, 87), (183, 546, 81), (183, 743, 99), (188, 742, 102), (191, 749, 105), (195, 547, 77), (203, 756, 103), (215, 753, 102), (224, 690, 95), (236, 336, 128), (236, 337, 128), (237, 340, 128), (237, 341, 128), (239, 347, 128), (239, 351, 128), (239, 352, 128), (239, 353, 128), (239, 356, 128), (240, 263, 67), (240, 351, 128), (240, 352, 128), (240, 354, 128), (240, 357, 128), (240, 359, 128), (240, 362, 128), (241, 264, 68), (241, 362, 128), (241, 364, 128), (242, 364, 128), (242, 367, 128), (243, 267, 68), (243, 370, 128), (243, 371, 128), (243, 372, 127), (243, 373, 127), (243, 374, 127), (243, 375, 128), (243, 377, 128), (243, 421, 124), (244, 376, 127), (244, 377, 127), (244, 378, 127), (244, 420, 124), (244, 421, 124), (244, 442, 127), (245, 270, 69), (245, 385, 127), (245, 386, 127), (245, 425, 124), (245, 439, 127), (246, 387, 128), (246, 388, 127), (246, 389, 127), (246, 741, 102), (247, 362, 128), (247, 442, 128), (247, 448, 128), (248, 373, 128), (248, 374, 128), (248, 376, 128), (248, 446, 128), (249, 276, 69), (249, 404, 127), (250, 277, 69), (250, 411, 127), (250, 412, 127), (250, 413, 126), (250, 456, 127), (252, 426, 127), (254, 425, 126), (256, 287, 68), (257, 289, 68), (258, 290, 68), (259, 292, 69), (259, 421, 126), (261, 521, 94), (262, 419, 127), (264, 418, 126), (264, 853, 114), (266, 418, 6), (266, 550, 121), (267, 422, 126), (269, 286, 82), (272, 582, 120), (272, 583, 120), (273, 324, 116), (273, 843, 113), (275, 934, 124), (276, 839, 112), (277, 560, 127), (278, 585, 125), (278, 617, 119), (279, 570, 96), (279, 617, 119), (279, 620, 119), (279, 621, 119), (282, 582, 127), (282, 623, -1), (283, 644, 118), 
(284, 637, 92), (285, 635, -2), (285, 642, 119), (287, 401, 40), (288, 554, -37), (290, 557, -38), (290, 558, -39), (291, 634, 125), (292, 643, 126), (292, 674, 118), (296, 570, -40), (296, 665, 5), (296, 716, 116), (297, 674, 119), (298, 673, 120), (299, 677, 120), (299, 682, 119), (300, 687, 119), (300, 688, 119), (300, 690, 118), (301, 580, -42), (301, 693, 119), (301, 698, 118), (301, 744, 115), (304, 953, 125), (309, 952, 125), (309, 953, 125), (310, 600, -46), (312, 205, 127), (319, 236, 128), (320, 619, -51), (320, 840, 112), (321, 836, 112), (322, 237, 72), (322, 238, 126), (322, 841, 111), (324, 843, 112), (324, 844, 112), (325, 231, 128), (325, 838, 113), (325, 844, 112), (325, 845, 112), (325, 983, 128), (326, 842, 113), (328, 228, 127), (329, 228, 127), (329, 240, 125), (329, 706, 34), (332, 231, 126), (333, 844, 114), (336, 718, -52), (336, 719, 42), (338, 716, 38), (339, 235, 125), (339, 381, 99), (340, 748, -65), (344, 232, 52), (346, 455, -117), (346, 456, -116), (346, 554, -121), (347, 229, 125), (347, 969, 125), (348, 229, 123), (355, 241, 119), (355, 830, 126), (355, 831, 126), (356, 830, 126), (358, 222, 126), (361, 639, 75), (366, 217, 124), (368, 218, 126), (368, 689, 3), (377, 58, 124), (385, 314, 123), (385, 315, 123), (390, 314, 123), (391, 308, 123), (391, 312, 122), (393, 303, 121), (393, 307, 122), (394, 303, 121), (394, 305, 122), (395, 299, 122), (396, 294, 121), (396, 301, 122), (400, 918, -107), (404, 180, 123), (404, 293, 121), (407, 453, -109), (408, 286, 119), (409, 208, 123), (414, 340, 121), (414, 602, 57), (415, 220, 123), (415, 221, 123), (415, 222, 123), (418, 218, 123), (419, 224, 122), (420, 1035, -94), (420, 1036, -94), (421, 221, 122), (423, 226, 122), (423, 227, 122), (425, 338, 118), (433, 53, 124), (433, 384, 119), (434, 379, 117), (436, 379, 118), (437, 52, 128), (437, 379, 118), (439, 378, 109), (439, 380, 128), (440, 204, 122), (440, 317, 120), (443, 285, 120), (443, 394, 125), (446, 414, 126), (448, 202, 121), 
(449, 200, 121), (449, 202, 121), (452, 330, 119), (452, 719, 64), (453, 201, 119), (453, 705, 74), (453, 720, 64), (454, 199, 122), (455, 52, 126), (456, 226, 121), (456, 377, 118), (458, 327, 119), (459, 197, 121), (463, 325, 120), (465, 330, 112), (472, 330, 120), (473, 372, 115), (474, 188, 120), (477, 372, 112), (480, 272, 108), (480, 279, 119), (481, 170, 120), (481, 186, 120), (481, 221, 115), (481, 882, 118), (482, 169, 120), (483, 185, 121), (486, 823, 63), (488, 221, 115), (492, 370, 115), (493, 180, 114), (499, 313, 118), (502, 172, 120), (506, 173, 119), (508, 175, 121), (510, 961, 126), (511, 397, 123), (512, 961, 124), (513, 323, 120), (513, 988, -79), (517, 167, 123), (517, 169, 121), (517, 217, 118), (518, 401, 124), (518, 801, 111), (518, 960, 126), (519, 400, 122), (519, 957, 125), (520, 400, 124), (520, 801, 112), (523, 407, 126), (524, 961, 128), (525, 171, 118), (525, 306, 117), (526, 272, 117), (527, 363, 127), (529, 313, 117), (530, 313, 117), (530, 639, -38), (532, 171, 119), (532, 312, 118), (532, 641, -41), (533, 300, 116), (534, 360, 110), (534, 384, 121), (539, 360, 112), (541, 315, 115), (543, 297, 117), (551, 388, 121), (552, 588, 27), (555, 169, 118), (555, 262, 96), (555, 295, 117), (555, 391, 123), (556, 357, 112), (557, 291, 116), (557, 386, 122), (558, 594, 27), (561, 290, 116), (565, 161, 115), (565, 293, 116), (570, 161, 119), (570, 169, 116), (572, 354, 111), (572, 398, 122), (573, 611, 28), (574, 609, 29), (576, 394, 105), (577, 615, 29), (578, 352, 113), (579, 352, 110), (580, 697, -38), (581, 166, 112), (583, 354, 112), (585, 622, 31), (589, 310, 112), (589, 353, 112), (591, 262, 113), (593, 352, 113), (593, 895, 119), (594, 381, 115), (597, 68, 117), (597, 69, 117), (597, 165, 115), (598, 64, 117), (598, 66, 117), (598, 67, 117), (598, 68, 117), (598, 69, 117), (598, 70, 117), (598, 71, 117), (598, 72, 117), (598, 81, 117), (599, 72, 117), (599, 73, 117), (599, 75, 117), (599, 86, 116), (599, 87, 116), (600, 83, 117), (600, 
84, 117), (600, 85, 117), (600, 88, 117), (600, 94, 116), (600, 95, 116), (601, 91, 117), (601, 92, 117), (601, 99, 116), (601, 100, 116), (601, 101, 116), (601, 102, 116), (602, 102, 116), (602, 106, 116), (602, 107, 116), (603, 151, 115), (603, 204, 114), (603, 205, 113), (604, 126, 115), (604, 146, 115), (604, 147, 115), (604, 203, 114), (605, 146, 115), (605, 205, 113), (605, 348, 107), (606, 144, 115), (606, 146, 115), (606, 148, 115), (606, 150, 115), (606, 203, 114), (607, 150, 115), (608, 55, 118), (609, 204, 114), (609, 350, 112), (610, 205, 114), (611, 205, 114), (611, 208, 114), (611, 209, 114), (611, 210, 114), (612, 150, 116), (612, 202, 114), (612, 205, 114), (612, 206, 114), (612, 208, 113), (612, 209, 113), (612, 345, 112), (613, 102, 117), (613, 202, 114), (613, 348, 111), (613, 652, -113), (614, 212, 114), (614, 305, 111), (614, 306, 111), (615, 103, 118), (616, 151, 116), (616, 154, 114), (616, 233, 113), (616, 234, 113), (616, 236, 113), (616, 250, 113), (616, 251, 113), (616, 303, 112), (616, 349, 111), (616, 655, 34), (617, 132, 117), (617, 160, 116), (617, 204, 116), (617, 239, 113), (617, 241, 113), (617, 242, 113), (617, 251, 113), (617, 347, 110), (618, 134, 117), (618, 137, 117), (618, 250, 113), (619, 108, 118), (619, 109, 118), (619, 110, 118), (619, 129, 117), (619, 135, 117), (619, 136, 117), (619, 138, 117), (619, 147, 117), (619, 154, 116), (619, 253, 113), (620, 103, 118), (620, 107, 118), (620, 110, 118), (620, 256, 113), (620, 258, 119), (620, 304, 112), (621, 111, 117), (621, 150, 116), (622, 129, 118), (622, 132, 117), (622, 168, 117), (622, 308, 112), (622, 347, 111), (623, 129, 118), (623, 169, 117), (623, 203, 115), (623, 303, 112), (623, 306, 111), (623, 309, 111), (623, 310, 111), (623, 346, 110), (623, 347, 111), (624, 102, 118), (624, 130, 117), (624, 133, 117), (624, 164, 116), (624, 305, 112), (624, 306, 111), (624, 308, 111), (624, 311, 112), (624, 388, 116), (625, 167, 117), (625, 169, 116), (625, 198, 116), (625, 
205, 115), (625, 303, 111), (625, 316, 111), (625, 317, 111), (625, 318, 111), (625, 319, 111), (625, 339, 111), (625, 341, 111), (625, 342, 111), (625, 343, 111), (625, 389, 123), (626, 130, 117), (626, 138, 118), (626, 167, 117), (626, 199, 115), (626, 200, 115), (626, 203, 115), (626, 301, 112), (626, 326, 111), (626, 329, 111), (626, 330, 111), (626, 331, 111), (626, 332, 111), (626, 333, 111), (626, 339, 111), (626, 342, 111), (626, 343, 111), (627, 50, 118), (627, 130, 117), (627, 141, 116), (627, 202, 116), (627, 204, 115), (627, 254, 115), (627, 322, 111), (627, 337, 111), (627, 339, 111), (628, 50, 117), (628, 183, 116), (628, 184, 116), (628, 185, 116), (628, 254, 114), (628, 255, 114), (628, 302, 111), (628, 341, 111), (629, 132, 118), (629, 188, 115), (629, 192, 116), (629, 255, 114), (629, 341, 111), (630, 107, 118), (630, 144, 117), (630, 194, 115), (630, 302, 112), (630, 342, 111), (630, 344, 111), (630, 348, 110), (630, 410, 128), (631, 50, 117), (631, 134, 118), (631, 272, 114), (631, 344, 111), (631, 348, 110), (632, 47, 118), (632, 50, 117), (632, 196, 116), (632, 198, 116), (632, 202, 115), (632, 203, 115), (632, 205, 115), (632, 206, 115), (632, 342, 111), (633, 52, 118), (633, 53, 118), (633, 54, 118), (633, 206, 115), (633, 214, 114), (633, 343, 111), (634, 47, 117), (634, 53, 118), (634, 55, 118), (635, 51, 118), (635, 76, 117), (635, 97, 117), (635, 112, 118), (635, 136, 116), (636, 61, 117), (636, 79, 117), (636, 85, 117), (636, 97, 117), (636, 301, 113), (637, 84, 115), (637, 305, 113), (637, 347, 112), (638, 92, 117), (638, 347, 112), (638, 408, 126), (639, 83, 117), (639, 387, 117), (639, 406, 124), (639, 408, 124), (640, 85, 117), (640, 89, 117), (640, 92, 117), (640, 111, 120), (640, 277, 113), (640, 278, 113), (640, 279, 113), (641, 85, 117), (641, 92, 117), (641, 347, 112), (642, 91, 114), (643, 144, 115), (643, 145, 115), (643, 343, 112), (644, 87, 114), (644, 142, 115), (644, 342, 112), (645, 94, 115), (645, 112, 121), (645, 162, 
115), (645, 343, 112), (646, 90, 115), (646, 111, 119), (646, 143, 116), (646, 146, 116), (646, 148, 116), (646, 149, 116), (646, 347, 102), (647, 118, 121), (647, 144, 116), (647, 151, 116), (647, 163, 116), (647, 164, 116), (647, 343, 111), (648, 144, 116), (648, 364, 115), (648, 386, 124), (649, 92, 116), (649, 119, 121), (649, 145, 116), (650, 112, 120), (650, 293, 113), (650, 294, 113), (650, 295, 113), (651, 93, 116), (651, 302, 112), (652, 459, 84), (654, 318, 112), (655, 122, 121), (655, 318, 111), (655, 321, 112), (655, 322, 112), (655, 323, 112), (655, 334, 112), (655, 439, 87), (656, 228, 114), (656, 229, 114), (656, 233, 113), (656, 256, 113), (656, 321, 112), (656, 323, 112), (656, 332, 111), (656, 347, 111), (657, 118, 120), (657, 125, 66), (657, 236, 114), (657, 331, 111), (657, 347, 111), (658, 149, 118), (658, 244, 114), (659, 254, 113), (659, 255, 113), (659, 381, 118), (660, 261, 113), (660, 869, -42), (661, 99, 116), (662, 126, 121), (662, 127, 121), (663, 120, 122), (664, 152, 120), (665, 311, 113), (666, 340, 111), (666, 341, 111), (666, 380, 117), (667, 119, 121), (667, 129, 121), (667, 331, 111), (667, 339, 112), (667, 340, 111), (667, 343, 112), (667, 344, 112), (668, 328, 112), (668, 329, 112), (668, 330, 112), (668, 339, 112), (668, 343, 111), (668, 380, 118), (669, 342, 111), (669, 343, 111), (669, 344, 111), (670, 343, 111), (670, 380, 117), (671, 343, 111), (672, 344, 111), (672, 410, 11), (673, 103, 117), (673, 106, 118), (673, 120, 119), (674, 146, 122), (674, 426, -43), (674, 432, 11), (677, 136, 121), (677, 155, 122), (678, 156, 121), (679, 123, 120), (679, 352, 115), (680, 136, 122), (681, 157, 122), (682, 125, 119), (682, 347, 113), (682, 459, 11), (683, 459, 13), (684, 352, 113), (686, 126, 121), (686, 138, 123), (686, 346, 116), (686, 353, 113), (687, 160, 122), (688, 354, 114), (688, 379, 118), (689, 493, -77), (691, 356, 115), (691, 378, 116), (692, 455, -69), (693, 356, 114), (694, 129, 121), (694, 164, 126), (697, 119, 
119), (698, 390, 121), (698, 955, 88), (699, 130, 123), (699, 970, 4), (699, 971, 4), (699, 972, 89), (700, 998, 109), (701, 130, 119), (701, 372, 116), (702, 130, 119), (703, 122, 120), (703, 934, 90), (705, 123, 120), (705, 360, 114), (706, 354, 115), (707, 124, 120), (707, 354, 115), (708, 125, 120), (708, 149, 125), (709, 372, 116), (710, 126, 120), (711, 130, 120), (712, 127, 120), (712, 130, 120), (714, 363, 113), (716, 128, 120), (716, 174, 124), (718, 129, 120), (718, 175, 124), (719, 133, 120), (719, 134, 120), (720, 133, 120), (720, 490, -69), (721, 131, 120), (723, 159, 125), (725, 133, 121), (726, 133, 121), (727, 158, 126), (727, 160, 126), (729, 135, 121), (730, 135, 121), (730, 159, 126), (730, 162, 126), (731, 136, 121), (734, 161, 126), (738, 1036, -31), (740, 144, 122), (741, 139, 122), (742, 141, 122), (742, 164, 128), (742, 433, -93), (743, 142, 122), (743, 638, -73), (746, 166, 127), (747, 192, 126), (751, 171, 66), (755, 170, 127), (756, 173, 128), (757, 468, 122), (757, 479, 119), (757, 818, 59), (758, 686, 112), (760, 172, 128), (763, 174, 128), (764, 176, 65), (764, 961, -75), (768, 369, 118), (772, 154, 125), (775, 443, -4), (778, 383, 121), (778, 457, 108), (779, 456, -5), (784, 185, 63), (784, 387, 122), (785, 210, 128), (793, 1029, 66), (793, 1033, -10), (794, 884, -29), (795, 1011, -44), (796, 400, -6), (796, 1025, 128), (797, 427, -8), (798, 165, 126), (798, 1024, -5), (800, 433, 45), (801, 880, -11), (802, 412, 119), (802, 771, 52), (816, 224, 126), (816, 389, -93), (816, 393, -93), (819, 409, 104), (823, 653, 95), (824, 423, -25), (833, 871, 62), (839, 769, -124), (845, 470, 83), (848, 185, 128), (850, 481, 47), (850, 864, -124), (855, 763, -120), (855, 1017, -109), (864, 762, -125), (865, 383, 6), (865, 761, -124), (868, 346, 53), (869, 930, 84), (883, 350, 48), (884, 337, -6), (885, 371, -87), (888, 586, 60), (889, 373, 45), (900, 759, -50), (902, 642, -3), (913, 388, 37), (913, 389, 37), (916, 435, 38), (917, 842, -107), (919, 
704, 58), (927, 561, -120), (940, 695, 54), (944, 840, -85), (947, 354, 72), (957, 691, 52), (961, 901, 34), (963, 935, 116), (964, 952, -89), (965, 274, 54), (965, 275, 54), (965, 988, -94), (965, 989, -94), (966, 828, -90), (966, 983, -92), (966, 984, -92), (966, 985, -92), (966, 986, -92), (970, 667, -48), (972, 689, 56), (975, 356, 84), (975, 397, 118), (981, 1018, 58), (989, 413, -70), (990, 1000, -97), (997, 813, -85), (1006, 832, 83), (1007, 769, 53), (1010, 812, -61), (1018, 392, 38), (1026, 867, 91), (1033, 777, 92), (1057, 665, -59), (1084, 974, -4), (1119, 787, 40), (1125, 850, 28), (1127, 574, 44), (1128, 1008, -1), (1129, 973, 72), (1129, 974, 72), (1130, 989, -2), (1132, 642, 47), (1150, 639, 52), (1150, 777, 126), (1157, 635, 48), (1161, 843, -14), (1173, 631, 46), (1174, 461, 93), (1197, 804, -98), (1202, 624, -55), (1214, 500, -60), (1223, 618, 52), (1236, 494, -77), (1238, 929, 81), (1258, 916, 88), (1261, 606, 45), (1266, 684, 66), (1270, 603, 47), (1276, 685, 58), (1281, 998, 110), (1281, 999, 110), (1296, 78, 107), (1297, 86, 107), (1297, 100, 109), (1297, 101, 109), (1297, 103, 109), (1297, 104, 109), (1298, 75, 108), (1298, 102, 109), (1298, 105, 108), (1298, 106, 108), (1299, 75, 108), (1300, 69, 109), (1301, 68, 108), (1302, 69, 109), (1302, 75, 109), (1303, 73, 108), (1304, 50, 107), (1304, 65, 108), (1304, 76, 108), (1307, 507, 77), (1308, 60, 108), (1308, 74, 108), (1320, 691, 122), (1320, 692, 124), (1322, 666, 65), (1325, 588, 47), (1341, 197, 123), (1341, 719, 39), (1341, 720, 39), (1354, 344, 97), (1355, 344, 97), (1355, 753, 38), (1356, 343, 98), (1358, 335, 96), (1360, 335, 97), (1360, 336, 97), (1364, 336, 98), (1367, 561, -53), (1371, 461, 125), (1372, 459, 126), (1380, 337, 99), (1383, 335, 111), (1388, 771, 98), (1408, 820, 110), (1412, 825, 114), (1412, 886, 120), (1413, 827, 114), (1417, 856, 116), (1418, 570, 107), (1421, 568, 103), (1422, 912, 19), (1426, 910, 20), (1427, 890, 120), (1428, 889, 120), (1433, 895, 120), 
(1437, 942, 125), (1437, 943, 125), (1441, 436, 101), (1443, 963, 124), (1447, 554, 109), (1452, 472, 113), (1453, 896, 123), (1454, 337, 102), (1454, 418, 102), (1454, 421, 100), (1455, 417, 101), (1456, 305, 99), (1456, 308, 99), (1456, 416, 102), (1456, 425, 102), (1457, 416, 102), (1457, 917, 119), (1458, 897, 125), (1461, 477, 115), (1461, 478, 115), (1461, 915, 127), (1464, 410, 98), (1467, 92, 125), (1467, 334, 103), (1467, 409, 99), (1468, 407, 98), (1468, 441, 103), (1469, 440, 104), (1469, 443, 103), (1470, 405, 98), (1472, 404, 98), (1502, 401, 111), (1504, 471, 122), (1511, 413, 59), (1514, 460, 104), (1514, 810, 126), (1519, 897, 124), (1519, 899, 124), (1520, 896, 122), (1526, 929, 124), (1527, 931, 125), (1528, 899, 122), (1529, 912, 125), (1529, 913, 124), (1529, 916, 125), (1531, 839, 124), (1536, 969, 128), (1539, 962, 127), (1540, 964, 127), (1540, 965, 127), (1544, 975, 120), (1556, 535, 126), (1584, 863, 49), (1634, 1013, 86), (1686, 1017, -39), (1712, 979, 90), (1713, 978, 86), (1758, 833, -38), (1763, 833, -22), (1766, 964, 127), (1766, 971, 128), (1768, 1013, 121), (1769, 966, 128), (1778, 834, -14), (1850, 981, 38), (1853, 970, 126), (1885, 905, 89), (1897, 935, 125), (1902, 171, 1), (1903, 67, 1), (1903, 291, 1), (1903, 292, 1), (1903, 299, 1), (1903, 300, 1), (1904, 260, 1), (1904, 261, 1), (1905, 329, 1)] diff --git a/tests/test_single_matching.cpp b/tests/test_single_matching.cpp index f7724b9..9443a33 100644 --- a/tests/test_single_matching.cpp +++ b/tests/test_single_matching.cpp @@ -49,7 +49,7 @@ TEST(Approval, Inference) std::stringstream ss; ss << supp; - EXPECT_EQ(866, supp.size()); + EXPECT_EQ(1024, supp.size()); ApprovalTests::Approvals::verify(ss.str()); } std::vector getSrcDescriptors() {